Merge remote branch 'origin/master' into radeon-rewrite

author: Dave Airlie <airlied@redhat.com> 2009-06-07 16:51:32 +1000
committer: Dave Airlie <airlied@redhat.com> 2009-06-07 16:51:32 +1000
commit: 545e574cd9a2a659cd9a93879dff8884bd247558 (patch)
tree: f56d65eaa851edfb1248a6fc8ac0bae4cc98eff5 /src/gallium/drivers
parent: e2aedfa62079ff1a333e1f4e56faea303cc36edb (diff)
parent: f1edfa09ea50e8833ddbf241da4d36fd38685e9d (diff)
93 files changed, 3665 insertions, 1404 deletions
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 7c225e2f27..5ffb7073ab 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -32,6 +32,7 @@
 
 #include "spu_main.h"
 #include "spu_render.h"
+#include "spu_shuffle.h"
 #include "spu_tri.h"
 #include "spu_tile.h"
 #include "cell/common.h"
@@ -267,15 +268,75 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 
       uint drawn = 0;
 
-      /* loop over tris */
-      for (j = 0; j < render->num_indexes; j += 3) {
-         const float *v0, *v1, *v2;
-
-         v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
-         v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
-         v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
-
-         drawn += tri_draw(v0, v1, v2, tx, ty);
+      const qword vertex_sizes = (qword)spu_splats(vertex_size);
+      const qword verticess = (qword)spu_splats((uint)vertices);
+
+      ASSERT_ALIGN16(&indexes[0]);
+
+      const uint num_indexes = render->num_indexes;
+
+      /* loop over tris
+	   * &indexes[0] will be 16 byte aligned.  This loop is heavily unrolled
+	   * avoiding variable rotates when extracting vertex indices.
+	   */
+      for (j = 0; j < num_indexes; j += 24) {
+         /* Load three vectors, containing 24 ushort indices */
+         const qword* lower_qword = (qword*)&indexes[j];
+         const qword indices0 = lower_qword[0];
+         const qword indices1 = lower_qword[1];
+         const qword indices2 = lower_qword[2];
+
+         /* stores three indices for each tri n in slots 0, 1 and 2 of vsn */
+		 /* Straightforward rotates for these */
+         qword vs0 = indices0;
+         qword vs1 = si_shlqbyi(indices0, 6);
+         qword vs3 = si_shlqbyi(indices1, 2);
+         qword vs4 = si_shlqbyi(indices1, 8);
+         qword vs6 = si_shlqbyi(indices2, 4);
+         qword vs7 = si_shlqbyi(indices2, 10);
+
+         /* For tri 2 and 5, the three indices are split across two machine
+		  * words - rotate and combine */
+         const qword tmp2a = si_shlqbyi(indices0, 12);
+         const qword tmp2b = si_rotqmbyi(indices1, 12|16);
+         qword vs2 = si_selb(tmp2a, tmp2b, si_fsmh(si_from_uint(0x20)));
+
+         const qword tmp5a = si_shlqbyi(indices1, 14);
+         const qword tmp5b = si_rotqmbyi(indices2, 14|16);
+         qword vs5 = si_selb(tmp5a, tmp5b, si_fsmh(si_from_uint(0x60)));
+
+         /* unpack indices from halfword slots to word slots */
+         vs0 = si_shufb(vs0, vs0, SHUFB8(0,A,0,B,0,C,0,0));
+         vs1 = si_shufb(vs1, vs1, SHUFB8(0,A,0,B,0,C,0,0));
+         vs2 = si_shufb(vs2, vs2, SHUFB8(0,A,0,B,0,C,0,0));
+         vs3 = si_shufb(vs3, vs3, SHUFB8(0,A,0,B,0,C,0,0));
+         vs4 = si_shufb(vs4, vs4, SHUFB8(0,A,0,B,0,C,0,0));
+         vs5 = si_shufb(vs5, vs5, SHUFB8(0,A,0,B,0,C,0,0));
+         vs6 = si_shufb(vs6, vs6, SHUFB8(0,A,0,B,0,C,0,0));
+         vs7 = si_shufb(vs7, vs7, SHUFB8(0,A,0,B,0,C,0,0));
+
+         /* Calculate address of vertex in vertices[] */
+         vs0 = si_mpya(vs0, vertex_sizes, verticess);
+         vs1 = si_mpya(vs1, vertex_sizes, verticess);
+         vs2 = si_mpya(vs2, vertex_sizes, verticess);
+         vs3 = si_mpya(vs3, vertex_sizes, verticess);
+         vs4 = si_mpya(vs4, vertex_sizes, verticess);
+         vs5 = si_mpya(vs5, vertex_sizes, verticess);
+         vs6 = si_mpya(vs6, vertex_sizes, verticess);
+         vs7 = si_mpya(vs7, vertex_sizes, verticess);
+
+         /* Select the appropriate call based on the number of indices
+		  * remaining */
+         switch(num_indexes - j) {
+            default: drawn += tri_draw(vs7, tx, ty);
+            case 21: drawn += tri_draw(vs6, tx, ty);
+            case 18: drawn += tri_draw(vs5, tx, ty);
+            case 15: drawn += tri_draw(vs4, tx, ty);
+            case 12: drawn += tri_draw(vs3, tx, ty);
+            case 9:  drawn += tri_draw(vs2, tx, ty);
+            case 6:  drawn += tri_draw(vs1, tx, ty);
+            case 3:  drawn += tri_draw(vs0, tx, ty);
+         }
       }
 
       //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index d727268475..58be001be4 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -133,7 +133,15 @@ struct setup_stage {
 
    uint tx, ty;  /**< position of current tile (x, y) */
 
-   int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
+   union {
+      struct {
+         int cliprect_minx;
+         int cliprect_miny;
+         int cliprect_maxx;
+         int cliprect_maxy;
+      };
+      qword cliprect;
+   };
 
    struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
 
@@ -432,6 +440,41 @@ print_vertex(const struct vertex_header *v)
 }
 #endif
 
+/* Returns the minimum of each slot of two vec_float4s as qwords.
+ * i.e. return[n] = min(q0[n],q1[n]);
+ */
+static qword
+minfq(qword q0, qword q1)
+{
+   const qword q0q1m = si_fcgt(q0, q1);
+   return si_selb(q0, q1, q0q1m);
+}
+
+/* Returns the minimum of each slot of three vec_float4s as qwords.
+ * i.e. return[n] = min(q0[n],q1[n],q2[n]);
+ */
+static qword
+min3fq(qword q0, qword q1, qword q2)
+{
+   return minfq(minfq(q0, q1), q2);
+}
+
+/* Returns the maximum of each slot of two vec_float4s as qwords.
+ * i.e. return[n] = max(q0[n],q1[n]);
+ */
+static qword
+maxfq(qword q0, qword q1) {
+   const qword q0q1m = si_fcgt(q0, q1);
+   return si_selb(q1, q0, q0q1m);
+}
+
+/* Returns the maximum of each slot of three vec_float4s as qwords.
+ * i.e. return[n] = max(q0[n],q1[n],q2[n]);
+ */
+static qword
+max3fq(qword q0, qword q1, qword q2) {
+   return maxfq(maxfq(q0, q1), q2);
+}
 
 /**
  * Sort vertices from top to bottom.
@@ -440,9 +483,7 @@ print_vertex(const struct vertex_header *v)
  * \return  FALSE if tri is totally outside tile, TRUE otherwise
  */
 static boolean
-setup_sort_vertices(const struct vertex_header *v0,
-                    const struct vertex_header *v1,
-                    const struct vertex_header *v2)
+setup_sort_vertices(const qword vs)
 {
    float area, sign;
 
@@ -455,57 +496,57 @@ setup_sort_vertices(const struct vertex_header *v0,
    }
 #endif
 
-   /* determine bottom to top order of vertices */
    {
+      /* Load the float values for various processing... */
+      const qword f0 = (qword)(((const struct vertex_header*)si_to_ptr(vs))->data[0]);
+      const qword f1 = (qword)(((const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 4)))->data[0]);
+      const qword f2 = (qword)(((const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 8)))->data[0]);
+
+      /* Check if triangle is completely outside the tile bounds
+       * Find the min and max x and y positions of the three points */
+      const qword minf = min3fq(f0, f1, f2);
+      const qword maxf = max3fq(f0, f1, f2);
+
+      /* Compare min and max against cliprect vals */
+      const qword maxsmins = si_shufb(maxf, minf, SHUFB4(A,B,a,b));
+      const qword outside = si_fcgt(maxsmins, si_csflt(setup.cliprect, 0));
+
+      /* Use a little magic to work out if the tri is visible or not */
+      if(si_to_uint(si_xori(si_gb(outside), 0xc))) return FALSE;
+
+      /* determine bottom to top order of vertices */
       /* A table of shuffle patterns for putting vertex_header pointers into
          correct order.  Quite magical. */
-      const vec_uchar16 sort_order_patterns[] = {
-         SHUFFLE4(A,B,C,C),
-         SHUFFLE4(C,A,B,C),
-         SHUFFLE4(A,C,B,C),
-         SHUFFLE4(B,C,A,C),
-         SHUFFLE4(B,A,C,C),
-         SHUFFLE4(C,B,A,C) };
-
-      /* The vertex_header pointers, packed for easy shuffling later */
-      const vec_uint4 vs = {(unsigned)v0, (unsigned)v1, (unsigned)v2};
+      const qword sort_order_patterns[] = {
+         SHUFB4(A,B,C,C),
+         SHUFB4(C,A,B,C),
+         SHUFB4(A,C,B,C),
+         SHUFB4(B,C,A,C),
+         SHUFB4(B,A,C,C),
+         SHUFB4(C,B,A,C) };
 
       /* Collate y values into two vectors for comparison.
          Using only one shuffle constant! ;) */
-      const vec_float4 y_02_ = spu_shuffle(v0->data[0], v2->data[0], SHUFFLE4(0,B,b,C));
-      const vec_float4 y_10_ = spu_shuffle(v1->data[0], v0->data[0], SHUFFLE4(0,B,b,C));
-      const vec_float4 y_012 = spu_shuffle(y_02_, v1->data[0], SHUFFLE4(0,B,b,C));
-      const vec_float4 y_120 = spu_shuffle(y_10_, v2->data[0], SHUFFLE4(0,B,b,C));
+      const qword y_02_ = si_shufb(f0, f2, SHUFB4(0,B,b,C));
+      const qword y_10_ = si_shufb(f1, f0, SHUFB4(0,B,b,C));
+      const qword y_012 = si_shufb(y_02_, f1, SHUFB4(0,B,b,C));
+      const qword y_120 = si_shufb(y_10_, f2, SHUFB4(0,B,b,C));
 
       /* Perform comparison: {y0,y1,y2} > {y1,y2,y0} */
-      const vec_uint4 compare = spu_cmpgt(y_012, y_120);
+      const qword compare = si_fcgt(y_012, y_120);
       /* Compress the result of the comparison into 4 bits */
-      const vec_uint4 gather = spu_gather(compare);
+      const qword gather = si_gb(compare);
       /* Subtract one to attain the index into the LUT.  Magical. */
-      const unsigned int index = spu_extract(gather, 0) - 1;
+      const unsigned int index = si_to_uint(gather) - 1;
 
       /* Load the appropriate pattern and construct the desired vector. */
-      setup.vertex_headers = (qword)spu_shuffle(vs, vs, sort_order_patterns[index]);
+      setup.vertex_headers = si_shufb(vs, vs, sort_order_patterns[index]);
 
       /* Using the result of the comparison, set sign.
          Very magical. */
-      sign = ((si_to_uint(si_cntb((qword)gather)) == 2) ? 1.0f : -1.0f);
+      sign = ((si_to_uint(si_cntb(gather)) == 2) ? 1.0f : -1.0f);
    }
 
-   /* Check if triangle is completely outside the tile bounds */
-   if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy)
-      return FALSE;
-   if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny)
-      return FALSE;
-   if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx &&
-       spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx &&
-       spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx)
-      return FALSE;
-   if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx &&
-       spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx &&
-       spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
-      return FALSE;
-
    setup.ebot.ds = spu_sub(setup.vmid->data[0], setup.vmin->data[0]);
    setup.emaj.ds = spu_sub(setup.vmax->data[0], setup.vmin->data[0]);
    setup.etop.ds = spu_sub(setup.vmax->data[0], setup.vmid->data[0]);
@@ -761,21 +802,19 @@ subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
  * The tile data should have already been fetched.
  */
 boolean
-tri_draw(const float *v0, const float *v1, const float *v2,
+tri_draw(const qword vs,
          uint tx, uint ty)
 {
    setup.tx = tx;
    setup.ty = ty;
 
    /* set clipping bounds to tile bounds */
-   setup.cliprect_minx = tx * TILE_SIZE;
-   setup.cliprect_miny = ty * TILE_SIZE;
-   setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
-   setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
-
-   if (!setup_sort_vertices((struct vertex_header *) v0,
-                            (struct vertex_header *) v1,
-                            (struct vertex_header *) v2)) {
+   const qword clipbase = (qword)((vec_uint4){tx, ty});
+   const qword clipmin = si_mpyui(clipbase, TILE_SIZE);
+   const qword clipmax = si_ai(clipmin, TILE_SIZE);
+   setup.cliprect = si_shufb(clipmin, clipmax, SHUFB4(A,B,a,b));
+
+   if(!setup_sort_vertices(vs)) {
       return FALSE; /* totally clipped */
    }
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
index aa694dd7c9..82e3b19ad7 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.h
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -31,7 +31,7 @@
 
 
 extern boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
+tri_draw(const qword vs, uint tx, uint ty);
 
 
 #endif /* SPU_TRI_H */
diff --git a/src/gallium/drivers/nouveau/nouveau_push.h b/src/gallium/drivers/nouveau/nouveau_push.h
index 54ef1c1291..9c235080a5 100644
--- a/src/gallium/drivers/nouveau/nouveau_push.h
+++ b/src/gallium/drivers/nouveau/nouveau_push.h
@@ -9,13 +9,13 @@
 
 #define OUT_RING(data) do {                                                    \
 	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	(*pc->nvws->channel->pushbuf->cur++) = (data);                         \
+	(*pc->base.channel->pushbuf->cur++) = (data);                          \
 } while(0)
 
 #define OUT_RINGp(src,size) do {                                               \
 	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	memcpy(pc->nvws->channel->pushbuf->cur, (src), (size) * 4);            \
-	pc->nvws->channel->pushbuf->cur += (size);                             \
+	memcpy(pc->base.channel->pushbuf->cur, (src), (size) * 4);             \
+	pc->base.channel->pushbuf->cur += (size);                              \
 } while(0)
 
 #define OUT_RINGf(data) do {                                                   \
@@ -26,25 +26,35 @@
 
 #define BEGIN_RING(obj,mthd,size) do {                                         \
 	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	if (pc->nvws->channel->pushbuf->remaining < ((size) + 1))              \
-		pc->nvws->push_flush(pc->nvws, ((size) + 1), NULL);            \
+	struct nouveau_channel *chan = pc->base.channel;                       \
+	if (chan->pushbuf->remaining < ((size) + 1))                           \
+		nouveau_pushbuf_flush(chan, ((size) + 1));                     \
 	OUT_RING((pc->obj->subc << 13) | ((size) << 18) | (mthd));             \
-	pc->nvws->channel->pushbuf->remaining -= ((size) + 1);                 \
+	chan->pushbuf->remaining -= ((size) + 1);                              \
 } while(0)
 
 #define BEGIN_RING_NI(obj,mthd,size) do {                                      \
 	BEGIN_RING(obj, (mthd) | 0x40000000, (size));                          \
 } while(0)
 
+static inline void
+DO_FIRE_RING(struct nouveau_channel *chan, struct pipe_fence_handle **fence)
+{
+	nouveau_pushbuf_flush(chan, 0);
+	if (fence)
+		*fence = NULL;
+}
+
 #define FIRE_RING(fence) do {                                                  \
 	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	pc->nvws->push_flush(pc->nvws, 0, fence);                              \
+	DO_FIRE_RING(pc->base.channel, fence);                                 \
 } while(0)
 
 #define OUT_RELOC(bo,data,flags,vor,tor) do {                                  \
 	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	pc->nvws->push_reloc(pc->nvws, pc->nvws->channel->pushbuf->cur++,      \
-			     (bo), (data), (flags), (vor), (tor));             \
+	struct nouveau_channel *chan = pc->base.channel;                       \
+	nouveau_pushbuf_emit_reloc(chan, chan->pushbuf->cur++, nouveau_bo(bo), \
+				   (data), 0, (flags), (vor), (tor));          \
 } while(0)
 
 /* Raw data + flags depending on FB/TT buffer */
@@ -55,8 +65,8 @@
 /* FB/TT object handle */
 #define OUT_RELOCo(bo,flags) do {                                              \
 	OUT_RELOC((bo), 0, (flags) | NOUVEAU_BO_OR,                            \
-		  pc->nvws->channel->vram->handle,                             \
-		  pc->nvws->channel->gart->handle);                            \
+		  pc->base.channel->vram->handle,                              \
+		  pc->base.channel->gart->handle);                             \
 } while(0)
 
 /* Low 32-bits of offset */
@@ -72,11 +82,12 @@
 /* A reloc which'll recombine into a NV_DMA_METHOD packet header */
 #define OUT_RELOCm(bo, flags, obj, mthd, size) do {                            \
 	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	if (pc->nvws->channel->pushbuf->remaining < ((size) + 1))              \
-		pc->nvws->push_flush(pc->nvws->channel, ((size) + 1), NULL);   \
+	struct nouveau_channel *chan = pc->base.channel;                       \
+	if (chan->pushbuf->remaining < ((size) + 1))                           \
+		nouveau_pushbuf_flush(chan, ((size) + 1));                     \
 	OUT_RELOCd((bo), (pc->obj->subc << 13) | ((size) << 18) | (mthd),      \
 		   (flags), 0, 0);                                             \
-	pc->nvws->channel->pushbuf->remaining -= ((size) + 1);                 \
+	chan->pushbuf->remaining -= ((size) + 1);                              \
 } while(0)
 
 #endif
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
new file mode 100644
index 0000000000..832366e646
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -0,0 +1,240 @@
+#include <pipe/p_defines.h>
+#include <pipe/p_screen.h>
+#include <pipe/p_state.h>
+
+#include <util/u_memory.h>
+
+#include "nouveau/nouveau_bo.h"
+#include "nouveau_winsys.h"
+#include "nouveau_screen.h"
+
+static const char *
+nouveau_screen_get_name(struct pipe_screen *pscreen)
+{
+	struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+	static char buffer[128];
+
+	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
+	return buffer;
+}
+
+static const char *
+nouveau_screen_get_vendor(struct pipe_screen *pscreen)
+{
+	return "nouveau";
+}
+
+static struct pipe_buffer *
+nouveau_screen_bo_skel(struct pipe_screen *pscreen, struct nouveau_bo *bo,
+		       unsigned alignment, unsigned usage, unsigned size)
+{
+	struct pipe_buffer *pb;
+	
+	pb = CALLOC(1, sizeof(struct pipe_buffer)+sizeof(struct nouveau_bo *));
+	if (!pb) {
+		nouveau_bo_ref(NULL, &bo);
+		return NULL;
+	}
+
+	pipe_reference_init(&pb->reference, 1);
+	pb->screen = pscreen;
+	pb->alignment = alignment;
+	pb->usage = usage;
+	pb->size = size;
+	*(struct nouveau_bo **)(pb + 1) = bo;
+	return pb;
+}
+
+static struct pipe_buffer *
+nouveau_screen_bo_new(struct pipe_screen *pscreen, unsigned alignment,
+		      unsigned usage, unsigned size)
+{
+	struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+	struct nouveau_bo *bo = NULL;
+	uint32_t flags = NOUVEAU_BO_MAP;
+	int ret;
+
+	if (usage & NOUVEAU_BUFFER_USAGE_TRANSFER)
+		flags |= NOUVEAU_BO_GART;
+	else
+	if (usage & PIPE_BUFFER_USAGE_VERTEX) {
+		if (pscreen->get_param(pscreen, NOUVEAU_CAP_HW_VTXBUF))
+			flags |= NOUVEAU_BO_GART;
+	} else
+	if (usage & PIPE_BUFFER_USAGE_INDEX) {
+		if (pscreen->get_param(pscreen, NOUVEAU_CAP_HW_IDXBUF))
+			flags |= NOUVEAU_BO_GART;
+	}
+
+	if (usage & PIPE_BUFFER_USAGE_PIXEL) {
+		if (usage & NOUVEAU_BUFFER_USAGE_TEXTURE)
+			flags |= NOUVEAU_BO_GART;
+		if (!(usage & PIPE_BUFFER_USAGE_CPU_READ_WRITE))
+			flags |= NOUVEAU_BO_VRAM;
+
+		if (dev->chipset == 0x50 || dev->chipset >= 0x80) {
+			flags |= NOUVEAU_BO_TILED;
+			if (usage & NOUVEAU_BUFFER_USAGE_ZETA)
+				flags |= NOUVEAU_BO_ZTILE;
+		}
+	}
+
+	ret = nouveau_bo_new(dev, flags, alignment, size, &bo);
+	if (ret)
+		return NULL;
+
+	return nouveau_screen_bo_skel(pscreen, bo, alignment, usage, size);
+}
+
+static struct pipe_buffer *
+nouveau_screen_bo_user(struct pipe_screen *pscreen, void *ptr, unsigned bytes)
+{
+	struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+	struct nouveau_bo *bo = NULL;
+	int ret;
+
+	ret = nouveau_bo_user(dev, ptr, bytes, &bo);
+	if (ret)
+		return NULL;
+
+	return nouveau_screen_bo_skel(pscreen, bo, 0, 0, bytes);
+}
+
+static inline uint32_t
+nouveau_screen_map_flags(unsigned pipe)
+{
+	uint32_t flags = 0;
+
+	if (pipe & PIPE_BUFFER_USAGE_CPU_READ)
+		flags |= NOUVEAU_BO_RD;
+	if (pipe & PIPE_BUFFER_USAGE_CPU_WRITE)
+		flags |= NOUVEAU_BO_WR;
+	if (pipe & PIPE_BUFFER_USAGE_DISCARD)
+		flags |= NOUVEAU_BO_INVAL;
+	if (pipe & PIPE_BUFFER_USAGE_DONTBLOCK)
+		flags |= NOUVEAU_BO_NOWAIT;
+	else
+	if (pipe & 0 /*PIPE_BUFFER_USAGE_UNSYNCHRONIZED*/)
+		flags |= NOUVEAU_BO_NOSYNC;
+
+	return flags;
+}
+
+static void *
+nouveau_screen_bo_map(struct pipe_screen *pscreen, struct pipe_buffer *pb,
+		      unsigned usage)
+{
+	struct nouveau_bo *bo = nouveau_bo(pb);
+	int ret;
+
+	ret = nouveau_bo_map(bo, nouveau_screen_map_flags(usage));
+	if (ret) {
+		debug_printf("map failed: %d\n", ret);
+		return NULL;
+	}
+
+	return bo->map;
+}
+
+static void *
+nouveau_screen_bo_map_range(struct pipe_screen *pscreen, struct pipe_buffer *pb,
+			    unsigned offset, unsigned length, unsigned usage)
+{
+	struct nouveau_bo *bo = nouveau_bo(pb);
+	int ret;
+
+	ret = nouveau_bo_map_range(bo, offset, length,
+				   nouveau_screen_map_flags(usage));
+	if (ret) {
+		debug_printf("map_range failed: %d\n", ret);
+		return NULL;
+	}
+
+	return (char *)bo->map - offset; /* why gallium? why? */
+}
+
+static void
+nouveau_screen_bo_map_flush(struct pipe_screen *pscreen, struct pipe_buffer *pb,
+			    unsigned offset, unsigned length)
+{
+	struct nouveau_bo *bo = nouveau_bo(pb);
+
+	nouveau_bo_map_flush(bo, offset, length);
+}
+
+static void
+nouveau_screen_bo_unmap(struct pipe_screen *pscreen, struct pipe_buffer *pb)
+{
+	struct nouveau_bo *bo = nouveau_bo(pb);
+
+	nouveau_bo_unmap(bo);
+}
+
+static void
+nouveau_screen_bo_del(struct pipe_buffer *pb)
+{
+	struct nouveau_bo *bo = nouveau_bo(pb);
+
+	nouveau_bo_ref(NULL, &bo);
+	FREE(pb);
+}
+
+static void
+nouveau_screen_fence_ref(struct pipe_screen *pscreen,
+			 struct pipe_fence_handle **ptr,
+			 struct pipe_fence_handle *pfence)
+{
+	*ptr = pfence;
+}
+
+static int
+nouveau_screen_fence_signalled(struct pipe_screen *screen,
+			       struct pipe_fence_handle *pfence,
+			       unsigned flags)
+{
+	return 0;
+}
+
+static int
+nouveau_screen_fence_finish(struct pipe_screen *screen,
+			    struct pipe_fence_handle *pfence,
+			    unsigned flags)
+{
+	return 0;
+}
+
+int
+nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
+{
+	struct pipe_screen *pscreen = &screen->base;
+	int ret;
+
+	ret = nouveau_channel_alloc(dev, 0xbeef0201, 0xbeef0202,
+				    &screen->channel);
+	if (ret)
+		return ret;
+	screen->device = dev;
+
+	pscreen->get_name = nouveau_screen_get_name;
+	pscreen->get_vendor = nouveau_screen_get_vendor;
+
+	pscreen->buffer_create = nouveau_screen_bo_new;
+	pscreen->user_buffer_create = nouveau_screen_bo_user;
+	pscreen->buffer_map = nouveau_screen_bo_map;
+	pscreen->buffer_map_range = nouveau_screen_bo_map_range;
+	pscreen->buffer_flush_mapped_range = nouveau_screen_bo_map_flush;
+	pscreen->buffer_unmap = nouveau_screen_bo_unmap;
+	pscreen->buffer_destroy = nouveau_screen_bo_del;
+
+	pscreen->fence_reference = nouveau_screen_fence_ref;
+	pscreen->fence_signalled = nouveau_screen_fence_signalled;
+	pscreen->fence_finish = nouveau_screen_fence_finish;
+
+	return 0;
+}
+
+void
+nouveau_screen_fini(struct nouveau_screen *screen)
+{
+}
+
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h
new file mode 100644
index 0000000000..9968b07896
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_screen.h
@@ -0,0 +1,25 @@
+#ifndef __NOUVEAU_SCREEN_H__
+#define __NOUVEAU_SCREEN_H__
+
+struct nouveau_screen {
+	struct pipe_screen base;
+	struct nouveau_device *device;
+	struct nouveau_channel *channel;
+};
+
+static inline struct nouveau_screen *
+nouveau_screen(struct pipe_screen *pscreen)
+{
+	return (struct nouveau_screen *)pscreen;
+}
+
+static inline struct nouveau_bo *
+nouveau_bo(struct pipe_buffer *pb)
+{
+	return pb ? *(struct nouveau_bo **)(pb + 1) : NULL;
+}
+
+int nouveau_screen_init(struct nouveau_screen *, struct nouveau_device *);
+void nouveau_screen_fini(struct nouveau_screen *);
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_stateobj.h b/src/gallium/drivers/nouveau/nouveau_stateobj.h
index a54820e851..b595405357 100644
--- a/src/gallium/drivers/nouveau/nouveau_stateobj.h
+++ b/src/gallium/drivers/nouveau/nouveau_stateobj.h
@@ -4,7 +4,7 @@
 #include "util/u_debug.h"
 
 struct nouveau_stateobj_reloc {
-	struct pipe_buffer *bo;
+	struct nouveau_bo *bo;
 
 	unsigned offset;
 	unsigned packet;
@@ -51,7 +51,7 @@ so_ref(struct nouveau_stateobj *ref, struct nouveau_stateobj **pso)
         if (pipe_reference((struct pipe_reference**)pso, &ref->reference)) {
 		free(so->push);
 		for (i = 0; i < so->cur_reloc; i++)
-			pipe_buffer_reference(&so->reloc[i].bo, NULL);
+			nouveau_bo_ref(NULL, &so->reloc[i].bo);
 		free(so->reloc);
 		free(so);
 	}
@@ -81,13 +81,13 @@ so_method(struct nouveau_stateobj *so, struct nouveau_grobj *gr,
 }
 
 static INLINE void
-so_reloc(struct nouveau_stateobj *so, struct pipe_buffer *bo,
+so_reloc(struct nouveau_stateobj *so, struct nouveau_bo *bo,
 	 unsigned data, unsigned flags, unsigned vor, unsigned tor)
 {
 	struct nouveau_stateobj_reloc *r = &so->reloc[so->cur_reloc++];
 	
 	r->bo = NULL;
-	pipe_buffer_reference(&r->bo, bo);
+	nouveau_bo_ref(bo, &r->bo);
 	r->offset = so->cur - so->push;
 	r->packet = so->cur_packet;
 	r->data = data;
@@ -107,50 +107,52 @@ so_dump(struct nouveau_stateobj *so)
 }
 
 static INLINE void
-so_emit(struct nouveau_winsys *nvws, struct nouveau_stateobj *so)
+so_emit(struct nouveau_channel *chan, struct nouveau_stateobj *so)
 {
-	struct nouveau_pushbuf *pb = nvws->channel->pushbuf;
+	struct nouveau_pushbuf *pb = chan->pushbuf;
 	unsigned nr, i;
 
 	nr = so->cur - so->push;
 	if (pb->remaining < nr)
-		nvws->push_flush(nvws, nr, NULL);
+		nouveau_pushbuf_flush(chan, nr);
 	pb->remaining -= nr;
 
 	memcpy(pb->cur, so->push, nr * 4);
 	for (i = 0; i < so->cur_reloc; i++) {
 		struct nouveau_stateobj_reloc *r = &so->reloc[i];
 
-		nvws->push_reloc(nvws, pb->cur + r->offset, r->bo,
-				 r->data, r->flags, r->vor, r->tor);
+		nouveau_pushbuf_emit_reloc(chan, pb->cur + r->offset,
+					   r->bo, r->data, 0, r->flags,
+					   r->vor, r->tor);
 	}
 	pb->cur += nr;
 }
 
 static INLINE void
-so_emit_reloc_markers(struct nouveau_winsys *nvws, struct nouveau_stateobj *so)
+so_emit_reloc_markers(struct nouveau_channel *chan, struct nouveau_stateobj *so)
 {
-	struct nouveau_pushbuf *pb = nvws->channel->pushbuf;
+	struct nouveau_pushbuf *pb = chan->pushbuf;
 	unsigned i;
 
 	if (!so)
 		return;
 
 	i = so->cur_reloc << 1;
-	if (nvws->channel->pushbuf->remaining < i)
-		nvws->push_flush(nvws, i, NULL);
-	nvws->channel->pushbuf->remaining -= i;
+	if (pb->remaining < i)
+		nouveau_pushbuf_flush(chan, i);
+	pb->remaining -= i;
 
 	for (i = 0; i < so->cur_reloc; i++) {
 		struct nouveau_stateobj_reloc *r = &so->reloc[i];
 
-		nvws->push_reloc(nvws, pb->cur++, r->bo, r->packet,
-				 (r->flags & (NOUVEAU_BO_VRAM |
-					      NOUVEAU_BO_GART |
-					      NOUVEAU_BO_RDWR)) |
-				 NOUVEAU_BO_DUMMY, 0, 0);
-		nvws->push_reloc(nvws, pb->cur++, r->bo, r->data,
-				 r->flags | NOUVEAU_BO_DUMMY, r->vor, r->tor);
+		nouveau_pushbuf_emit_reloc(chan, pb->cur++, r->bo, r->packet, 0,
+					   (r->flags & (NOUVEAU_BO_VRAM |
+							NOUVEAU_BO_GART |
+							NOUVEAU_BO_RDWR)) |
+					   NOUVEAU_BO_DUMMY, 0, 0);
+		nouveau_pushbuf_emit_reloc(chan, pb->cur++, r->bo, r->data, 0,
+					   r->flags | NOUVEAU_BO_DUMMY,
+					   r->vor, r->tor);
 	}
 }
 
diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h
index ff7dd1c51c..42c77e5e77 100644
--- a/src/gallium/drivers/nouveau/nouveau_winsys.h
+++ b/src/gallium/drivers/nouveau/nouveau_winsys.h
@@ -23,77 +23,38 @@
 #define NOUVEAU_BUFFER_USAGE_ZETA     (1 << 17)
 #define NOUVEAU_BUFFER_USAGE_TRANSFER (1 << 18)
 
-struct nouveau_winsys {
-	struct pipe_winsys *ws;
-
-	struct nouveau_channel *channel;
-
-	int  (*res_init)(struct nouveau_resource **heap, unsigned start,
-			 unsigned size);
-	int  (*res_alloc)(struct nouveau_resource *heap, int size, void *priv,
-			  struct nouveau_resource **);
-	void (*res_free)(struct nouveau_resource **);
-
-	int  (*push_reloc)(struct nouveau_winsys *, void *ptr,
-			   struct pipe_buffer *, uint32_t data,
-			   uint32_t flags, uint32_t vor, uint32_t tor);
-	int  (*push_flush)(struct nouveau_winsys *, unsigned size,
-			   struct pipe_fence_handle **fence);
-			       
-	int       (*grobj_alloc)(struct nouveau_winsys *, int grclass,
-				 struct nouveau_grobj **);
-	void      (*grobj_free)(struct nouveau_grobj **);
-
-	int       (*notifier_alloc)(struct nouveau_winsys *, int count,
-				    struct nouveau_notifier **);
-	void      (*notifier_free)(struct nouveau_notifier **);
-	void      (*notifier_reset)(struct nouveau_notifier *, int id);
-	uint32_t  (*notifier_status)(struct nouveau_notifier *, int id);
-	uint32_t  (*notifier_retval)(struct nouveau_notifier *, int id);
-	int       (*notifier_wait)(struct nouveau_notifier *, int id,
-				   int status, double timeout);
-
-	int (*surface_copy)(struct nouveau_winsys *, struct pipe_surface *,
-			    unsigned, unsigned, struct pipe_surface *,
-			    unsigned, unsigned, unsigned, unsigned);
-	int (*surface_fill)(struct nouveau_winsys *, struct pipe_surface *,
-			    unsigned, unsigned, unsigned, unsigned, unsigned);
-
-	struct nouveau_bo *(*get_bo)(struct pipe_buffer *);
-};
-
 extern struct pipe_screen *
-nv04_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *);
+nv04_screen_create(struct pipe_winsys *ws, struct nouveau_device *);
 
 extern struct pipe_context *
 nv04_create(struct pipe_screen *, unsigned pctx_id);
 
 extern struct pipe_screen *
-nv10_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *);
+nv10_screen_create(struct pipe_winsys *ws, struct nouveau_device *);
 
 extern struct pipe_context *
 nv10_create(struct pipe_screen *, unsigned pctx_id);
 
 extern struct pipe_screen *
-nv20_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *);
+nv20_screen_create(struct pipe_winsys *ws, struct nouveau_device *);
 
 extern struct pipe_context *
 nv20_create(struct pipe_screen *, unsigned pctx_id);
 
 extern struct pipe_screen *
-nv30_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *);
+nv30_screen_create(struct pipe_winsys *ws, struct nouveau_device *);
 
 extern struct pipe_context *
 nv30_create(struct pipe_screen *, unsigned pctx_id);
 
 extern struct pipe_screen *
-nv40_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *);
+nv40_screen_create(struct pipe_winsys *ws, struct nouveau_device *);
 
 extern struct pipe_context *
 nv40_create(struct pipe_screen *, unsigned pctx_id);
 
 extern struct pipe_screen *
-nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *);
+nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *);
 
 extern struct pipe_context *
 nv50_create(struct pipe_screen *, unsigned pctx_id);
diff --git a/src/gallium/drivers/nv04/nv04_screen.c b/src/gallium/drivers/nv04/nv04_screen.c
index f9f6d97426..4bbedfb4d6 100644
--- a/src/gallium/drivers/nv04/nv04_screen.c
+++ b/src/gallium/drivers/nv04/nv04_screen.c
@@ -1,27 +1,9 @@
 #include "pipe/p_screen.h"
 #include "pipe/p_inlines.h"
-#include "util/u_simple_screen.h"
 
 #include "nv04_context.h"
 #include "nv04_screen.h"
 
-static const char *
-nv04_screen_get_name(struct pipe_screen *screen)
-{
-	struct nv04_screen *nv04screen = nv04_screen(screen);
-	struct nouveau_device *dev = nv04screen->nvws->channel->device;
-	static char buffer[128];
-
-	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
-	return buffer;
-}
-
-static const char *
-nv04_screen_get_vendor(struct pipe_screen *screen)
-{
-	return "nouveau";
-}
-
 static int
 nv04_screen_get_param(struct pipe_screen *screen, int param)
 {
@@ -123,10 +105,9 @@ static void
 nv04_screen_destroy(struct pipe_screen *pscreen)
 {
 	struct nv04_screen *screen = nv04_screen(pscreen);
-	struct nouveau_winsys *nvws = screen->nvws;
 
-	nvws->notifier_free(&screen->sync);
-	nvws->grobj_free(&screen->fahrenheit);
+	nouveau_notifier_free(&screen->sync);
+	nouveau_grobj_free(&screen->fahrenheit);
 	nv04_surface_2d_takedown(&screen->eng2d);
 
 	FREE(pscreen);
@@ -141,21 +122,38 @@ nv04_surface_buffer(struct pipe_surface *surf)
 }
 
 struct pipe_screen *
-nv04_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
+nv04_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 {
 	struct nv04_screen *screen = CALLOC_STRUCT(nv04_screen);
+	struct nouveau_channel *chan;
+	struct pipe_screen *pscreen;
 	unsigned fahrenheit_class = 0, sub3d_class = 0;
-	unsigned chipset = nvws->channel->device->chipset;
 	int ret;
 
 	if (!screen)
 		return NULL;
-	screen->nvws = nvws;
+	pscreen = &screen->base.base;
+
+	ret = nouveau_screen_init(&screen->base, dev);
+	if (ret) {
+		nv04_screen_destroy(pscreen);
+		return NULL;
+	}
+	chan = screen->base.channel;
+
+	pscreen->winsys = ws;
+	pscreen->destroy = nv04_screen_destroy;
+	pscreen->get_param = nv04_screen_get_param;
+	pscreen->get_paramf = nv04_screen_get_paramf;
+	pscreen->is_format_supported = nv04_screen_is_format_supported;
 
-	if (chipset>=0x20) {
+	nv04_screen_init_miptree_functions(pscreen);
+	nv04_screen_init_transfer_functions(pscreen);
+
+	if (dev->chipset >= 0x20) {
 		fahrenheit_class = 0;
 		sub3d_class = 0;
-	} else if (chipset>=0x10) {
+	} else if (dev->chipset >= 0x10) {
 		fahrenheit_class = NV10_DX5_TEXTURED_TRIANGLE;
 		sub3d_class = NV10_CONTEXT_SURFACES_3D;
 	} else {
@@ -164,50 +162,40 @@ nv04_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
 	}
 
 	if (!fahrenheit_class) {
-		NOUVEAU_ERR("Unknown nv04 chipset: nv%02x\n", chipset);
+		NOUVEAU_ERR("Unknown nv04 chipset: nv%02x\n", dev->chipset);
 		return NULL;
 	}
 
-	/* 2D engine setup */
-	screen->eng2d = nv04_surface_2d_init(nvws);
-	screen->eng2d->buf = nv04_surface_buffer;
-
 	/* 3D object */
-	ret = nvws->grobj_alloc(nvws, fahrenheit_class, &screen->fahrenheit);
+	ret = nouveau_grobj_alloc(chan, 0xbeef0001, fahrenheit_class,
+				  &screen->fahrenheit);
 	if (ret) {
 		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
 		return NULL;
 	}
+	BIND_RING(chan, screen->fahrenheit, 7);
 
 	/* 3D surface object */
-	ret = nvws->grobj_alloc(nvws, sub3d_class, &screen->context_surfaces_3d);
+	ret = nouveau_grobj_alloc(chan, 0xbeef0002, sub3d_class,
+				  &screen->context_surfaces_3d);
 	if (ret) {
 		NOUVEAU_ERR("Error creating 3D surface object: %d\n", ret);
 		return NULL;
 	}
+	BIND_RING(chan, screen->context_surfaces_3d, 6);
+
+	/* 2D engine setup */
+	screen->eng2d = nv04_surface_2d_init(&screen->base);
+	screen->eng2d->buf = nv04_surface_buffer;
 
 	/* Notifier for sync purposes */
-	ret = nvws->notifier_alloc(nvws, 1, &screen->sync);
+	ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync);
 	if (ret) {
 		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
-		nv04_screen_destroy(&screen->pipe);
+		nv04_screen_destroy(pscreen);
 		return NULL;
 	}
 
-	screen->pipe.winsys = ws;
-	screen->pipe.destroy = nv04_screen_destroy;
-
-	screen->pipe.get_name = nv04_screen_get_name;
-	screen->pipe.get_vendor = nv04_screen_get_vendor;
-	screen->pipe.get_param = nv04_screen_get_param;
-	screen->pipe.get_paramf = nv04_screen_get_paramf;
-
-	screen->pipe.is_format_supported = nv04_screen_is_format_supported;
-
-	nv04_screen_init_miptree_functions(&screen->pipe);
-	nv04_screen_init_transfer_functions(&screen->pipe);
-	u_simple_screen_init(&screen->pipe);
-
-	return &screen->pipe;
+	return pscreen;
 }
 
diff --git a/src/gallium/drivers/nv04/nv04_screen.h b/src/gallium/drivers/nv04/nv04_screen.h
index ee6fb6db44..11466b9442 100644
--- a/src/gallium/drivers/nv04/nv04_screen.h
+++ b/src/gallium/drivers/nv04/nv04_screen.h
@@ -1,11 +1,11 @@
 #ifndef __NV04_SCREEN_H__
 #define __NV04_SCREEN_H__
 
-#include "pipe/p_screen.h"
+#include "nouveau/nouveau_screen.h"
 #include "nv04_surface_2d.h"
 
 struct nv04_screen {
-	struct pipe_screen pipe;
+	struct nouveau_screen base;
 
 	struct nouveau_winsys *nvws;
 	unsigned chipset;
diff --git a/src/gallium/drivers/nv04/nv04_state.c b/src/gallium/drivers/nv04/nv04_state.c
index 87c635f962..d356ebd8b3 100644
--- a/src/gallium/drivers/nv04/nv04_state.c
+++ b/src/gallium/drivers/nv04/nv04_state.c
@@ -2,6 +2,7 @@
 #include "pipe/p_state.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_inlines.h"
 
 #include "tgsi/tgsi_parse.h"
 
@@ -334,7 +335,7 @@ nv04_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
 			 const struct pipe_constant_buffer *buf )
 {
 	struct nv04_context *nv04 = nv04_context(pipe);
-	struct pipe_winsys *ws = pipe->winsys;
+	struct pipe_screen *pscreen = pipe->screen;
 
 	assert(shader < PIPE_SHADER_TYPES);
 	assert(index == 0);
@@ -342,12 +343,12 @@ nv04_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
 	if (buf) {
 		void *mapped;
 		if (buf->buffer && buf->buffer->size &&
-                    (mapped = ws->buffer_map(ws, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)))
+                    (mapped = pipe_buffer_map(pscreen, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)))
 		{
 			memcpy(nv04->constbuf[shader], mapped, buf->buffer->size);
 			nv04->constbuf_nr[shader] =
 				buf->buffer->size / (4 * sizeof(float));
-			ws->buffer_unmap(ws, buf->buffer);
+			pipe_buffer_unmap(pscreen, buf->buffer);
 		}
 	}
 }
diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.c b/src/gallium/drivers/nv04/nv04_surface_2d.c
index f3a8d7efee..5afd028ddd 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.c
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.c
@@ -4,6 +4,7 @@
 
 #include "nouveau/nouveau_winsys.h"
 #include "nouveau/nouveau_util.h"
+#include "nouveau/nouveau_screen.h"
 #include "nv04_surface_2d.h"
 
 static INLINE int
@@ -96,11 +97,11 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
 			  struct pipe_surface *src, int sx, int sy,
 			  int w, int h)
 {
-	struct nouveau_channel *chan = ctx->nvws->channel;
+	struct nouveau_channel *chan = ctx->swzsurf->channel;
 	struct nouveau_grobj *swzsurf = ctx->swzsurf;
 	struct nouveau_grobj *sifm = ctx->sifm;
-	struct nouveau_bo *src_bo = ctx->nvws->get_bo(ctx->buf(src));
-	struct nouveau_bo *dst_bo = ctx->nvws->get_bo(ctx->buf(dst));
+	struct nouveau_bo *src_bo = nouveau_bo(ctx->buf(src));
+	struct nouveau_bo *dst_bo = nouveau_bo(ctx->buf(dst));
 	const unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
 	const unsigned max_w = 1024;
 	const unsigned max_h = 1024;
@@ -167,10 +168,10 @@ nv04_surface_copy_m2mf(struct nv04_surface_2d *ctx,
 		       struct pipe_surface *dst, int dx, int dy,
 		       struct pipe_surface *src, int sx, int sy, int w, int h)
 {
-	struct nouveau_channel *chan = ctx->nvws->channel;
+	struct nouveau_channel *chan = ctx->m2mf->channel;
 	struct nouveau_grobj *m2mf = ctx->m2mf;
-	struct nouveau_bo *src_bo = ctx->nvws->get_bo(ctx->buf(src));
-	struct nouveau_bo *dst_bo = ctx->nvws->get_bo(ctx->buf(dst));
+	struct nouveau_bo *src_bo = nouveau_bo(ctx->buf(src));
+	struct nouveau_bo *dst_bo = nouveau_bo(ctx->buf(dst));
 	unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
 	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
 	unsigned dst_offset = dst->offset + dy * dst_pitch +
@@ -213,11 +214,11 @@ nv04_surface_copy_blit(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
 		       int dx, int dy, struct pipe_surface *src, int sx, int sy,
 		       int w, int h)
 {
-	struct nouveau_channel *chan = ctx->nvws->channel;
+	struct nouveau_channel *chan = ctx->surf2d->channel;
 	struct nouveau_grobj *surf2d = ctx->surf2d;
 	struct nouveau_grobj *blit = ctx->blit;
-	struct nouveau_bo *src_bo = ctx->nvws->get_bo(ctx->buf(src));
-	struct nouveau_bo *dst_bo = ctx->nvws->get_bo(ctx->buf(dst));
+	struct nouveau_bo *src_bo = nouveau_bo(ctx->buf(src));
+	struct nouveau_bo *dst_bo = nouveau_bo(ctx->buf(dst));
 	unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
 	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
 	int format;
@@ -279,10 +280,10 @@ static void
 nv04_surface_fill(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
 		  int dx, int dy, int w, int h, unsigned value)
 {
-	struct nouveau_channel *chan = ctx->nvws->channel;
+	struct nouveau_channel *chan = ctx->surf2d->channel;
 	struct nouveau_grobj *surf2d = ctx->surf2d;
 	struct nouveau_grobj *rect = ctx->rect;
-	struct nouveau_bo *dst_bo = ctx->nvws->get_bo(ctx->buf(dst));
+	struct nouveau_bo *dst_bo = nouveau_bo(ctx->buf(dst));
 	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
 	int cs2d_format, gdirect_format;
 
@@ -334,10 +335,10 @@ nv04_surface_2d_takedown(struct nv04_surface_2d **pctx)
 }
 
 struct nv04_surface_2d *
-nv04_surface_2d_init(struct nouveau_winsys *nvws)
+nv04_surface_2d_init(struct nouveau_screen *screen)
 {
 	struct nv04_surface_2d *ctx = CALLOC_STRUCT(nv04_surface_2d);
-	struct nouveau_channel *chan = nvws->channel;
+	struct nouveau_channel *chan = screen->channel;
 	unsigned handle = 0x88000000, class;
 	int ret;
 
@@ -460,7 +461,6 @@ nv04_surface_2d_init(struct nouveau_winsys *nvws)
 		return NULL;
 	}
 
-	ctx->nvws = nvws;
 	ctx->copy = nv04_surface_copy;
 	ctx->fill = nv04_surface_fill;
 	return ctx;
diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.h b/src/gallium/drivers/nv04/nv04_surface_2d.h
index 82ce7189c8..02b3f56ba8 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.h
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.h
@@ -7,7 +7,6 @@ struct nv04_surface {
 };
 
 struct nv04_surface_2d {
-	struct nouveau_winsys *nvws;
 	struct nouveau_notifier *ntfy;
 	struct nouveau_grobj *surf2d;
 	struct nouveau_grobj *swzsurf;
@@ -26,7 +25,7 @@ struct nv04_surface_2d {
 };
 
 struct nv04_surface_2d *
-nv04_surface_2d_init(struct nouveau_winsys *nvws);
+nv04_surface_2d_init(struct nouveau_screen *screen);
 
 void
 nv04_surface_2d_takedown(struct nv04_surface_2d **);
diff --git a/src/gallium/drivers/nv04/nv04_vbo.c b/src/gallium/drivers/nv04/nv04_vbo.c
index d21a0e34f7..e3167814f2 100644
--- a/src/gallium/drivers/nv04/nv04_vbo.c
+++ b/src/gallium/drivers/nv04/nv04_vbo.c
@@ -1,6 +1,7 @@
 #include "draw/draw_context.h"
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
+#include "pipe/p_inlines.h"
 
 #include "nv04_context.h"
 #include "nv04_state.h"
@@ -13,6 +14,7 @@ boolean nv04_draw_elements( struct pipe_context *pipe,
                     unsigned indexSize,
                     unsigned prim, unsigned start, unsigned count)
 {
+	struct pipe_screen *pscreen = pipe->screen;
 	struct nv04_context *nv04 = nv04_context( pipe );
 	struct draw_context *draw = nv04->draw;
 	unsigned i;
@@ -25,17 +27,17 @@ boolean nv04_draw_elements( struct pipe_context *pipe,
 	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
 		if (nv04->vtxbuf[i].buffer) {
 			void *buf
-				= pipe->winsys->buffer_map(pipe->winsys,
-						nv04->vtxbuf[i].buffer,
-						PIPE_BUFFER_USAGE_CPU_READ);
+				= pipe_buffer_map(pscreen,
+						  nv04->vtxbuf[i].buffer,
+						  PIPE_BUFFER_USAGE_CPU_READ);
 			draw_set_mapped_vertex_buffer(draw, i, buf);
 		}
 	}
 	/* Map index buffer, if present */
 	if (indexBuffer) {
 		void *mapped_indexes
-			= pipe->winsys->buffer_map(pipe->winsys, indexBuffer,
-					PIPE_BUFFER_USAGE_CPU_READ);
+			= pipe_buffer_map(pscreen, indexBuffer,
+					  PIPE_BUFFER_USAGE_CPU_READ);
 		draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes);
 	}
 	else {
@@ -55,12 +57,12 @@ boolean nv04_draw_elements( struct pipe_context *pipe,
 	 */
 	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
 		if (nv04->vtxbuf[i].buffer) {
-			pipe->winsys->buffer_unmap(pipe->winsys, nv04->vtxbuf[i].buffer);
+			pipe_buffer_unmap(pscreen, nv04->vtxbuf[i].buffer);
 			draw_set_mapped_vertex_buffer(draw, i, NULL);
 		}
 	}
 	if (indexBuffer) {
-		pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer);
+		pipe_buffer_unmap(pscreen, indexBuffer);
 		draw_set_mapped_element_buffer(draw, 0, NULL);
 	}
 
diff --git a/src/gallium/drivers/nv10/nv10_context.c b/src/gallium/drivers/nv10/nv10_context.c
index 3da8d2f568..a127b134ec 100644
--- a/src/gallium/drivers/nv10/nv10_context.c
+++ b/src/gallium/drivers/nv10/nv10_context.c
@@ -30,18 +30,18 @@ nv10_destroy(struct pipe_context *pipe)
 static void nv10_init_hwctx(struct nv10_context *nv10)
 {
 	struct nv10_screen *screen = nv10->screen;
-	struct nouveau_winsys *nvws = screen->nvws;
+	struct nouveau_channel *chan = screen->base.channel;
 	int i;
 	float projectionmatrix[16];
 
 	BEGIN_RING(celsius, NV10TCL_DMA_NOTIFY, 1);
 	OUT_RING  (screen->sync->handle);
 	BEGIN_RING(celsius, NV10TCL_DMA_IN_MEMORY0, 2);
-	OUT_RING  (nvws->channel->vram->handle);
-	OUT_RING  (nvws->channel->gart->handle);
+	OUT_RING  (chan->vram->handle);
+	OUT_RING  (chan->gart->handle);
 	BEGIN_RING(celsius, NV10TCL_DMA_IN_MEMORY2, 2);
-	OUT_RING  (nvws->channel->vram->handle);
-	OUT_RING  (nvws->channel->vram->handle);
+	OUT_RING  (chan->vram->handle);
+	OUT_RING  (chan->vram->handle);
 
 	BEGIN_RING(celsius, NV10TCL_NOP, 1);
 	OUT_RING  (0);
diff --git a/src/gallium/drivers/nv10/nv10_prim_vbuf.c b/src/gallium/drivers/nv10/nv10_prim_vbuf.c
index 089c236302..1806d5f8cc 100644
--- a/src/gallium/drivers/nv10/nv10_prim_vbuf.c
+++ b/src/gallium/drivers/nv10/nv10_prim_vbuf.c
@@ -40,7 +40,6 @@
 
 #include "util/u_debug.h"
 #include "pipe/p_inlines.h"
-#include "pipe/internal/p_winsys_screen.h"
 
 #include "nv10_context.h"
 #include "nv10_state.h"
@@ -124,11 +123,10 @@ nv10_vbuf_render_map_vertices( struct vbuf_render *render )
 {
 	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render);
 	struct nv10_context *nv10 = nv10_render->nv10;
-	struct pipe_winsys *winsys = nv10->pipe.winsys;
+	struct pipe_screen *pscreen = nv10->pipe.screen;
 
-	return winsys->buffer_map(winsys, 
-			nv10_render->buffer, 
-			PIPE_BUFFER_USAGE_CPU_WRITE);
+	return pipe_buffer_map(pscreen, nv10_render->buffer,
+			       PIPE_BUFFER_USAGE_CPU_WRITE);
 }
 
 static void
@@ -138,10 +136,10 @@ nv10_vbuf_render_unmap_vertices( struct vbuf_render *render,
 {
 	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render);
 	struct nv10_context *nv10 = nv10_render->nv10;
-	struct pipe_winsys *winsys = nv10->pipe.winsys;
+	struct pipe_screen *pscreen = nv10->pipe.screen;
 
 	assert(!nv10_render->buffer);
-	winsys->buffer_unmap(winsys, nv10_render->buffer);
+	pipe_buffer_unmap(pscreen, nv10_render->buffer);
 }
 
 static boolean
@@ -202,8 +200,6 @@ static void
 nv10_vbuf_render_release_vertices( struct vbuf_render *render )
 {
 	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render);
-	struct nv10_context *nv10 = nv10_render->nv10;
-	struct pipe_screen *pscreen = &nv10->screen->pipe;
 
 	assert(nv10_render->buffer);
 	pipe_buffer_reference(&nv10_render->buffer, NULL);
diff --git a/src/gallium/drivers/nv10/nv10_screen.c b/src/gallium/drivers/nv10/nv10_screen.c
index 6532a93c7b..b03c291f9d 100644
--- a/src/gallium/drivers/nv10/nv10_screen.c
+++ b/src/gallium/drivers/nv10/nv10_screen.c
@@ -1,26 +1,8 @@
 #include "pipe/p_screen.h"
-#include "util/u_simple_screen.h"
 
 #include "nv10_context.h"
 #include "nv10_screen.h"
 
-static const char *
-nv10_screen_get_name(struct pipe_screen *screen)
-{
-	struct nv10_screen *nv10screen = nv10_screen(screen);
-	struct nouveau_device *dev = nv10screen->nvws->channel->device;
-	static char buffer[128];
-
-	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
-	return buffer;
-}
-
-static const char *
-nv10_screen_get_vendor(struct pipe_screen *screen)
-{
-	return "nouveau";
-}
-
 static int
 nv10_screen_get_param(struct pipe_screen *screen, int param)
 {
@@ -120,10 +102,9 @@ static void
 nv10_screen_destroy(struct pipe_screen *pscreen)
 {
 	struct nv10_screen *screen = nv10_screen(pscreen);
-	struct nouveau_winsys *nvws = screen->nvws;
 
-	nvws->notifier_free(&screen->sync);
-	nvws->grobj_free(&screen->celsius);
+	nouveau_notifier_free(&screen->sync);
+	nouveau_grobj_free(&screen->celsius);
 
 	FREE(pscreen);
 }
@@ -137,64 +118,69 @@ nv10_surface_buffer(struct pipe_surface *surf)
 }
 
 struct pipe_screen *
-nv10_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
+nv10_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 {
 	struct nv10_screen *screen = CALLOC_STRUCT(nv10_screen);
+	struct nouveau_channel *chan;
+	struct pipe_screen *pscreen;
 	unsigned celsius_class;
-	unsigned chipset = nvws->channel->device->chipset;
 	int ret;
 
 	if (!screen)
 		return NULL;
-	screen->nvws = nvws;
+	pscreen = &screen->base.base;
 
-	/* 2D engine setup */
-	screen->eng2d = nv04_surface_2d_init(nvws);
-	screen->eng2d->buf = nv10_surface_buffer;
+	ret = nouveau_screen_init(&screen->base, dev);
+	if (ret) {
+		nv10_screen_destroy(pscreen);
+		return NULL;
+	}
+	chan = screen->base.channel;
+
+	pscreen->winsys = ws;
+	pscreen->destroy = nv10_screen_destroy;
+	pscreen->get_param = nv10_screen_get_param;
+	pscreen->get_paramf = nv10_screen_get_paramf;
+	pscreen->is_format_supported = nv10_screen_is_format_supported;
+
+	nv10_screen_init_miptree_functions(pscreen);
+	nv10_screen_init_transfer_functions(pscreen);
 
 	/* 3D object */
-	if (chipset>=0x20)
-		celsius_class=NV11TCL;
-	else if (chipset>=0x17)
-		celsius_class=NV17TCL;
-	else if (chipset>=0x11)
-		celsius_class=NV11TCL;
+	if (dev->chipset >= 0x20)
+		celsius_class = NV11TCL;
+	else if (dev->chipset >= 0x17)
+		celsius_class = NV17TCL;
+	else if (dev->chipset >= 0x11)
+		celsius_class = NV11TCL;
 	else
-		celsius_class=NV10TCL;
+		celsius_class = NV10TCL;
 
 	if (!celsius_class) {
-		NOUVEAU_ERR("Unknown nv1x chipset: nv%02x\n", chipset);
+		NOUVEAU_ERR("Unknown nv1x chipset: nv%02x\n", dev->chipset);
 		return NULL;
 	}
 
-	ret = nvws->grobj_alloc(nvws, celsius_class, &screen->celsius);
+	ret = nouveau_grobj_alloc(chan, 0xbeef0001, celsius_class,
+				  &screen->celsius);
 	if (ret) {
 		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
 		return FALSE;
 	}
+	BIND_RING(chan, screen->celsius, 7);
+
+	/* 2D engine setup */
+	screen->eng2d = nv04_surface_2d_init(&screen->base);
+	screen->eng2d->buf = nv10_surface_buffer;
 
 	/* Notifier for sync purposes */
-	ret = nvws->notifier_alloc(nvws, 1, &screen->sync);
+	ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync);
 	if (ret) {
 		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
-		nv10_screen_destroy(&screen->pipe);
+		nv10_screen_destroy(pscreen);
 		return NULL;
 	}
 
-	screen->pipe.winsys = ws;
-	screen->pipe.destroy = nv10_screen_destroy;
-
-	screen->pipe.get_name = nv10_screen_get_name;
-	screen->pipe.get_vendor = nv10_screen_get_vendor;
-	screen->pipe.get_param = nv10_screen_get_param;
-	screen->pipe.get_paramf = nv10_screen_get_paramf;
-
-	screen->pipe.is_format_supported = nv10_screen_is_format_supported;
-
-	nv10_screen_init_miptree_functions(&screen->pipe);
-	nv10_screen_init_transfer_functions(&screen->pipe);
-	u_simple_screen_init(&screen->pipe);
-
-	return &screen->pipe;
+	return pscreen;
 }
 
diff --git a/src/gallium/drivers/nv10/nv10_screen.h b/src/gallium/drivers/nv10/nv10_screen.h
index ad829ee3fd..86b6d8def5 100644
--- a/src/gallium/drivers/nv10/nv10_screen.h
+++ b/src/gallium/drivers/nv10/nv10_screen.h
@@ -1,11 +1,11 @@
 #ifndef __NV10_SCREEN_H__
 #define __NV10_SCREEN_H__
 
-#include "pipe/p_screen.h"
+#include "nouveau/nouveau_screen.h"
 #include "nv04/nv04_surface_2d.h"
 
 struct nv10_screen {
-	struct pipe_screen pipe;
+	struct nouveau_screen base;
 
 	struct nouveau_winsys *nvws;
 
diff --git a/src/gallium/drivers/nv10/nv10_state.c b/src/gallium/drivers/nv10/nv10_state.c
index 119af66dfd..9b38219b99 100644
--- a/src/gallium/drivers/nv10/nv10_state.c
+++ b/src/gallium/drivers/nv10/nv10_state.c
@@ -2,6 +2,7 @@
 #include "pipe/p_state.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_inlines.h"
 
 #include "tgsi/tgsi_parse.h"
 
@@ -460,7 +461,7 @@ nv10_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
 			 const struct pipe_constant_buffer *buf )
 {
 	struct nv10_context *nv10 = nv10_context(pipe);
-	struct pipe_winsys *ws = pipe->winsys;
+	struct pipe_screen *pscreen = pipe->screen;
 
 	assert(shader < PIPE_SHADER_TYPES);
 	assert(index == 0);
@@ -468,12 +469,12 @@ nv10_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
 	if (buf) {
 		void *mapped;
 		if (buf->buffer && buf->buffer->size &&
-                    (mapped = ws->buffer_map(ws, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)))
+                    (mapped = pipe_buffer_map(pscreen, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)))
 		{
 			memcpy(nv10->constbuf[shader], mapped, buf->buffer->size);
 			nv10->constbuf_nr[shader] =
 				buf->buffer->size / (4 * sizeof(float));
-			ws->buffer_unmap(ws, buf->buffer);
+			pipe_buffer_unmap(pscreen, buf->buffer);
 		}
 	}
 }
diff --git a/src/gallium/drivers/nv10/nv10_vbo.c b/src/gallium/drivers/nv10/nv10_vbo.c
index d0e788ac03..441a4f75f3 100644
--- a/src/gallium/drivers/nv10/nv10_vbo.c
+++ b/src/gallium/drivers/nv10/nv10_vbo.c
@@ -1,6 +1,7 @@
 #include "draw/draw_context.h"
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
+#include "pipe/p_inlines.h"
 
 #include "nv10_context.h"
 #include "nv10_state.h"
@@ -15,6 +16,7 @@ boolean nv10_draw_elements( struct pipe_context *pipe,
 {
 	struct nv10_context *nv10 = nv10_context( pipe );
 	struct draw_context *draw = nv10->draw;
+	struct pipe_screen *pscreen = pipe->screen;
 	unsigned i;
 
 	nv10_emit_hw_state(nv10);
@@ -24,9 +26,8 @@ boolean nv10_draw_elements( struct pipe_context *pipe,
 	 */
 	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
 		if (nv10->vtxbuf[i].buffer) {
-			void *buf
-				= pipe->winsys->buffer_map(pipe->winsys,
-						nv10->vtxbuf[i].buffer,
+			void *buf =
+				pipe_buffer_map(pscreen, nv10->vtxbuf[i].buffer,
 						PIPE_BUFFER_USAGE_CPU_READ);
 			draw_set_mapped_vertex_buffer(draw, i, buf);
 		}
@@ -34,8 +35,8 @@ boolean nv10_draw_elements( struct pipe_context *pipe,
 	/* Map index buffer, if present */
 	if (indexBuffer) {
 		void *mapped_indexes
-			= pipe->winsys->buffer_map(pipe->winsys, indexBuffer,
-					PIPE_BUFFER_USAGE_CPU_READ);
+			= pipe_buffer_map(pscreen, indexBuffer,
+					  PIPE_BUFFER_USAGE_CPU_READ);
 		draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes);
 	}
 	else {
@@ -55,12 +56,12 @@ boolean nv10_draw_elements( struct pipe_context *pipe,
 	 */
 	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
 		if (nv10->vtxbuf[i].buffer) {
-			pipe->winsys->buffer_unmap(pipe->winsys, nv10->vtxbuf[i].buffer);
+			pipe_buffer_unmap(pscreen, nv10->vtxbuf[i].buffer);
 			draw_set_mapped_vertex_buffer(draw, i, NULL);
 		}
 	}
 	if (indexBuffer) {
-		pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer);
+		pipe_buffer_unmap(pscreen, indexBuffer);
 		draw_set_mapped_element_buffer(draw, 0, NULL);
 	}
 
diff --git a/src/gallium/drivers/nv20/nv20_context.c b/src/gallium/drivers/nv20/nv20_context.c
index cbc41707d5..b32d0d83ba 100644
--- a/src/gallium/drivers/nv20/nv20_context.c
+++ b/src/gallium/drivers/nv20/nv20_context.c
@@ -30,7 +30,7 @@ nv20_destroy(struct pipe_context *pipe)
 static void nv20_init_hwctx(struct nv20_context *nv20)
 {
 	struct nv20_screen *screen = nv20->screen;
-	struct nouveau_winsys *nvws = screen->nvws;
+	struct nouveau_channel *chan = screen->base.channel;
 	int i;
 	float projectionmatrix[16];
 	const boolean is_nv25tcl = (nv20->screen->kelvin->grclass == NV25TCL);
@@ -38,11 +38,11 @@ static void nv20_init_hwctx(struct nv20_context *nv20)
 	BEGIN_RING(kelvin, NV20TCL_DMA_NOTIFY, 1);
 	OUT_RING  (screen->sync->handle);
 	BEGIN_RING(kelvin, NV20TCL_DMA_TEXTURE0, 2);
-	OUT_RING  (nvws->channel->vram->handle);
-	OUT_RING  (nvws->channel->gart->handle); /* TEXTURE1 */
+	OUT_RING  (chan->vram->handle);
+	OUT_RING  (chan->gart->handle); /* TEXTURE1 */
 	BEGIN_RING(kelvin, NV20TCL_DMA_COLOR, 2);
-	OUT_RING  (nvws->channel->vram->handle);
-	OUT_RING  (nvws->channel->vram->handle); /* ZETA */
+	OUT_RING  (chan->vram->handle);
+	OUT_RING  (chan->vram->handle); /* ZETA */
 
 	BEGIN_RING(kelvin, NV20TCL_DMA_QUERY, 1);
 	OUT_RING  (0); /* renouveau: beef0351, unique */
@@ -99,9 +99,9 @@ static void nv20_init_hwctx(struct nv20_context *nv20)
 		OUT_RING  (3);
 
 		BEGIN_RING(kelvin, NV25TCL_DMA_IN_MEMORY9, 1);
-		OUT_RING  (nvws->channel->vram->handle);
+		OUT_RING  (chan->vram->handle);
 		BEGIN_RING(kelvin, NV25TCL_DMA_IN_MEMORY8, 1);
-		OUT_RING  (nvws->channel->vram->handle);
+		OUT_RING  (chan->vram->handle);
 	}
 	BEGIN_RING(kelvin, NV20TCL_DMA_FENCE, 1);
 	OUT_RING  (0);	/* renouveau: beef1e10 */
diff --git a/src/gallium/drivers/nv20/nv20_prim_vbuf.c b/src/gallium/drivers/nv20/nv20_prim_vbuf.c
index 8aa342cd2d..ddfcdb8057 100644
--- a/src/gallium/drivers/nv20/nv20_prim_vbuf.c
+++ b/src/gallium/drivers/nv20/nv20_prim_vbuf.c
@@ -152,12 +152,11 @@ static void *
 nv20_vbuf_render_map_vertices( struct vbuf_render *render )
 {
 	struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render);
-	struct pipe_winsys *winsys = nv20_render->nv20->pipe.winsys;
+	struct pipe_screen *pscreen = nv20_render->nv20->pipe.screen;
 
 	if (nv20_render->pbuffer) {
-		return winsys->buffer_map(winsys,
-				nv20_render->pbuffer,
-				PIPE_BUFFER_USAGE_CPU_WRITE);
+		return pipe_buffer_map(pscreen, nv20_render->pbuffer,
+				       PIPE_BUFFER_USAGE_CPU_WRITE);
 	} else if (nv20_render->mbuffer) {
 		return nv20_render->mbuffer;
 	} else
@@ -173,10 +172,10 @@ nv20_vbuf_render_unmap_vertices( struct vbuf_render *render,
 		ushort max_index )
 {
 	struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render);
-	struct pipe_winsys *winsys = nv20_render->nv20->pipe.winsys;
+	struct pipe_screen *pscreen = nv20_render->nv20->pipe.screen;
 
 	if (nv20_render->pbuffer)
-		winsys->buffer_unmap(winsys, nv20_render->pbuffer);
+		pipe_buffer_unmap(pscreen, nv20_render->pbuffer);
 }
 
 static boolean
@@ -358,7 +357,6 @@ nv20_vbuf_render_release_vertices( struct vbuf_render *render )
 {
 	struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render);
 	struct nv20_context *nv20 = nv20_render->nv20;
-	struct pipe_screen *pscreen = &nv20->screen->pipe;
 
 	if (nv20_render->pbuffer) {
 		pipe_buffer_reference(&nv20_render->pbuffer, NULL);
diff --git a/src/gallium/drivers/nv20/nv20_screen.c b/src/gallium/drivers/nv20/nv20_screen.c
index 7760ae27c0..024356ca74 100644
--- a/src/gallium/drivers/nv20/nv20_screen.c
+++ b/src/gallium/drivers/nv20/nv20_screen.c
@@ -1,26 +1,8 @@
 #include "pipe/p_screen.h"
-#include "util/u_simple_screen.h"
 
 #include "nv20_context.h"
 #include "nv20_screen.h"
 
-static const char *
-nv20_screen_get_name(struct pipe_screen *screen)
-{
-	struct nv20_screen *nv20screen = nv20_screen(screen);
-	struct nouveau_device *dev = nv20screen->nvws->channel->device;
-	static char buffer[128];
-
-	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
-	return buffer;
-}
-
-static const char *
-nv20_screen_get_vendor(struct pipe_screen *screen)
-{
-	return "nouveau";
-}
-
 static int
 nv20_screen_get_param(struct pipe_screen *screen, int param)
 {
@@ -120,10 +102,9 @@ static void
 nv20_screen_destroy(struct pipe_screen *pscreen)
 {
 	struct nv20_screen *screen = nv20_screen(pscreen);
-	struct nouveau_winsys *nvws = screen->nvws;
 
-	nvws->notifier_free(&screen->sync);
-	nvws->grobj_free(&screen->kelvin);
+	nouveau_notifier_free(&screen->sync);
+	nouveau_grobj_free(&screen->kelvin);
 
 	FREE(pscreen);
 }
@@ -137,60 +118,65 @@ nv20_surface_buffer(struct pipe_surface *surf)
 }
 
 struct pipe_screen *
-nv20_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
+nv20_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 {
 	struct nv20_screen *screen = CALLOC_STRUCT(nv20_screen);
+	struct nouveau_channel *chan;
+	struct pipe_screen *pscreen;
 	unsigned kelvin_class = 0;
-	unsigned chipset = nvws->channel->device->chipset;
 	int ret;
 
 	if (!screen)
 		return NULL;
-	screen->nvws = nvws;
+	pscreen = &screen->base.base;
 
-	/* 2D engine setup */
-	screen->eng2d = nv04_surface_2d_init(nvws);
-	screen->eng2d->buf = nv20_surface_buffer;
+	ret = nouveau_screen_init(&screen->base, dev);
+	if (ret) {
+		nv20_screen_destroy(pscreen);
+		return NULL;
+	}
+	chan = screen->base.channel;
+
+	pscreen->winsys = ws;
+	pscreen->destroy = nv20_screen_destroy;
+	pscreen->get_param = nv20_screen_get_param;
+	pscreen->get_paramf = nv20_screen_get_paramf;
+	pscreen->is_format_supported = nv20_screen_is_format_supported;
+
+	nv20_screen_init_miptree_functions(pscreen);
+	nv20_screen_init_transfer_functions(pscreen);
 
 	/* 3D object */
-	if (chipset >= 0x25)
+	if (dev->chipset >= 0x25)
 		kelvin_class = NV25TCL;
-	else if (chipset >= 0x20)
+	else if (dev->chipset >= 0x20)
 		kelvin_class = NV20TCL;
 
-	if (!kelvin_class || chipset >= 0x30) {
-		NOUVEAU_ERR("Unknown nv2x chipset: nv%02x\n", chipset);
+	if (!kelvin_class || dev->chipset >= 0x30) {
+		NOUVEAU_ERR("Unknown nv2x chipset: nv%02x\n", dev->chipset);
 		return NULL;
 	}
 
-	ret = nvws->grobj_alloc(nvws, kelvin_class, &screen->kelvin);
+	ret = nouveau_grobj_alloc(chan, 0xbeef0097, kelvin_class,
+				  &screen->kelvin);
 	if (ret) {
 		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
 		return FALSE;
 	}
+	BIND_RING(chan, screen->kelvin, 7);
+
+	/* 2D engine setup */
+	screen->eng2d = nv04_surface_2d_init(&screen->base);
+	screen->eng2d->buf = nv20_surface_buffer;
 
 	/* Notifier for sync purposes */
-	ret = nvws->notifier_alloc(nvws, 1, &screen->sync);
+	ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync);
 	if (ret) {
 		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
-		nv20_screen_destroy(&screen->pipe);
+		nv20_screen_destroy(pscreen);
 		return NULL;
 	}
 
-	screen->pipe.winsys = ws;
-	screen->pipe.destroy = nv20_screen_destroy;
-
-	screen->pipe.get_name = nv20_screen_get_name;
-	screen->pipe.get_vendor = nv20_screen_get_vendor;
-	screen->pipe.get_param = nv20_screen_get_param;
-	screen->pipe.get_paramf = nv20_screen_get_paramf;
-
-	screen->pipe.is_format_supported = nv20_screen_is_format_supported;
-
-	nv20_screen_init_miptree_functions(&screen->pipe);
-	nv20_screen_init_transfer_functions(&screen->pipe);
-	u_simple_screen_init(&screen->pipe);
-
-	return &screen->pipe;
+	return pscreen;
 }
 
diff --git a/src/gallium/drivers/nv20/nv20_screen.h b/src/gallium/drivers/nv20/nv20_screen.h
index d9fce2bced..fc7bb05033 100644
--- a/src/gallium/drivers/nv20/nv20_screen.h
+++ b/src/gallium/drivers/nv20/nv20_screen.h
@@ -1,11 +1,11 @@
 #ifndef __NV20_SCREEN_H__
 #define __NV20_SCREEN_H__
 
-#include "pipe/p_screen.h"
+#include "nouveau/nouveau_screen.h"
 #include "nv04/nv04_surface_2d.h"
 
 struct nv20_screen {
-	struct pipe_screen pipe;
+	struct nouveau_screen base;
 
 	struct nouveau_winsys *nvws;
 
diff --git a/src/gallium/drivers/nv20/nv20_state.c b/src/gallium/drivers/nv20/nv20_state.c
index ecec4f49a0..ed4084980f 100644
--- a/src/gallium/drivers/nv20/nv20_state.c
+++ b/src/gallium/drivers/nv20/nv20_state.c
@@ -2,6 +2,7 @@
 #include "pipe/p_state.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_inlines.h"
 
 #include "tgsi/tgsi_parse.h"
 
@@ -453,7 +454,7 @@ nv20_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
 			 const struct pipe_constant_buffer *buf )
 {
 	struct nv20_context *nv20 = nv20_context(pipe);
-	struct pipe_winsys *ws = pipe->winsys;
+	struct pipe_screen *pscreen = pipe->screen;
 
 	assert(shader < PIPE_SHADER_TYPES);
 	assert(index == 0);
@@ -461,12 +462,12 @@ nv20_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
 	if (buf) {
 		void *mapped;
 		if (buf->buffer && buf->buffer->size &&
-                    (mapped = ws->buffer_map(ws, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)))
+                    (mapped = pipe_buffer_map(pscreen, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)))
 		{
 			memcpy(nv20->constbuf[shader], mapped, buf->buffer->size);
 			nv20->constbuf_nr[shader] =
 				buf->buffer->size / (4 * sizeof(float));
-			ws->buffer_unmap(ws, buf->buffer);
+			pipe_buffer_unmap(pscreen, buf->buffer);
 		}
 	}
 }
diff --git a/src/gallium/drivers/nv20/nv20_vbo.c b/src/gallium/drivers/nv20/nv20_vbo.c
index 24d8f4bef0..84d7db6c5e 100644
--- a/src/gallium/drivers/nv20/nv20_vbo.c
+++ b/src/gallium/drivers/nv20/nv20_vbo.c
@@ -1,6 +1,7 @@
 #include "draw/draw_context.h"
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
+#include "pipe/p_inlines.h"
 
 #include "nv20_context.h"
 #include "nv20_state.h"
@@ -13,6 +14,7 @@ boolean nv20_draw_elements( struct pipe_context *pipe,
                     unsigned indexSize,
                     unsigned prim, unsigned start, unsigned count)
 {
+	struct pipe_screen *pscreen = pipe->screen;
 	struct nv20_context *nv20 = nv20_context( pipe );
 	struct draw_context *draw = nv20->draw;
 	unsigned i;
@@ -25,17 +27,17 @@ boolean nv20_draw_elements( struct pipe_context *pipe,
 	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
 		if (nv20->vtxbuf[i].buffer) {
 			void *buf
-				= pipe->winsys->buffer_map(pipe->winsys,
-						nv20->vtxbuf[i].buffer,
-						PIPE_BUFFER_USAGE_CPU_READ);
+				= pipe_buffer_map(pscreen,
+						  nv20->vtxbuf[i].buffer,
+						  PIPE_BUFFER_USAGE_CPU_READ);
 			draw_set_mapped_vertex_buffer(draw, i, buf);
 		}
 	}
 	/* Map index buffer, if present */
 	if (indexBuffer) {
 		void *mapped_indexes
-			= pipe->winsys->buffer_map(pipe->winsys, indexBuffer,
-					PIPE_BUFFER_USAGE_CPU_READ);
+			= pipe_buffer_map(pscreen, indexBuffer,
+					  PIPE_BUFFER_USAGE_CPU_READ);
 		draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes);
 	}
 	else {
@@ -55,12 +57,12 @@ boolean nv20_draw_elements( struct pipe_context *pipe,
 	 */
 	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
 		if (nv20->vtxbuf[i].buffer) {
-			pipe->winsys->buffer_unmap(pipe->winsys, nv20->vtxbuf[i].buffer);
+			pipe_buffer_unmap(pscreen, nv20->vtxbuf[i].buffer);
 			draw_set_mapped_vertex_buffer(draw, i, NULL);
 		}
 	}
 	if (indexBuffer) {
-		pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer);
+		pipe_buffer_unmap(pscreen, indexBuffer);
 		draw_set_mapped_element_buffer(draw, 0, NULL);
 	}
 
diff --git a/src/gallium/drivers/nv20/nv20_vertprog.c b/src/gallium/drivers/nv20/nv20_vertprog.c
index 5db0e807ff..c1e588902b 100644
--- a/src/gallium/drivers/nv20/nv20_vertprog.c
+++ b/src/gallium/drivers/nv20/nv20_vertprog.c
@@ -1,6 +1,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
+#include "pipe/p_inlines.h"
 
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
@@ -645,8 +646,8 @@ out_err:
 static boolean
 nv20_vertprog_validate(struct nv20_context *nv20)
 { 
+	struct pipe_screen *pscreen = nv20->pipe.screen;
 	struct nouveau_winsys *nvws = nv20->nvws;
-	struct pipe_winsys *ws = nv20->pipe.winsys;
 	struct nouveau_grobj *rankine = nv20->screen->rankine;
 	struct nv20_vertex_program *vp;
 	struct pipe_buffer *constbuf;
@@ -749,8 +750,8 @@ nv20_vertprog_validate(struct nv20_context *nv20)
 		float *map = NULL;
 
 		if (constbuf) {
-			map = ws->buffer_map(ws, constbuf,
-					     PIPE_BUFFER_USAGE_CPU_READ);
+			map = pipe_buffer_map(pscreen, constbuf,
+					      PIPE_BUFFER_USAGE_CPU_READ);
 		}
 
 		for (i = 0; i < vp->nr_consts; i++) {
@@ -770,9 +771,8 @@ nv20_vertprog_validate(struct nv20_context *nv20)
 			OUT_RINGp ((uint32_t *)vpd->value, 4);
 		}
 
-		if (constbuf) {
-			ws->buffer_unmap(ws, constbuf);
-		}
+		if (constbuf)
+			pipe_buffer_unmap(pscreen, constbuf);
 	}
 
 	/* Upload vtxprog */
diff --git a/src/gallium/drivers/nv30/nv30_fragprog.c b/src/gallium/drivers/nv30/nv30_fragprog.c
index bdfe1425d2..1d1c556fb1 100644
--- a/src/gallium/drivers/nv30/nv30_fragprog.c
+++ b/src/gallium/drivers/nv30/nv30_fragprog.c
@@ -1,6 +1,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
+#include "pipe/p_inlines.h"
 
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
@@ -798,12 +799,12 @@ static void
 nv30_fragprog_upload(struct nv30_context *nv30,
 		     struct nv30_fragment_program *fp)
 {
-	struct pipe_winsys *ws = nv30->pipe.winsys;
+	struct pipe_screen *pscreen = nv30->pipe.screen;
 	const uint32_t le = 1;
 	uint32_t *map;
 	int i;
 
-	map = ws->buffer_map(ws, fp->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
+	map = pipe_buffer_map(pscreen, fp->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
 
 #if 0
 	for (i = 0; i < fp->insn_len; i++) {
@@ -825,7 +826,7 @@ nv30_fragprog_upload(struct nv30_context *nv30,
 		}
 	}
 
-	ws->buffer_unmap(ws, fp->buffer);
+	pipe_buffer_unmap(pscreen, fp->buffer);
 }
 
 static boolean
@@ -834,8 +835,7 @@ nv30_fragprog_validate(struct nv30_context *nv30)
 	struct nv30_fragment_program *fp = nv30->fragprog;
 	struct pipe_buffer *constbuf =
 		nv30->constbuf[PIPE_SHADER_FRAGMENT];
-	struct pipe_screen *screen = nv30->pipe.screen;
-	struct pipe_winsys *ws = nv30->pipe.winsys;
+	struct pipe_screen *pscreen = nv30->pipe.screen;
 	struct nouveau_stateobj *so;
 	boolean new_consts = FALSE;
 	int i;
@@ -850,14 +850,15 @@ nv30_fragprog_validate(struct nv30_context *nv30)
 		return FALSE;
 	}
 
-	fp->buffer = screen->buffer_create(screen, 0x100, 0, fp->insn_len * 4);
+	fp->buffer = pscreen->buffer_create(pscreen, 0x100, 0, fp->insn_len * 4);
 	nv30_fragprog_upload(nv30, fp);
 
 	so = so_new(8, 1);
 	so_method(so, nv30->screen->rankine, NV34TCL_FP_ACTIVE_PROGRAM, 1);
-	so_reloc (so, fp->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
-		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
-		  NV34TCL_FP_ACTIVE_PROGRAM_DMA0, NV34TCL_FP_ACTIVE_PROGRAM_DMA1);
+	so_reloc (so, nouveau_bo(fp->buffer), 0, NOUVEAU_BO_VRAM |
+		      NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
+		      NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0,
+		      NV34TCL_FP_ACTIVE_PROGRAM_DMA1);
 	so_method(so, nv30->screen->rankine, NV34TCL_FP_CONTROL, 1);
 	so_data  (so, fp->fp_control);
 	so_method(so, nv30->screen->rankine, NV34TCL_FP_REG_CONTROL, 1);
@@ -871,7 +872,8 @@ update_constants:
 	if (fp->nr_consts) {
 		float *map;
 		
-		map = ws->buffer_map(ws, constbuf, PIPE_BUFFER_USAGE_CPU_READ);
+		map = pipe_buffer_map(pscreen, constbuf,
+				      PIPE_BUFFER_USAGE_CPU_READ);
 		for (i = 0; i < fp->nr_consts; i++) {
 			struct nv30_fragment_program_data *fpd = &fp->consts[i];
 			uint32_t *p = &fp->insn[fpd->offset];
@@ -882,7 +884,7 @@ update_constants:
 			memcpy(p, cb, 4 * sizeof(float));
 			new_consts = TRUE;
 		}
-		ws->buffer_unmap(ws, constbuf);
+		pipe_buffer_unmap(pscreen, constbuf);
 
 		if (new_consts)
 			nv30_fragprog_upload(nv30, fp);
diff --git a/src/gallium/drivers/nv30/nv30_fragtex.c b/src/gallium/drivers/nv30/nv30_fragtex.c
index 8b6ab992d1..822e1d8def 100644
--- a/src/gallium/drivers/nv30/nv30_fragtex.c
+++ b/src/gallium/drivers/nv30/nv30_fragtex.c
@@ -61,6 +61,7 @@ nv30_fragtex_build(struct nv30_context *nv30, int unit)
 	struct nv30_sampler_state *ps = nv30->tex_sampler[unit];
 	struct nv30_miptree *nv30mt = nv30->tex_miptree[unit];
 	struct pipe_texture *pt = &nv30mt->base;
+	struct nouveau_bo *bo = nouveau_bo(nv30mt->buffer);
 	struct nv30_texture_format *tf;
 	struct nouveau_stateobj *so;
 	uint32_t txf, txs , txp;
@@ -106,9 +107,9 @@ nv30_fragtex_build(struct nv30_context *nv30, int unit)
 
 	so = so_new(16, 2);
 	so_method(so, nv30->screen->rankine, NV34TCL_TX_OFFSET(unit), 8);
-	so_reloc (so, nv30mt->buffer, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
-	so_reloc (so, nv30mt->buffer, txf, tex_flags | NOUVEAU_BO_OR,
-		  NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
+	so_reloc (so, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
+	so_reloc (so, bo, txf, tex_flags | NOUVEAU_BO_OR,
+		      NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
 	so_data  (so, ps->wrap);
 	so_data  (so, NV34TCL_TX_ENABLE_ENABLE | ps->en);
 	so_data  (so, txs);
diff --git a/src/gallium/drivers/nv30/nv30_query.c b/src/gallium/drivers/nv30/nv30_query.c
index 2f974cf5c4..1d1c8a484e 100644
--- a/src/gallium/drivers/nv30/nv30_query.c
+++ b/src/gallium/drivers/nv30/nv30_query.c
@@ -29,11 +29,10 @@ nv30_query_create(struct pipe_context *pipe, unsigned query_type)
 static void
 nv30_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
 {
-	struct nv30_context *nv30 = nv30_context(pipe);
 	struct nv30_query *q = nv30_query(pq);
 
 	if (q->object)
-		nv30->nvws->res_free(&q->object);
+		nouveau_resource_free(&q->object);
 	FREE(q);
 }
 
@@ -54,9 +53,9 @@ nv30_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 		pipe->get_query_result(pipe, pq, 1, &tmp);
 	}
 
-	if (nv30->nvws->res_alloc(nv30->screen->query_heap, 1, NULL, &q->object))
+	if (nouveau_resource_alloc(nv30->screen->query_heap, 1, NULL, &q->object))
 		assert(0);
-	nv30->nvws->notifier_reset(nv30->screen->query, q->object->start);
+	nouveau_notifier_reset(nv30->screen->query, q->object->start);
 
 	BEGIN_RING(rankine, NV34TCL_QUERY_RESET, 1);
 	OUT_RING  (1);
@@ -84,27 +83,27 @@ nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq,
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
 	struct nv30_query *q = nv30_query(pq);
-	struct nouveau_winsys *nvws = nv30->nvws;
 
 	assert(q->object && q->type == PIPE_QUERY_OCCLUSION_COUNTER);
 
 	if (!q->ready) {
 		unsigned status;
 
-		status = nvws->notifier_status(nv30->screen->query,
-					       q->object->start);
+		status = nouveau_notifier_status(nv30->screen->query,
+						 q->object->start);
 		if (status != NV_NOTIFY_STATE_STATUS_COMPLETED) {
 			if (wait == FALSE)
 				return FALSE;
-			nvws->notifier_wait(nv30->screen->query, q->object->start,
-					    NV_NOTIFY_STATE_STATUS_COMPLETED,
-					    0);
+
+			nouveau_notifier_wait_status(nv30->screen->query,
+					q->object->start,
+					NV_NOTIFY_STATE_STATUS_COMPLETED, 0);
 		}
 
-		q->result = nvws->notifier_retval(nv30->screen->query,
-						  q->object->start);
+		q->result = nouveau_notifier_return_val(nv30->screen->query,
+							q->object->start);
 		q->ready = TRUE;
-		nvws->res_free(&q->object);
+		nouveau_resource_free(&q->object);
 	}
 
 	*result = q->result;
diff --git a/src/gallium/drivers/nv30/nv30_screen.c b/src/gallium/drivers/nv30/nv30_screen.c
index d395c5e1b7..31bc1f3173 100644
--- a/src/gallium/drivers/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nv30/nv30_screen.c
@@ -1,5 +1,7 @@
 #include "pipe/p_screen.h"
-#include "util/u_simple_screen.h"
+#include "pipe/p_state.h"
+
+#include "nouveau/nouveau_screen.h"
 
 #include "nv30_context.h"
 #include "nv30_screen.h"
@@ -8,23 +10,6 @@
 #define NV34TCL_CHIPSET_3X_MASK 0x00000010
 #define NV35TCL_CHIPSET_3X_MASK 0x000001e0
 
-static const char *
-nv30_screen_get_name(struct pipe_screen *pscreen)
-{
-	struct nv30_screen *screen = nv30_screen(pscreen);
-	struct nouveau_device *dev = screen->nvws->channel->device;
-	static char buffer[128];
-
-	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
-	return buffer;
-}
-
-static const char *
-nv30_screen_get_vendor(struct pipe_screen *pscreen)
-{
-	return "nouveau";
-}
-
 static int
 nv30_screen_get_param(struct pipe_screen *pscreen, int param)
 {
@@ -139,45 +124,57 @@ static void
 nv30_screen_destroy(struct pipe_screen *pscreen)
 {
 	struct nv30_screen *screen = nv30_screen(pscreen);
-	struct nouveau_winsys *nvws = screen->nvws;
 
-	nvws->res_free(&screen->vp_exec_heap);
-	nvws->res_free(&screen->vp_data_heap);
-	nvws->res_free(&screen->query_heap);
-	nvws->notifier_free(&screen->query);
-	nvws->notifier_free(&screen->sync);
-	nvws->grobj_free(&screen->rankine);
+	nouveau_resource_free(&screen->vp_exec_heap);
+	nouveau_resource_free(&screen->vp_data_heap);
+	nouveau_resource_free(&screen->query_heap);
+	nouveau_notifier_free(&screen->query);
+	nouveau_notifier_free(&screen->sync);
+	nouveau_grobj_free(&screen->rankine);
 
 	FREE(pscreen);
 }
 
 struct pipe_screen *
-nv30_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
+nv30_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 {
 	struct nv30_screen *screen = CALLOC_STRUCT(nv30_screen);
+	struct nouveau_channel *chan;
+	struct pipe_screen *pscreen;
 	struct nouveau_stateobj *so;
 	unsigned rankine_class = 0;
-	unsigned chipset = nvws->channel->device->chipset;
 	int ret, i;
 
 	if (!screen)
 		return NULL;
-	screen->nvws = nvws;
+	pscreen = &screen->base.base;
 
-	/* 2D engine setup */
-	screen->eng2d = nv04_surface_2d_init(nvws);
-	screen->eng2d->buf = nv30_surface_buffer;
+	ret = nouveau_screen_init(&screen->base, dev);
+	if (ret) {
+		nv30_screen_destroy(pscreen);
+		return NULL;
+	}
+	chan = screen->base.channel;
+
+	pscreen->winsys = ws;
+	pscreen->destroy = nv30_screen_destroy;
+	pscreen->get_param = nv30_screen_get_param;
+	pscreen->get_paramf = nv30_screen_get_paramf;
+	pscreen->is_format_supported = nv30_screen_surface_format_supported;
+
+	nv30_screen_init_miptree_functions(pscreen);
+	nv30_screen_init_transfer_functions(pscreen);
 
 	/* 3D object */
-	switch (chipset & 0xf0) {
+	switch (dev->chipset & 0xf0) {
 	case 0x30:
-		if (NV30TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f)))
+		if (NV30TCL_CHIPSET_3X_MASK & (1 << (dev->chipset & 0x0f)))
 			rankine_class = 0x0397;
 		else
-		if (NV34TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f)))
+		if (NV34TCL_CHIPSET_3X_MASK & (1 << (dev->chipset & 0x0f)))
 			rankine_class = 0x0697;
 		else
-		if (NV35TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f)))
+		if (NV35TCL_CHIPSET_3X_MASK & (1 << (dev->chipset & 0x0f)))
 			rankine_class = 0x0497;
 		break;
 	default:
@@ -185,43 +182,49 @@ nv30_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
 	}
 
 	if (!rankine_class) {
-		NOUVEAU_ERR("Unknown nv3x chipset: nv%02x\n", chipset);
+		NOUVEAU_ERR("Unknown nv3x chipset: nv%02x\n", dev->chipset);
 		return NULL;
 	}
 
-	ret = nvws->grobj_alloc(nvws, rankine_class, &screen->rankine);
+	ret = nouveau_grobj_alloc(chan, 0xbeef3097, rankine_class,
+				  &screen->rankine);
 	if (ret) {
 		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
 		return FALSE;
 	}
+	BIND_RING(chan, screen->rankine, 7);
+
+	/* 2D engine setup */
+	screen->eng2d = nv04_surface_2d_init(&screen->base);
+	screen->eng2d->buf = nv30_surface_buffer;
 
 	/* Notifier for sync purposes */
-	ret = nvws->notifier_alloc(nvws, 1, &screen->sync);
+	ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync);
 	if (ret) {
 		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
-		nv30_screen_destroy(&screen->pipe);
+		nv30_screen_destroy(pscreen);
 		return NULL;
 	}
 
 	/* Query objects */
-	ret = nvws->notifier_alloc(nvws, 32, &screen->query);
+	ret = nouveau_notifier_alloc(chan, 0xbeef0302, 32, &screen->query);
 	if (ret) {
 		NOUVEAU_ERR("Error initialising query objects: %d\n", ret);
-		nv30_screen_destroy(&screen->pipe);
+		nv30_screen_destroy(pscreen);
 		return NULL;
 	}
 
-	ret = nvws->res_init(&screen->query_heap, 0, 32);
+	ret = nouveau_resource_init(&screen->query_heap, 0, 32);
 	if (ret) {
 		NOUVEAU_ERR("Error initialising query object heap: %d\n", ret);
-		nv30_screen_destroy(&screen->pipe);
+		nv30_screen_destroy(pscreen);
 		return NULL;
 	}
 
 	/* Vtxprog resources */
-	if (nvws->res_init(&screen->vp_exec_heap, 0, 256) ||
-	    nvws->res_init(&screen->vp_data_heap, 0, 256)) {
-		nv30_screen_destroy(&screen->pipe);
+	if (nouveau_resource_init(&screen->vp_exec_heap, 0, 256) ||
+	    nouveau_resource_init(&screen->vp_data_heap, 0, 256)) {
+		nv30_screen_destroy(pscreen);
 		return NULL;
 	}
 
@@ -230,23 +233,23 @@ nv30_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
 	so_method(so, screen->rankine, NV34TCL_DMA_NOTIFY, 1);
 	so_data  (so, screen->sync->handle);
 	so_method(so, screen->rankine, NV34TCL_DMA_TEXTURE0, 2);
-	so_data  (so, nvws->channel->vram->handle);
-	so_data  (so, nvws->channel->gart->handle);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->gart->handle);
 	so_method(so, screen->rankine, NV34TCL_DMA_COLOR1, 1);
-	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, chan->vram->handle);
 	so_method(so, screen->rankine, NV34TCL_DMA_COLOR0, 2);
-	so_data  (so, nvws->channel->vram->handle);
-	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->vram->handle);
 	so_method(so, screen->rankine, NV34TCL_DMA_VTXBUF0, 2);
-	so_data  (so, nvws->channel->vram->handle);
-	so_data  (so, nvws->channel->gart->handle);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->gart->handle);
 /*	so_method(so, screen->rankine, NV34TCL_DMA_FENCE, 2);
 	so_data  (so, 0);
 	so_data  (so, screen->query->handle);*/
 	so_method(so, screen->rankine, NV34TCL_DMA_IN_MEMORY7, 1);
-	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, chan->vram->handle);
 	so_method(so, screen->rankine, NV34TCL_DMA_IN_MEMORY8, 1);
-	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, chan->vram->handle);
 
 	for (i=1; i<8; i++) {
 		so_method(so, screen->rankine, NV34TCL_VIEWPORT_CLIP_HORIZ(i), 1);
@@ -301,23 +304,9 @@ nv30_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
 	so_method(so, screen->rankine, 0x1e94, 1);
 	so_data  (so, 0x13);
 
-	so_emit(nvws, so);
+	so_emit(chan, so);
 	so_ref(NULL, &so);
-	nvws->push_flush(nvws, 0, NULL);
-
-	screen->pipe.winsys = ws;
-	screen->pipe.destroy = nv30_screen_destroy;
-
-	screen->pipe.get_name = nv30_screen_get_name;
-	screen->pipe.get_vendor = nv30_screen_get_vendor;
-	screen->pipe.get_param = nv30_screen_get_param;
-	screen->pipe.get_paramf = nv30_screen_get_paramf;
-
-	screen->pipe.is_format_supported = nv30_screen_surface_format_supported;
-
-	nv30_screen_init_miptree_functions(&screen->pipe);
-	nv30_screen_init_transfer_functions(&screen->pipe);
-	u_simple_screen_init(&screen->pipe);
+	nouveau_pushbuf_flush(chan, 0);
 
-	return &screen->pipe;
+	return pscreen;
 }
diff --git a/src/gallium/drivers/nv30/nv30_screen.h b/src/gallium/drivers/nv30/nv30_screen.h
index 8e36883975..5fbd998b53 100644
--- a/src/gallium/drivers/nv30/nv30_screen.h
+++ b/src/gallium/drivers/nv30/nv30_screen.h
@@ -1,11 +1,12 @@
 #ifndef __NV30_SCREEN_H__
 #define __NV30_SCREEN_H__
 
-#include "pipe/p_screen.h"
+#include "nouveau/nouveau_screen.h"
+
 #include "nv04/nv04_surface_2d.h"
 
 struct nv30_screen {
-	struct pipe_screen pipe;
+	struct nouveau_screen base;
 
 	struct nouveau_winsys *nvws;
 
diff --git a/src/gallium/drivers/nv30/nv30_state_emit.c b/src/gallium/drivers/nv30/nv30_state_emit.c
index c18be20a32..621b8846c8 100644
--- a/src/gallium/drivers/nv30/nv30_state_emit.c
+++ b/src/gallium/drivers/nv30/nv30_state_emit.c
@@ -38,6 +38,7 @@ nv30_state_do_validate(struct nv30_context *nv30,
 void
 nv30_state_emit(struct nv30_context *nv30)
 {
+	struct nouveau_channel *chan = nv30->screen->base.channel;
 	struct nv30_state *state = &nv30->state;
 	struct nv30_screen *screen = nv30->screen;
 	unsigned i, samplers;
@@ -57,23 +58,23 @@ nv30_state_emit(struct nv30_context *nv30)
 			continue;
 		so_ref (state->hw[i], &nv30->screen->state[i]);
 		if (state->hw[i])
-			so_emit(nv30->nvws, nv30->screen->state[i]);
+			so_emit(chan, nv30->screen->state[i]);
 		states &= ~(1ULL << i);
 	}
 
 	state->dirty = 0;
 
-	so_emit_reloc_markers(nv30->nvws, state->hw[NV30_STATE_FB]);
+	so_emit_reloc_markers(chan, state->hw[NV30_STATE_FB]);
 	for (i = 0, samplers = state->fp_samplers; i < 16 && samplers; i++) {
 		if (!(samplers & (1 << i)))
 			continue;
-		so_emit_reloc_markers(nv30->nvws,
+		so_emit_reloc_markers(chan,
 				      state->hw[NV30_STATE_FRAGTEX0+i]);
 		samplers &= ~(1ULL << i);
 	}
-	so_emit_reloc_markers(nv30->nvws, state->hw[NV30_STATE_FRAGPROG]);
+	so_emit_reloc_markers(chan, state->hw[NV30_STATE_FRAGPROG]);
 	if (state->hw[NV30_STATE_VTXBUF] /*&& nv30->render_mode == HW*/)
-		so_emit_reloc_markers(nv30->nvws, state->hw[NV30_STATE_VTXBUF]);
+		so_emit_reloc_markers(chan, state->hw[NV30_STATE_VTXBUF]);
 }
 
 boolean
diff --git a/src/gallium/drivers/nv30/nv30_state_fb.c b/src/gallium/drivers/nv30/nv30_state_fb.c
index fdc1cade90..cb1a260eaa 100644
--- a/src/gallium/drivers/nv30/nv30_state_fb.c
+++ b/src/gallium/drivers/nv30/nv30_state_fb.c
@@ -5,6 +5,8 @@ static boolean
 nv30_state_framebuffer_validate(struct nv30_context *nv30)
 {
 	struct pipe_framebuffer_state *fb = &nv30->framebuffer;
+	struct nouveau_channel *chan = nv30->screen->base.channel;
+	struct nouveau_grobj *rankine = nv30->screen->rankine;
 	struct nv04_surface *rt[2], *zeta = NULL;
 	uint32_t rt_enable, rt_format;
 	int i, colour_format = 0, zeta_format = 0;
@@ -79,56 +81,53 @@ nv30_state_framebuffer_validate(struct nv30_context *nv30)
 		}
 
 		nv30mt = (struct nv30_miptree *)rt[0]->base.texture;
-		so_method(so, nv30->screen->rankine, NV34TCL_DMA_COLOR0, 1);
-		so_reloc (so, nv30mt->buffer, 0, rt_flags | NOUVEAU_BO_OR,
-			  nv30->nvws->channel->vram->handle,
-			  nv30->nvws->channel->gart->handle);
-		so_method(so, nv30->screen->rankine, NV34TCL_COLOR0_PITCH, 2);
+		so_method(so, rankine, NV34TCL_DMA_COLOR0, 1);
+		so_reloc (so, nouveau_bo(nv30mt->buffer), 0, rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		so_method(so, rankine, NV34TCL_COLOR0_PITCH, 2);
 		so_data  (so, pitch);
-		so_reloc (so, nv30mt->buffer, rt[0]->base.offset, rt_flags |
-			  NOUVEAU_BO_LOW, 0, 0);
+		so_reloc (so, nouveau_bo(nv30mt->buffer), rt[0]->base.offset,
+			      rt_flags | NOUVEAU_BO_LOW, 0, 0);
 	}
 
 	if (rt_enable & NV34TCL_RT_ENABLE_COLOR1) {
 		nv30mt = (struct nv30_miptree *)rt[1]->base.texture;
-		so_method(so, nv30->screen->rankine, NV34TCL_DMA_COLOR1, 1);
-		so_reloc (so, nv30mt->buffer, 0, rt_flags | NOUVEAU_BO_OR,
-			  nv30->nvws->channel->vram->handle,
-			  nv30->nvws->channel->gart->handle);
-		so_method(so, nv30->screen->rankine, NV34TCL_COLOR1_OFFSET, 2);
-		so_reloc (so, nv30mt->buffer, rt[1]->base.offset, rt_flags |
-			  NOUVEAU_BO_LOW, 0, 0);
+		so_method(so, rankine, NV34TCL_DMA_COLOR1, 1);
+		so_reloc (so, nouveau_bo(nv30mt->buffer), 0, rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		so_method(so, rankine, NV34TCL_COLOR1_OFFSET, 2);
+		so_reloc (so, nouveau_bo(nv30mt->buffer), rt[1]->base.offset,
+			      rt_flags | NOUVEAU_BO_LOW, 0, 0);
 		so_data  (so, rt[1]->pitch);
 	}
 
 	if (zeta_format) {
 		nv30mt = (struct nv30_miptree *)zeta->base.texture;
-		so_method(so, nv30->screen->rankine, NV34TCL_DMA_ZETA, 1);
-		so_reloc (so, nv30mt->buffer, 0, rt_flags | NOUVEAU_BO_OR,
-			  nv30->nvws->channel->vram->handle,
-			  nv30->nvws->channel->gart->handle);
-		so_method(so, nv30->screen->rankine, NV34TCL_ZETA_OFFSET, 1);
-		so_reloc (so, nv30mt->buffer, zeta->base.offset, rt_flags |
-			  NOUVEAU_BO_LOW, 0, 0);
+		so_method(so, rankine, NV34TCL_DMA_ZETA, 1);
+		so_reloc (so, nouveau_bo(nv30mt->buffer), 0, rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		so_method(so, rankine, NV34TCL_ZETA_OFFSET, 1);
+		so_reloc (so, nouveau_bo(nv30mt->buffer), zeta->base.offset,
+			      rt_flags | NOUVEAU_BO_LOW, 0, 0);
 		/* TODO: allocate LMA depth buffer */
 	}
 
-	so_method(so, nv30->screen->rankine, NV34TCL_RT_ENABLE, 1);
+	so_method(so, rankine, NV34TCL_RT_ENABLE, 1);
 	so_data  (so, rt_enable);
-	so_method(so, nv30->screen->rankine, NV34TCL_RT_HORIZ, 3);
+	so_method(so, rankine, NV34TCL_RT_HORIZ, 3);
 	so_data  (so, (w << 16) | 0);
 	so_data  (so, (h << 16) | 0);
 	so_data  (so, rt_format);
-	so_method(so, nv30->screen->rankine, NV34TCL_VIEWPORT_HORIZ, 2);
+	so_method(so, rankine, NV34TCL_VIEWPORT_HORIZ, 2);
 	so_data  (so, (w << 16) | 0);
 	so_data  (so, (h << 16) | 0);
-	so_method(so, nv30->screen->rankine, NV34TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	so_method(so, rankine, NV34TCL_VIEWPORT_CLIP_HORIZ(0), 2);
 	so_data  (so, ((w - 1) << 16) | 0);
 	so_data  (so, ((h - 1) << 16) | 0);
-	so_method(so, nv30->screen->rankine, 0x1d88, 1);
+	so_method(so, rankine, 0x1d88, 1);
 	so_data  (so, (1 << 12) | h);
 	/* Wonder why this is needed, context should all be set to zero on init */
-	so_method(so, nv30->screen->rankine, NV34TCL_VIEWPORT_TX_ORIGIN, 1);
+	so_method(so, rankine, NV34TCL_VIEWPORT_TX_ORIGIN, 1);
 	so_data  (so, 0);
 
 	so_ref(so, &nv30->state.hw[NV30_STATE_FB]);
diff --git a/src/gallium/drivers/nv30/nv30_vbo.c b/src/gallium/drivers/nv30/nv30_vbo.c
index 990a876382..189656ec81 100644
--- a/src/gallium/drivers/nv30/nv30_vbo.c
+++ b/src/gallium/drivers/nv30/nv30_vbo.c
@@ -1,5 +1,6 @@
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
+#include "pipe/p_inlines.h"
 
 #include "nv30_context.h"
 #include "nv30_state.h"
@@ -70,7 +71,7 @@ static boolean
 nv30_vbo_set_idxbuf(struct nv30_context *nv30, struct pipe_buffer *ib,
 		    unsigned ib_size)
 {
-	struct pipe_screen *pscreen = &nv30->screen->pipe;
+	struct pipe_screen *pscreen = &nv30->screen->base.base;
 	unsigned type;
 
 	if (!ib) {
@@ -108,7 +109,7 @@ nv30_vbo_static_attrib(struct nv30_context *nv30, struct nouveau_stateobj *so,
 		       int attrib, struct pipe_vertex_element *ve,
 		       struct pipe_vertex_buffer *vb)
 {
-	struct pipe_winsys *ws = nv30->pipe.winsys;
+	struct pipe_screen *pscreen = nv30->pipe.screen;
 	struct nouveau_grobj *rankine = nv30->screen->rankine;
 	unsigned type, ncomp;
 	void *map;
@@ -116,7 +117,7 @@ nv30_vbo_static_attrib(struct nv30_context *nv30, struct nouveau_stateobj *so,
 	if (nv30_vbo_format_to_hw(ve->src_format, &type, &ncomp))
 		return FALSE;
 
-	map  = ws->buffer_map(ws, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ);
+	map  = pipe_buffer_map(pscreen, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ);
 	map += vb->buffer_offset + ve->src_offset;
 
 	switch (type) {
@@ -148,18 +149,17 @@ nv30_vbo_static_attrib(struct nv30_context *nv30, struct nouveau_stateobj *so,
 			so_data  (so, fui(v[0]));
 			break;
 		default:
-			ws->buffer_unmap(ws, vb->buffer);
+			pipe_buffer_unmap(pscreen, vb->buffer);
 			return FALSE;
 		}
 	}
 		break;
 	default:
-		ws->buffer_unmap(ws, vb->buffer);
+		pipe_buffer_unmap(pscreen, vb->buffer);
 		return FALSE;
 	}
 
-	ws->buffer_unmap(ws, vb->buffer);
-
+	pipe_buffer_unmap(pscreen, vb->buffer);
 	return TRUE;
 }
 
@@ -168,7 +168,7 @@ nv30_draw_arrays(struct pipe_context *pipe,
 		 unsigned mode, unsigned start, unsigned count)
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
-	struct nouveau_channel *chan = nv30->nvws->channel;
+	struct nouveau_channel *chan = nv30->screen->base.channel;
 	unsigned restart = 0;
 
 	nv30_vbo_set_idxbuf(nv30, NULL, 0);
@@ -228,7 +228,7 @@ static INLINE void
 nv30_draw_elements_u08(struct nv30_context *nv30, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv30->nvws->channel;
+	struct nouveau_channel *chan = nv30->screen->base.channel;
 
 	while (count) {
 		uint8_t *elts = (uint8_t *)ib + start;
@@ -277,7 +277,7 @@ static INLINE void
 nv30_draw_elements_u16(struct nv30_context *nv30, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv30->nvws->channel;
+	struct nouveau_channel *chan = nv30->screen->base.channel;
 
 	while (count) {
 		uint16_t *elts = (uint16_t *)ib + start;
@@ -326,7 +326,7 @@ static INLINE void
 nv30_draw_elements_u32(struct nv30_context *nv30, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv30->nvws->channel;
+	struct nouveau_channel *chan = nv30->screen->base.channel;
 
 	while (count) {
 		uint32_t *elts = (uint32_t *)ib + start;
@@ -368,10 +368,10 @@ nv30_draw_elements_inline(struct pipe_context *pipe,
 			  unsigned mode, unsigned start, unsigned count)
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
-	struct pipe_winsys *ws = pipe->winsys;
+	struct pipe_screen *pscreen = pipe->screen;
 	void *map;
 
-	map = ws->buffer_map(ws, ib, PIPE_BUFFER_USAGE_CPU_READ);
+	map = pipe_buffer_map(pscreen, ib, PIPE_BUFFER_USAGE_CPU_READ);
 	if (!ib) {
 		NOUVEAU_ERR("failed mapping ib\n");
 		return FALSE;
@@ -392,7 +392,7 @@ nv30_draw_elements_inline(struct pipe_context *pipe,
 		break;
 	}
 
-	ws->buffer_unmap(ws, ib);
+	pipe_buffer_unmap(pscreen, ib);
 	return TRUE;
 }
 
@@ -401,7 +401,7 @@ nv30_draw_elements_vbo(struct pipe_context *pipe,
 		       unsigned mode, unsigned start, unsigned count)
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
-	struct nouveau_channel *chan = nv30->nvws->channel;
+	struct nouveau_channel *chan = nv30->screen->base.channel;
 	unsigned restart = 0;
 
 	while (count) {
@@ -521,18 +521,20 @@ nv30_vbo_validate(struct nv30_context *nv30)
 			return FALSE;
 		}
 
-		so_reloc(vtxbuf, vb->buffer, vb->buffer_offset + ve->src_offset,
-			 vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
-			 0, NV34TCL_VTXBUF_ADDRESS_DMA1);
+		so_reloc(vtxbuf, nouveau_bo(vb->buffer), vb->buffer_offset +
+				 ve->src_offset, vb_flags | NOUVEAU_BO_LOW |
+				 NOUVEAU_BO_OR, 0, NV34TCL_VTXBUF_ADDRESS_DMA1);
 		so_data (vtxfmt, ((vb->stride << NV34TCL_VTXFMT_STRIDE_SHIFT) |
 				  (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type));
 	}
 
 	if (ib) {
+		struct nouveau_bo *bo = nouveau_bo(ib);
+
 		so_method(vtxbuf, rankine, NV34TCL_IDXBUF_ADDRESS, 2);
-		so_reloc (vtxbuf, ib, 0, vb_flags | NOUVEAU_BO_LOW, 0, 0);
-		so_reloc (vtxbuf, ib, ib_format, vb_flags | NOUVEAU_BO_OR,
-			  0, NV34TCL_IDXBUF_FORMAT_DMA1);
+		so_reloc (vtxbuf, bo, 0, vb_flags | NOUVEAU_BO_LOW, 0, 0);
+		so_reloc (vtxbuf, bo, ib_format, vb_flags | NOUVEAU_BO_OR,
+				  0, NV34TCL_IDXBUF_FORMAT_DMA1);
 	}
 
 	so_method(vtxbuf, rankine, 0x1710, 1);
diff --git a/src/gallium/drivers/nv30/nv30_vertprog.c b/src/gallium/drivers/nv30/nv30_vertprog.c
index eaf543b8f7..c7514efcfe 100644
--- a/src/gallium/drivers/nv30/nv30_vertprog.c
+++ b/src/gallium/drivers/nv30/nv30_vertprog.c
@@ -1,6 +1,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
+#include "pipe/p_inlines.h"
 
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
@@ -645,8 +646,7 @@ out_err:
 static boolean
 nv30_vertprog_validate(struct nv30_context *nv30)
 { 
-	struct nouveau_winsys *nvws = nv30->nvws;
-	struct pipe_winsys *ws = nv30->pipe.winsys;
+	struct pipe_screen *pscreen = nv30->pipe.screen;
 	struct nouveau_grobj *rankine = nv30->screen->rankine;
 	struct nv30_vertex_program *vp;
 	struct pipe_buffer *constbuf;
@@ -669,15 +669,15 @@ nv30_vertprog_validate(struct nv30_context *nv30)
 		struct nouveau_stateobj *so;
 		uint vplen = vp->nr_insns;
 
-		if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) {
+		if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec)) {
 			while (heap->next && heap->size < vplen) {
 				struct nv30_vertex_program *evict;
 				
 				evict = heap->next->priv;
-				nvws->res_free(&evict->exec);
+				nouveau_resource_free(&evict->exec);
 			}
 
-			if (nvws->res_alloc(heap, vplen, vp, &vp->exec))
+			if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec))
 				assert(0);
 		}
 
@@ -694,15 +694,16 @@ nv30_vertprog_validate(struct nv30_context *nv30)
 	if (vp->nr_consts && !vp->data) {
 		struct nouveau_resource *heap = nv30->screen->vp_data_heap;
 
-		if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) {
+		if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data)) {
 			while (heap->next && heap->size < vp->nr_consts) {
 				struct nv30_vertex_program *evict;
 				
 				evict = heap->next->priv;
-				nvws->res_free(&evict->data);
+				nouveau_resource_free(&evict->data);
 			}
 
-			if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data))
+			if (nouveau_resource_alloc(heap, vp->nr_consts, vp,
+						   &vp->data))
 				assert(0);
 		}
 
@@ -750,8 +751,8 @@ nv30_vertprog_validate(struct nv30_context *nv30)
 		float *map = NULL;
 
 		if (constbuf) {
-			map = ws->buffer_map(ws, constbuf,
-					     PIPE_BUFFER_USAGE_CPU_READ);
+			map = pipe_buffer_map(pscreen, constbuf,
+					      PIPE_BUFFER_USAGE_CPU_READ);
 		}
 
 		for (i = 0; i < vp->nr_consts; i++) {
@@ -771,9 +772,8 @@ nv30_vertprog_validate(struct nv30_context *nv30)
 			OUT_RINGp ((uint32_t *)vpd->value, 4);
 		}
 
-		if (constbuf) {
-			ws->buffer_unmap(ws, constbuf);
-		}
+		if (constbuf)
+			pipe_buffer_unmap(pscreen, constbuf);
 	}
 
 	/* Upload vtxprog */
@@ -804,8 +804,6 @@ nv30_vertprog_validate(struct nv30_context *nv30)
 void
 nv30_vertprog_destroy(struct nv30_context *nv30, struct nv30_vertex_program *vp)
 {
-	struct nouveau_winsys *nvws = nv30->screen->nvws;
-
 	vp->translated = FALSE;
 
 	if (vp->nr_insns) {
@@ -820,9 +818,9 @@ nv30_vertprog_destroy(struct nv30_context *nv30, struct nv30_vertex_program *vp)
 		vp->nr_consts = 0;
 	}
 
-	nvws->res_free(&vp->exec);
+	nouveau_resource_free(&vp->exec);
 	vp->exec_start = 0;
-	nvws->res_free(&vp->data);
+	nouveau_resource_free(&vp->data);
 	vp->data_start = 0;
 	vp->data_start_min = 0;
 
diff --git a/src/gallium/drivers/nv40/nv40_draw.c b/src/gallium/drivers/nv40/nv40_draw.c
index c83ff91d7e..b2f19ecb69 100644
--- a/src/gallium/drivers/nv40/nv40_draw.c
+++ b/src/gallium/drivers/nv40/nv40_draw.c
@@ -1,4 +1,5 @@
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_inlines.h"
 
 #include "util/u_pack_color.h"
 
@@ -81,7 +82,7 @@ nv40_render_prim(struct draw_stage *stage, struct prim_header *prim,
 {
 	struct nv40_render_stage *rs = nv40_render_stage(stage);
 	struct nv40_context *nv40 = rs->nv40;
-	struct nouveau_pushbuf *pb = nv40->nvws->channel->pushbuf;
+	struct nouveau_pushbuf *pb = nv40->screen->base.channel->pushbuf;
 	unsigned i;
 
 	/* Ensure there's room for 4xfloat32 + potentially 3 begin/end */
@@ -231,7 +232,7 @@ nv40_draw_elements_swtnl(struct pipe_context *pipe,
 			 unsigned mode, unsigned start, unsigned count)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
-	struct pipe_winsys *ws = pipe->winsys;
+	struct pipe_screen *pscreen = pipe->screen;
 	unsigned i;
 	void *map;
 
@@ -241,13 +242,14 @@ nv40_draw_elements_swtnl(struct pipe_context *pipe,
 	nv40_state_emit(nv40);
 
 	for (i = 0; i < nv40->vtxbuf_nr; i++) {
-		map = ws->buffer_map(ws, nv40->vtxbuf[i].buffer,
+		map = pipe_buffer_map(pscreen, nv40->vtxbuf[i].buffer,
                                       PIPE_BUFFER_USAGE_CPU_READ);
 		draw_set_mapped_vertex_buffer(nv40->draw, i, map);
 	}
 
 	if (idxbuf) {
-		map = ws->buffer_map(ws, idxbuf, PIPE_BUFFER_USAGE_CPU_READ);
+		map = pipe_buffer_map(pscreen, idxbuf,
+				      PIPE_BUFFER_USAGE_CPU_READ);
 		draw_set_mapped_element_buffer(nv40->draw, idxbuf_size, map);
 	} else {
 		draw_set_mapped_element_buffer(nv40->draw, 0, NULL);
@@ -256,21 +258,22 @@ nv40_draw_elements_swtnl(struct pipe_context *pipe,
 	if (nv40->constbuf[PIPE_SHADER_VERTEX]) {
 		const unsigned nr = nv40->constbuf_nr[PIPE_SHADER_VERTEX];
 
-		map = ws->buffer_map(ws, nv40->constbuf[PIPE_SHADER_VERTEX],
-				     PIPE_BUFFER_USAGE_CPU_READ);
+		map = pipe_buffer_map(pscreen,
+				      nv40->constbuf[PIPE_SHADER_VERTEX],
+				      PIPE_BUFFER_USAGE_CPU_READ);
 		draw_set_mapped_constant_buffer(nv40->draw, map, nr);
 	}
 
 	draw_arrays(nv40->draw, mode, start, count);
 
 	for (i = 0; i < nv40->vtxbuf_nr; i++)
-		ws->buffer_unmap(ws, nv40->vtxbuf[i].buffer);
+		pipe_buffer_unmap(pscreen, nv40->vtxbuf[i].buffer);
 
 	if (idxbuf)
-		ws->buffer_unmap(ws, idxbuf);
+		pipe_buffer_unmap(pscreen, idxbuf);
 
 	if (nv40->constbuf[PIPE_SHADER_VERTEX])
-		ws->buffer_unmap(ws, nv40->constbuf[PIPE_SHADER_VERTEX]);
+		pipe_buffer_unmap(pscreen, nv40->constbuf[PIPE_SHADER_VERTEX]);
 
 	draw_flush(nv40->draw);
 	pipe->flush(pipe, 0, NULL);
diff --git a/src/gallium/drivers/nv40/nv40_fragprog.c b/src/gallium/drivers/nv40/nv40_fragprog.c
index 16e40889ec..680976da56 100644
--- a/src/gallium/drivers/nv40/nv40_fragprog.c
+++ b/src/gallium/drivers/nv40/nv40_fragprog.c
@@ -1,6 +1,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
+#include "pipe/p_inlines.h"
 
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
@@ -881,12 +882,12 @@ static void
 nv40_fragprog_upload(struct nv40_context *nv40,
 		     struct nv40_fragment_program *fp)
 {
-	struct pipe_winsys *ws = nv40->pipe.winsys;
+	struct pipe_screen *pscreen = nv40->pipe.screen;
 	const uint32_t le = 1;
 	uint32_t *map;
 	int i;
 
-	map = ws->buffer_map(ws, fp->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
+	map = pipe_buffer_map(pscreen, fp->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
 
 #if 0
 	for (i = 0; i < fp->insn_len; i++) {
@@ -908,7 +909,7 @@ nv40_fragprog_upload(struct nv40_context *nv40,
 		}
 	}
 
-	ws->buffer_unmap(ws, fp->buffer);
+	pipe_buffer_unmap(pscreen, fp->buffer);
 }
 
 static boolean
@@ -917,8 +918,7 @@ nv40_fragprog_validate(struct nv40_context *nv40)
 	struct nv40_fragment_program *fp = nv40->fragprog;
 	struct pipe_buffer *constbuf =
 		nv40->constbuf[PIPE_SHADER_FRAGMENT];
-	struct pipe_screen *screen = nv40->pipe.screen;
-	struct pipe_winsys *ws = nv40->pipe.winsys;
+	struct pipe_screen *pscreen = nv40->pipe.screen;
 	struct nouveau_stateobj *so;
 	boolean new_consts = FALSE;
 	int i;
@@ -933,14 +933,15 @@ nv40_fragprog_validate(struct nv40_context *nv40)
 		return FALSE;
 	}
 
-	fp->buffer = screen->buffer_create(screen, 0x100, 0, fp->insn_len * 4);
+	fp->buffer = pscreen->buffer_create(pscreen, 0x100, 0, fp->insn_len * 4);
 	nv40_fragprog_upload(nv40, fp);
 
 	so = so_new(4, 1);
 	so_method(so, nv40->screen->curie, NV40TCL_FP_ADDRESS, 1);
-	so_reloc (so, fp->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
-		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
-		  NV40TCL_FP_ADDRESS_DMA0, NV40TCL_FP_ADDRESS_DMA1);
+	so_reloc (so, nouveau_bo(fp->buffer), 0, NOUVEAU_BO_VRAM |
+		      NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
+		      NOUVEAU_BO_OR, NV40TCL_FP_ADDRESS_DMA0,
+		      NV40TCL_FP_ADDRESS_DMA1);
 	so_method(so, nv40->screen->curie, NV40TCL_FP_CONTROL, 1);
 	so_data  (so, fp->fp_control);
 	so_ref(so, &fp->so);
@@ -950,7 +951,8 @@ update_constants:
 	if (fp->nr_consts) {
 		float *map;
 		
-		map = ws->buffer_map(ws, constbuf, PIPE_BUFFER_USAGE_CPU_READ);
+		map = pipe_buffer_map(pscreen, constbuf,
+				      PIPE_BUFFER_USAGE_CPU_READ);
 		for (i = 0; i < fp->nr_consts; i++) {
 			struct nv40_fragment_program_data *fpd = &fp->consts[i];
 			uint32_t *p = &fp->insn[fpd->offset];
@@ -961,7 +963,7 @@ update_constants:
 			memcpy(p, cb, 4 * sizeof(float));
 			new_consts = TRUE;
 		}
-		ws->buffer_unmap(ws, constbuf);
+		pipe_buffer_unmap(pscreen, constbuf);
 
 		if (new_consts)
 			nv40_fragprog_upload(nv40, fp);
diff --git a/src/gallium/drivers/nv40/nv40_fragtex.c b/src/gallium/drivers/nv40/nv40_fragtex.c
index eb3002dc05..f6cdf31dfe 100644
--- a/src/gallium/drivers/nv40/nv40_fragtex.c
+++ b/src/gallium/drivers/nv40/nv40_fragtex.c
@@ -62,6 +62,7 @@ nv40_fragtex_build(struct nv40_context *nv40, int unit)
 {
 	struct nv40_sampler_state *ps = nv40->tex_sampler[unit];
 	struct nv40_miptree *nv40mt = nv40->tex_miptree[unit];
+	struct nouveau_bo *bo = nouveau_bo(nv40mt->buffer);
 	struct pipe_texture *pt = &nv40mt->base;
 	struct nv40_texture_format *tf;
 	struct nouveau_stateobj *so;
@@ -108,9 +109,9 @@ nv40_fragtex_build(struct nv40_context *nv40, int unit)
 
 	so = so_new(16, 2);
 	so_method(so, nv40->screen->curie, NV40TCL_TEX_OFFSET(unit), 8);
-	so_reloc (so, nv40mt->buffer, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
-	so_reloc (so, nv40mt->buffer, txf, tex_flags | NOUVEAU_BO_OR,
-		  NV40TCL_TEX_FORMAT_DMA0, NV40TCL_TEX_FORMAT_DMA1);
+	so_reloc (so, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
+	so_reloc (so, bo, txf, tex_flags | NOUVEAU_BO_OR,
+		      NV40TCL_TEX_FORMAT_DMA0, NV40TCL_TEX_FORMAT_DMA1);
 	so_data  (so, ps->wrap);
 	so_data  (so, NV40TCL_TEX_ENABLE_ENABLE | ps->en);
 	so_data  (so, txs);
diff --git a/src/gallium/drivers/nv40/nv40_query.c b/src/gallium/drivers/nv40/nv40_query.c
index 9b9a43f49d..7874aedd42 100644
--- a/src/gallium/drivers/nv40/nv40_query.c
+++ b/src/gallium/drivers/nv40/nv40_query.c
@@ -29,11 +29,10 @@ nv40_query_create(struct pipe_context *pipe, unsigned query_type)
 static void
 nv40_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
 {
-	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nv40_query *q = nv40_query(pq);
 
 	if (q->object)
-		nv40->nvws->res_free(&q->object);
+		nouveau_resource_free(&q->object);
 	FREE(q);
 }
 
@@ -54,9 +53,9 @@ nv40_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 		pipe->get_query_result(pipe, pq, 1, &tmp);
 	}
 
-	if (nv40->nvws->res_alloc(nv40->screen->query_heap, 1, NULL, &q->object))
+	if (nouveau_resource_alloc(nv40->screen->query_heap, 1, NULL, &q->object))
 		assert(0);
-	nv40->nvws->notifier_reset(nv40->screen->query, q->object->start);
+	nouveau_notifier_reset(nv40->screen->query, q->object->start);
 
 	BEGIN_RING(curie, NV40TCL_QUERY_RESET, 1);
 	OUT_RING  (1);
@@ -84,27 +83,27 @@ nv40_query_result(struct pipe_context *pipe, struct pipe_query *pq,
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nv40_query *q = nv40_query(pq);
-	struct nouveau_winsys *nvws = nv40->nvws;
 
 	assert(q->object && q->type == PIPE_QUERY_OCCLUSION_COUNTER);
 
 	if (!q->ready) {
 		unsigned status;
 
-		status = nvws->notifier_status(nv40->screen->query,
-					       q->object->start);
+		status = nouveau_notifier_status(nv40->screen->query,
+						 q->object->start);
 		if (status != NV_NOTIFY_STATE_STATUS_COMPLETED) {
 			if (wait == FALSE)
 				return FALSE;
-			nvws->notifier_wait(nv40->screen->query, q->object->start,
-					    NV_NOTIFY_STATE_STATUS_COMPLETED,
-					    0);
+			nouveau_notifier_wait_status(nv40->screen->query,
+					      q->object->start,
+					      NV_NOTIFY_STATE_STATUS_COMPLETED,
+					      0);
 		}
 
-		q->result = nvws->notifier_retval(nv40->screen->query,
-						  q->object->start);
+		q->result = nouveau_notifier_return_val(nv40->screen->query,
+							q->object->start);
 		q->ready = TRUE;
-		nvws->res_free(&q->object);
+		nouveau_resource_free(&q->object);
 	}
 
 	*result = q->result;
diff --git a/src/gallium/drivers/nv40/nv40_screen.c b/src/gallium/drivers/nv40/nv40_screen.c
index 0d4baefaea..b8b2af482a 100644
--- a/src/gallium/drivers/nv40/nv40_screen.c
+++ b/src/gallium/drivers/nv40/nv40_screen.c
@@ -1,5 +1,4 @@
 #include "pipe/p_screen.h"
-#include "util/u_simple_screen.h"
 
 #include "nv40_context.h"
 #include "nv40_screen.h"
@@ -8,23 +7,6 @@
 #define NV4X_GRCLASS4497_CHIPSETS 0x00005450
 #define NV6X_GRCLASS4497_CHIPSETS 0x00000088
 
-static const char *
-nv40_screen_get_name(struct pipe_screen *pscreen)
-{
-	struct nv40_screen *screen = nv40_screen(pscreen);
-	struct nouveau_device *dev = screen->nvws->channel->device;
-	static char buffer[128];
-
-	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
-	return buffer;
-}
-
-static const char *
-nv40_screen_get_vendor(struct pipe_screen *pscreen)
-{
-	return "nouveau";
-}
-
 static int
 nv40_screen_get_param(struct pipe_screen *pscreen, int param)
 {
@@ -148,88 +130,107 @@ static void
 nv40_screen_destroy(struct pipe_screen *pscreen)
 {
 	struct nv40_screen *screen = nv40_screen(pscreen);
-	struct nouveau_winsys *nvws = screen->nvws;
 
-	nvws->res_free(&screen->vp_exec_heap);
-	nvws->res_free(&screen->vp_data_heap);
-	nvws->res_free(&screen->query_heap);
-	nvws->notifier_free(&screen->query);
-	nvws->notifier_free(&screen->sync);
-	nvws->grobj_free(&screen->curie);
+	nouveau_resource_free(&screen->vp_exec_heap);
+	nouveau_resource_free(&screen->vp_data_heap);
+	nouveau_resource_free(&screen->query_heap);
+	nouveau_notifier_free(&screen->query);
+	nouveau_notifier_free(&screen->sync);
+	nouveau_grobj_free(&screen->curie);
+
+	nouveau_screen_fini(&screen->base);
 
 	FREE(pscreen);
 }
 
 struct pipe_screen *
-nv40_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
+nv40_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 {
 	struct nv40_screen *screen = CALLOC_STRUCT(nv40_screen);
+	struct nouveau_channel *chan;
+	struct pipe_screen *pscreen;
 	struct nouveau_stateobj *so;
 	unsigned curie_class = 0;
-	unsigned chipset = nvws->channel->device->chipset;
 	int ret;
 
 	if (!screen)
 		return NULL;
-	screen->nvws = nvws;
+	pscreen = &screen->base.base;
 
-	/* 2D engine setup */
-	screen->eng2d = nv04_surface_2d_init(nvws);
-	screen->eng2d->buf = nv40_surface_buffer;
+	ret = nouveau_screen_init(&screen->base, dev);
+	if (ret) {
+		nv40_screen_destroy(pscreen);
+		return NULL;
+	}
+	chan = screen->base.channel;
+
+	pscreen->winsys = ws;
+	pscreen->destroy = nv40_screen_destroy;
+	pscreen->get_param = nv40_screen_get_param;
+	pscreen->get_paramf = nv40_screen_get_paramf;
+	pscreen->is_format_supported = nv40_screen_surface_format_supported;
+
+	nv40_screen_init_miptree_functions(pscreen);
+	nv40_screen_init_transfer_functions(pscreen);
 
 	/* 3D object */
-	switch (chipset & 0xf0) {
+	switch (dev->chipset & 0xf0) {
 	case 0x40:
-		if (NV4X_GRCLASS4097_CHIPSETS & (1 << (chipset & 0x0f)))
+		if (NV4X_GRCLASS4097_CHIPSETS & (1 << (dev->chipset & 0x0f)))
 			curie_class = NV40TCL;
 		else
-		if (NV4X_GRCLASS4497_CHIPSETS & (1 << (chipset & 0x0f)))
+		if (NV4X_GRCLASS4497_CHIPSETS & (1 << (dev->chipset & 0x0f)))
 			curie_class = NV44TCL;
 		break;
 	case 0x60:
-		if (NV6X_GRCLASS4497_CHIPSETS & (1 << (chipset & 0x0f)))
+		if (NV6X_GRCLASS4497_CHIPSETS & (1 << (dev->chipset & 0x0f)))
 			curie_class = NV44TCL;
 		break;
 	}
 
 	if (!curie_class) {
-		NOUVEAU_ERR("Unknown nv4x chipset: nv%02x\n", chipset);
+		NOUVEAU_ERR("Unknown nv4x chipset: nv%02x\n", dev->chipset);
 		return NULL;
 	}
 
-	ret = nvws->grobj_alloc(nvws, curie_class, &screen->curie);
+	ret = nouveau_grobj_alloc(chan, 0xbeef3097, curie_class, &screen->curie);
 	if (ret) {
 		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
 		return FALSE;
 	}
+	BIND_RING(chan, screen->curie, 7);
+
+	/* 2D engine setup */
+	screen->eng2d = nv04_surface_2d_init(&screen->base);
+	screen->eng2d->buf = nv40_surface_buffer;
 
 	/* Notifier for sync purposes */
-	ret = nvws->notifier_alloc(nvws, 1, &screen->sync);
+	ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync);
 	if (ret) {
 		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
-		nv40_screen_destroy(&screen->pipe);
+		nv40_screen_destroy(pscreen);
 		return NULL;
 	}
 
 	/* Query objects */
-	ret = nvws->notifier_alloc(nvws, 32, &screen->query);
+	ret = nouveau_notifier_alloc(chan, 0xbeef0302, 32, &screen->query);
 	if (ret) {
 		NOUVEAU_ERR("Error initialising query objects: %d\n", ret);
-		nv40_screen_destroy(&screen->pipe);
+		nv40_screen_destroy(pscreen);
 		return NULL;
 	}
 
-	ret = nvws->res_init(&screen->query_heap, 0, 32);
+	nouveau_resource_init(&screen->query_heap, 0, 32);
 	if (ret) {
 		NOUVEAU_ERR("Error initialising query object heap: %d\n", ret);
-		nv40_screen_destroy(&screen->pipe);
+		nv40_screen_destroy(pscreen);
 		return NULL;
 	}
 
 	/* Vtxprog resources */
-	if (nvws->res_init(&screen->vp_exec_heap, 0, 512) ||
-	    nvws->res_init(&screen->vp_data_heap, 0, 256)) {
-		nv40_screen_destroy(&screen->pipe);
+	if (nouveau_resource_init(&screen->vp_exec_heap, 0, 512) ||
+	    nouveau_resource_init(&screen->vp_data_heap, 0, 256)) {
+		nv40_screen_destroy(pscreen);
 		return NULL;
 	}
 
@@ -238,25 +239,25 @@ nv40_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
 	so_method(so, screen->curie, NV40TCL_DMA_NOTIFY, 1);
 	so_data  (so, screen->sync->handle);
 	so_method(so, screen->curie, NV40TCL_DMA_TEXTURE0, 2);
-	so_data  (so, nvws->channel->vram->handle);
-	so_data  (so, nvws->channel->gart->handle);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->gart->handle);
 	so_method(so, screen->curie, NV40TCL_DMA_COLOR1, 1);
-	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, chan->vram->handle);
 	so_method(so, screen->curie, NV40TCL_DMA_COLOR0, 2);
-	so_data  (so, nvws->channel->vram->handle);
-	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->vram->handle);
 	so_method(so, screen->curie, NV40TCL_DMA_VTXBUF0, 2);
-	so_data  (so, nvws->channel->vram->handle);
-	so_data  (so, nvws->channel->gart->handle);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->gart->handle);
 	so_method(so, screen->curie, NV40TCL_DMA_FENCE, 2);
 	so_data  (so, 0);
 	so_data  (so, screen->query->handle);
 	so_method(so, screen->curie, NV40TCL_DMA_UNK01AC, 2);
-	so_data  (so, nvws->channel->vram->handle);
-	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->vram->handle);
 	so_method(so, screen->curie, NV40TCL_DMA_COLOR2, 2);
-	so_data  (so, nvws->channel->vram->handle);
-	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->vram->handle);
 
 	so_method(so, screen->curie, 0x1ea4, 3);
 	so_data  (so, 0x00000010);
@@ -281,24 +282,10 @@ nv40_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
 	so_method(so, screen->curie, 0x1e94, 1);
 	so_data  (so, 0x00000001);
 
-	so_emit(nvws, so);
+	so_emit(chan, so);
 	so_ref(NULL, &so);
-	nvws->push_flush(nvws, 0, NULL);
-
-	screen->pipe.winsys = ws;
-	screen->pipe.destroy = nv40_screen_destroy;
-
-	screen->pipe.get_name = nv40_screen_get_name;
-	screen->pipe.get_vendor = nv40_screen_get_vendor;
-	screen->pipe.get_param = nv40_screen_get_param;
-	screen->pipe.get_paramf = nv40_screen_get_paramf;
-
-	screen->pipe.is_format_supported = nv40_screen_surface_format_supported;
-
-	nv40_screen_init_miptree_functions(&screen->pipe);
-	nv40_screen_init_transfer_functions(&screen->pipe);
-	u_simple_screen_init(&screen->pipe);
+	nouveau_pushbuf_flush(chan, 0);
 
-	return &screen->pipe;
+	return pscreen;
 }
 
diff --git a/src/gallium/drivers/nv40/nv40_screen.h b/src/gallium/drivers/nv40/nv40_screen.h
index 7b503bd207..57b4c8fc46 100644
--- a/src/gallium/drivers/nv40/nv40_screen.h
+++ b/src/gallium/drivers/nv40/nv40_screen.h
@@ -1,11 +1,11 @@
 #ifndef __NV40_SCREEN_H__
 #define __NV40_SCREEN_H__
 
-#include "pipe/p_screen.h"
+#include "nouveau/nouveau_screen.h"
 #include "nv04/nv04_surface_2d.h"
 
 struct nv40_screen {
-	struct pipe_screen pipe;
+	struct nouveau_screen base;
 
 	struct nouveau_winsys *nvws;
 
diff --git a/src/gallium/drivers/nv40/nv40_state_emit.c b/src/gallium/drivers/nv40/nv40_state_emit.c
index 10aae29832..198692965d 100644
--- a/src/gallium/drivers/nv40/nv40_state_emit.c
+++ b/src/gallium/drivers/nv40/nv40_state_emit.c
@@ -54,6 +54,7 @@ nv40_state_do_validate(struct nv40_context *nv40,
 void
 nv40_state_emit(struct nv40_context *nv40)
 {
+	struct nouveau_channel *chan = nv40->screen->base.channel;
 	struct nv40_state *state = &nv40->state;
 	struct nv40_screen *screen = nv40->screen;
 	unsigned i, samplers;
@@ -73,7 +74,7 @@ nv40_state_emit(struct nv40_context *nv40)
 			continue;
 		so_ref (state->hw[i], &nv40->screen->state[i]);
 		if (state->hw[i])
-			so_emit(nv40->nvws, nv40->screen->state[i]);
+			so_emit(chan, nv40->screen->state[i]);
 		states &= ~(1ULL << i);
 	}
 
@@ -87,17 +88,17 @@ nv40_state_emit(struct nv40_context *nv40)
 
 	state->dirty = 0;
 
-	so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_FB]);
+	so_emit_reloc_markers(chan, state->hw[NV40_STATE_FB]);
 	for (i = 0, samplers = state->fp_samplers; i < 16 && samplers; i++) {
 		if (!(samplers & (1 << i)))
 			continue;
-		so_emit_reloc_markers(nv40->nvws,
+		so_emit_reloc_markers(chan,
 				      state->hw[NV40_STATE_FRAGTEX0+i]);
 		samplers &= ~(1ULL << i);
 	}
-	so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_FRAGPROG]);
+	so_emit_reloc_markers(chan, state->hw[NV40_STATE_FRAGPROG]);
 	if (state->hw[NV40_STATE_VTXBUF] && nv40->render_mode == HW)
-		so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_VTXBUF]);
+		so_emit_reloc_markers(chan, state->hw[NV40_STATE_VTXBUF]);
 }
 
 boolean
diff --git a/src/gallium/drivers/nv40/nv40_state_fb.c b/src/gallium/drivers/nv40/nv40_state_fb.c
index be618a306b..273142f9e0 100644
--- a/src/gallium/drivers/nv40/nv40_state_fb.c
+++ b/src/gallium/drivers/nv40/nv40_state_fb.c
@@ -2,15 +2,19 @@
 #include "nouveau/nouveau_util.h"
 
 static struct pipe_buffer *
-nv40_surface_buffer(struct pipe_surface *surface)
+nv40_do_surface_buffer(struct pipe_surface *surface)
 {
 	struct nv40_miptree *mt = (struct nv40_miptree *)surface->texture;
 	return mt->buffer;
 }
 
+#define nv40_surface_buffer(ps) nouveau_bo(nv40_do_surface_buffer(ps))
+
 static boolean
 nv40_state_framebuffer_validate(struct nv40_context *nv40)
 {
+	struct nouveau_channel *chan = nv40->screen->base.channel;
+	struct nouveau_grobj *curie = nv40->screen->curie;
 	struct pipe_framebuffer_state *fb = &nv40->framebuffer;
 	struct nv04_surface *rt[4], *zeta;
 	uint32_t rt_enable, rt_format;
@@ -77,76 +81,80 @@ nv40_state_framebuffer_validate(struct nv40_context *nv40)
 	}
 
 	if (rt_enable & NV40TCL_RT_ENABLE_COLOR0) {
-		so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR0, 1);
-		so_reloc (so, nv40_surface_buffer(&rt[0]->base), 0, rt_flags | NOUVEAU_BO_OR,
-			  nv40->nvws->channel->vram->handle,
-			  nv40->nvws->channel->gart->handle);
-		so_method(so, nv40->screen->curie, NV40TCL_COLOR0_PITCH, 2);
+		so_method(so, curie, NV40TCL_DMA_COLOR0, 1);
+		so_reloc (so, nv40_surface_buffer(&rt[0]->base), 0,
+			      rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		so_method(so, curie, NV40TCL_COLOR0_PITCH, 2);
 		so_data  (so, rt[0]->pitch);
-		so_reloc (so, nv40_surface_buffer(&rt[0]->base), rt[0]->base.offset, rt_flags |
-			  NOUVEAU_BO_LOW, 0, 0);
+		so_reloc (so, nv40_surface_buffer(&rt[0]->base),
+			      rt[0]->base.offset, rt_flags | NOUVEAU_BO_LOW,
+			      0, 0);
 	}
 
 	if (rt_enable & NV40TCL_RT_ENABLE_COLOR1) {
-		so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR1, 1);
-		so_reloc (so, nv40_surface_buffer(&rt[1]->base), 0, rt_flags | NOUVEAU_BO_OR,
-			  nv40->nvws->channel->vram->handle,
-			  nv40->nvws->channel->gart->handle);
-		so_method(so, nv40->screen->curie, NV40TCL_COLOR1_OFFSET, 2);
-		so_reloc (so, nv40_surface_buffer(&rt[1]->base), rt[1]->base.offset, rt_flags |
-			  NOUVEAU_BO_LOW, 0, 0);
+		so_method(so, curie, NV40TCL_DMA_COLOR1, 1);
+		so_reloc (so, nv40_surface_buffer(&rt[1]->base), 0,
+			      rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		so_method(so, curie, NV40TCL_COLOR1_OFFSET, 2);
+		so_reloc (so, nv40_surface_buffer(&rt[1]->base),
+			      rt[1]->base.offset, rt_flags | NOUVEAU_BO_LOW,
+			      0, 0);
 		so_data  (so, rt[1]->pitch);
 	}
 
 	if (rt_enable & NV40TCL_RT_ENABLE_COLOR2) {
-		so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR2, 1);
-		so_reloc (so, nv40_surface_buffer(&rt[2]->base), 0, rt_flags | NOUVEAU_BO_OR,
-			  nv40->nvws->channel->vram->handle,
-			  nv40->nvws->channel->gart->handle);
-		so_method(so, nv40->screen->curie, NV40TCL_COLOR2_OFFSET, 1);
-		so_reloc (so, nv40_surface_buffer(&rt[2]->base), rt[2]->base.offset, rt_flags |
-			  NOUVEAU_BO_LOW, 0, 0);
-		so_method(so, nv40->screen->curie, NV40TCL_COLOR2_PITCH, 1);
+		so_method(so, curie, NV40TCL_DMA_COLOR2, 1);
+		so_reloc (so, nv40_surface_buffer(&rt[2]->base), 0,
+			      rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		so_method(so, curie, NV40TCL_COLOR2_OFFSET, 1);
+		so_reloc (so, nv40_surface_buffer(&rt[2]->base),
+			      rt[2]->base.offset, rt_flags | NOUVEAU_BO_LOW,
+			      0, 0);
+		so_method(so, curie, NV40TCL_COLOR2_PITCH, 1);
 		so_data  (so, rt[2]->pitch);
 	}
 
 	if (rt_enable & NV40TCL_RT_ENABLE_COLOR3) {
-		so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR3, 1);
-		so_reloc (so, nv40_surface_buffer(&rt[3]->base), 0, rt_flags | NOUVEAU_BO_OR,
-			  nv40->nvws->channel->vram->handle,
-			  nv40->nvws->channel->gart->handle);
-		so_method(so, nv40->screen->curie, NV40TCL_COLOR3_OFFSET, 1);
-		so_reloc (so, nv40_surface_buffer(&rt[3]->base), rt[3]->base.offset, rt_flags |
-			  NOUVEAU_BO_LOW, 0, 0);
-		so_method(so, nv40->screen->curie, NV40TCL_COLOR3_PITCH, 1);
+		so_method(so, curie, NV40TCL_DMA_COLOR3, 1);
+		so_reloc (so, nv40_surface_buffer(&rt[3]->base), 0,
+			      rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		so_method(so, curie, NV40TCL_COLOR3_OFFSET, 1);
+		so_reloc (so, nv40_surface_buffer(&rt[3]->base),
+			      rt[3]->base.offset, rt_flags | NOUVEAU_BO_LOW,
+			      0, 0);
+		so_method(so, curie, NV40TCL_COLOR3_PITCH, 1);
 		so_data  (so, rt[3]->pitch);
 	}
 
 	if (zeta_format) {
-		so_method(so, nv40->screen->curie, NV40TCL_DMA_ZETA, 1);
-		so_reloc (so, nv40_surface_buffer(&zeta->base), 0, rt_flags | NOUVEAU_BO_OR,
-			  nv40->nvws->channel->vram->handle,
-			  nv40->nvws->channel->gart->handle);
-		so_method(so, nv40->screen->curie, NV40TCL_ZETA_OFFSET, 1);
-		so_reloc (so, nv40_surface_buffer(&zeta->base), zeta->base.offset, rt_flags |
-			  NOUVEAU_BO_LOW, 0, 0);
-		so_method(so, nv40->screen->curie, NV40TCL_ZETA_PITCH, 1);
+		so_method(so, curie, NV40TCL_DMA_ZETA, 1);
+		so_reloc (so, nv40_surface_buffer(&zeta->base), 0,
+			      rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		so_method(so, curie, NV40TCL_ZETA_OFFSET, 1);
+		so_reloc (so, nv40_surface_buffer(&zeta->base),
+			      zeta->base.offset, rt_flags | NOUVEAU_BO_LOW, 0, 0);
+		so_method(so, curie, NV40TCL_ZETA_PITCH, 1);
 		so_data  (so, zeta->pitch);
 	}
 
-	so_method(so, nv40->screen->curie, NV40TCL_RT_ENABLE, 1);
+	so_method(so, curie, NV40TCL_RT_ENABLE, 1);
 	so_data  (so, rt_enable);
-	so_method(so, nv40->screen->curie, NV40TCL_RT_HORIZ, 3);
+	so_method(so, curie, NV40TCL_RT_HORIZ, 3);
 	so_data  (so, (w << 16) | 0);
 	so_data  (so, (h << 16) | 0);
 	so_data  (so, rt_format);
-	so_method(so, nv40->screen->curie, NV40TCL_VIEWPORT_HORIZ, 2);
+	so_method(so, curie, NV40TCL_VIEWPORT_HORIZ, 2);
 	so_data  (so, (w << 16) | 0);
 	so_data  (so, (h << 16) | 0);
-	so_method(so, nv40->screen->curie, NV40TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	so_method(so, curie, NV40TCL_VIEWPORT_CLIP_HORIZ(0), 2);
 	so_data  (so, ((w - 1) << 16) | 0);
 	so_data  (so, ((h - 1) << 16) | 0);
-	so_method(so, nv40->screen->curie, 0x1d88, 1);
+	so_method(so, curie, 0x1d88, 1);
 	so_data  (so, (1 << 12) | h);
 
 	so_ref(so, &nv40->state.hw[NV40_STATE_FB]);
diff --git a/src/gallium/drivers/nv40/nv40_surface.c b/src/gallium/drivers/nv40/nv40_surface.c
index 1a849da32d..a596547974 100644
--- a/src/gallium/drivers/nv40/nv40_surface.c
+++ b/src/gallium/drivers/nv40/nv40_surface.c
@@ -26,12 +26,13 @@
  * 
  **************************************************************************/
 
-#include "nv40_context.h"
 #include "pipe/p_defines.h"
-#include "pipe/internal/p_winsys_screen.h"
 #include "pipe/p_inlines.h"
+
 #include "util/u_tile.h"
 
+#include "nv40_context.h"
+
 static void
 nv40_surface_copy(struct pipe_context *pipe,
 		  struct pipe_surface *dest, unsigned destx, unsigned desty,
diff --git a/src/gallium/drivers/nv40/nv40_vbo.c b/src/gallium/drivers/nv40/nv40_vbo.c
index f3518b2e4f..b2753b8e2e 100644
--- a/src/gallium/drivers/nv40/nv40_vbo.c
+++ b/src/gallium/drivers/nv40/nv40_vbo.c
@@ -1,5 +1,6 @@
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
+#include "pipe/p_inlines.h"
 
 #include "nv40_context.h"
 #include "nv40_state.h"
@@ -70,7 +71,7 @@ static boolean
 nv40_vbo_set_idxbuf(struct nv40_context *nv40, struct pipe_buffer *ib,
 		    unsigned ib_size)
 {
-	struct pipe_screen *pscreen = &nv40->screen->pipe;
+	struct pipe_screen *pscreen = &nv40->screen->base.base;
 	unsigned type;
 
 	if (!ib) {
@@ -108,7 +109,7 @@ nv40_vbo_static_attrib(struct nv40_context *nv40, struct nouveau_stateobj *so,
 		       int attrib, struct pipe_vertex_element *ve,
 		       struct pipe_vertex_buffer *vb)
 {
-	struct pipe_winsys *ws = nv40->pipe.winsys;
+	struct pipe_screen *pscreen = nv40->pipe.screen;
 	struct nouveau_grobj *curie = nv40->screen->curie;
 	unsigned type, ncomp;
 	void *map;
@@ -116,7 +117,7 @@ nv40_vbo_static_attrib(struct nv40_context *nv40, struct nouveau_stateobj *so,
 	if (nv40_vbo_format_to_hw(ve->src_format, &type, &ncomp))
 		return FALSE;
 
-	map  = ws->buffer_map(ws, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ);
+	map  = pipe_buffer_map(pscreen, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ);
 	map += vb->buffer_offset + ve->src_offset;
 
 	switch (type) {
@@ -148,17 +149,17 @@ nv40_vbo_static_attrib(struct nv40_context *nv40, struct nouveau_stateobj *so,
 			so_data  (so, fui(v[0]));
 			break;
 		default:
-			ws->buffer_unmap(ws, vb->buffer);
+			pipe_buffer_unmap(pscreen, vb->buffer);
 			return FALSE;
 		}
 	}
 		break;
 	default:
-		ws->buffer_unmap(ws, vb->buffer);
+		pipe_buffer_unmap(pscreen, vb->buffer);
 		return FALSE;
 	}
 
-	ws->buffer_unmap(ws, vb->buffer);
+	pipe_buffer_unmap(pscreen, vb->buffer);
 
 	return TRUE;
 }
@@ -168,7 +169,7 @@ nv40_draw_arrays(struct pipe_context *pipe,
 		 unsigned mode, unsigned start, unsigned count)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
-	struct nouveau_channel *chan = nv40->nvws->channel;
+	struct nouveau_channel *chan = nv40->screen->base.channel;
 	unsigned restart;
 
 	nv40_vbo_set_idxbuf(nv40, NULL, 0);
@@ -227,7 +228,7 @@ static INLINE void
 nv40_draw_elements_u08(struct nv40_context *nv40, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv40->nvws->channel;
+	struct nouveau_channel *chan = nv40->screen->base.channel;
 
 	while (count) {
 		uint8_t *elts = (uint8_t *)ib + start;
@@ -276,7 +277,7 @@ static INLINE void
 nv40_draw_elements_u16(struct nv40_context *nv40, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv40->nvws->channel;
+	struct nouveau_channel *chan = nv40->screen->base.channel;
 
 	while (count) {
 		uint16_t *elts = (uint16_t *)ib + start;
@@ -325,7 +326,7 @@ static INLINE void
 nv40_draw_elements_u32(struct nv40_context *nv40, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv40->nvws->channel;
+	struct nouveau_channel *chan = nv40->screen->base.channel;
 
 	while (count) {
 		uint32_t *elts = (uint32_t *)ib + start;
@@ -367,10 +368,10 @@ nv40_draw_elements_inline(struct pipe_context *pipe,
 			  unsigned mode, unsigned start, unsigned count)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
-	struct pipe_winsys *ws = pipe->winsys;
+	struct pipe_screen *pscreen = pipe->screen;
 	void *map;
 
-	map = ws->buffer_map(ws, ib, PIPE_BUFFER_USAGE_CPU_READ);
+	map = pipe_buffer_map(pscreen, ib, PIPE_BUFFER_USAGE_CPU_READ);
 	if (!ib) {
 		NOUVEAU_ERR("failed mapping ib\n");
 		return FALSE;
@@ -391,7 +392,7 @@ nv40_draw_elements_inline(struct pipe_context *pipe,
 		break;
 	}
 
-	ws->buffer_unmap(ws, ib);
+	pipe_buffer_unmap(pscreen, ib);
 	return TRUE;
 }
 
@@ -400,7 +401,7 @@ nv40_draw_elements_vbo(struct pipe_context *pipe,
 		       unsigned mode, unsigned start, unsigned count)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
-	struct nouveau_channel *chan = nv40->nvws->channel;
+	struct nouveau_channel *chan = nv40->screen->base.channel;
 	unsigned restart;
 
 	while (count) {
@@ -519,17 +520,20 @@ nv40_vbo_validate(struct nv40_context *nv40)
 			return FALSE;
 		}
 
-		so_reloc(vtxbuf, vb->buffer, vb->buffer_offset + ve->src_offset,
-			 vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
-			 0, NV40TCL_VTXBUF_ADDRESS_DMA1);
+		so_reloc(vtxbuf, nouveau_bo(vb->buffer),
+				 vb->buffer_offset + ve->src_offset,
+				 vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+				 0, NV40TCL_VTXBUF_ADDRESS_DMA1);
 		so_data (vtxfmt, ((vb->stride << NV40TCL_VTXFMT_STRIDE_SHIFT) |
 				  (ncomp << NV40TCL_VTXFMT_SIZE_SHIFT) | type));
 	}
 
 	if (ib) {
+		struct nouveau_bo *bo = nouveau_bo(ib);
+
 		so_method(vtxbuf, curie, NV40TCL_IDXBUF_ADDRESS, 2);
-		so_reloc (vtxbuf, ib, 0, vb_flags | NOUVEAU_BO_LOW, 0, 0);
-		so_reloc (vtxbuf, ib, ib_format, vb_flags | NOUVEAU_BO_OR,
+		so_reloc (vtxbuf, bo, 0, vb_flags | NOUVEAU_BO_LOW, 0, 0);
+		so_reloc (vtxbuf, bo, ib_format, vb_flags | NOUVEAU_BO_OR,
 			  0, NV40TCL_IDXBUF_FORMAT_DMA1);
 	}
 
diff --git a/src/gallium/drivers/nv40/nv40_vertprog.c b/src/gallium/drivers/nv40/nv40_vertprog.c
index 7df9a4d326..e75e8d3f42 100644
--- a/src/gallium/drivers/nv40/nv40_vertprog.c
+++ b/src/gallium/drivers/nv40/nv40_vertprog.c
@@ -1,6 +1,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
+#include "pipe/p_inlines.h"
 
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
@@ -855,8 +856,7 @@ out_err:
 static boolean
 nv40_vertprog_validate(struct nv40_context *nv40)
 { 
-	struct nouveau_winsys *nvws = nv40->nvws;
-	struct pipe_winsys *ws = nv40->pipe.winsys;
+	struct pipe_screen *pscreen = nv40->pipe.screen;
 	struct nouveau_grobj *curie = nv40->screen->curie;
 	struct nv40_vertex_program *vp;
 	struct pipe_buffer *constbuf;
@@ -895,15 +895,15 @@ check_gpu_resources:
 		struct nouveau_stateobj *so;
 		uint vplen = vp->nr_insns;
 
-		if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) {
+		if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec)) {
 			while (heap->next && heap->size < vplen) {
 				struct nv40_vertex_program *evict;
 				
 				evict = heap->next->priv;
-				nvws->res_free(&evict->exec);
+				nouveau_resource_free(&evict->exec);
 			}
 
-			if (nvws->res_alloc(heap, vplen, vp, &vp->exec))
+			if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec))
 				assert(0);
 		}
 
@@ -925,15 +925,15 @@ check_gpu_resources:
 	if (vp->nr_consts && !vp->data) {
 		struct nouveau_resource *heap = nv40->screen->vp_data_heap;
 
-		if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) {
+		if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data)) {
 			while (heap->next && heap->size < vp->nr_consts) {
 				struct nv40_vertex_program *evict;
 				
 				evict = heap->next->priv;
-				nvws->res_free(&evict->data);
+				nouveau_resource_free(&evict->data);
 			}
 
-			if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data))
+			if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data))
 				assert(0);
 		}
 
@@ -981,8 +981,8 @@ check_gpu_resources:
 		float *map = NULL;
 
 		if (constbuf) {
-			map = ws->buffer_map(ws, constbuf,
-					     PIPE_BUFFER_USAGE_CPU_READ);
+			map = pipe_buffer_map(pscreen, constbuf,
+					      PIPE_BUFFER_USAGE_CPU_READ);
 		}
 
 		for (i = 0; i < vp->nr_consts; i++) {
@@ -1003,7 +1003,7 @@ check_gpu_resources:
 		}
 
 		if (constbuf)
-			ws->buffer_unmap(ws, constbuf);
+			pscreen->buffer_unmap(pscreen, constbuf);
 	}
 
 	/* Upload vtxprog */
@@ -1035,8 +1035,6 @@ check_gpu_resources:
 void
 nv40_vertprog_destroy(struct nv40_context *nv40, struct nv40_vertex_program *vp)
 {
-	struct nouveau_winsys *nvws = nv40->screen->nvws;
-
 	vp->translated = FALSE;
 
 	if (vp->nr_insns) {
@@ -1051,9 +1049,9 @@ nv40_vertprog_destroy(struct nv40_context *nv40, struct nv40_vertex_program *vp)
 		vp->nr_consts = 0;
 	}
 
-	nvws->res_free(&vp->exec);
+	nouveau_resource_free(&vp->exec);
 	vp->exec_start = 0;
-	nvws->res_free(&vp->data);
+	nouveau_resource_free(&vp->data);
 	vp->data_start = 0;
 	vp->data_start_min = 0;
 
diff --git a/src/gallium/drivers/nv50/nv50_clear.c b/src/gallium/drivers/nv50/nv50_clear.c
index 33427a15a5..e0b2d2880b 100644
--- a/src/gallium/drivers/nv50/nv50_clear.c
+++ b/src/gallium/drivers/nv50/nv50_clear.c
@@ -31,7 +31,7 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers,
 	   const float *rgba, double depth, unsigned stencil)
 {
 	struct nv50_context *nv50 = nv50_context(pipe);
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->base.channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct pipe_framebuffer_state *fb = &nv50->framebuffer;
 	unsigned mode = 0, i;
diff --git a/src/gallium/drivers/nv50/nv50_context.c b/src/gallium/drivers/nv50/nv50_context.c
index a511f655c1..e02afc4be9 100644
--- a/src/gallium/drivers/nv50/nv50_context.c
+++ b/src/gallium/drivers/nv50/nv50_context.c
@@ -33,7 +33,7 @@ nv50_flush(struct pipe_context *pipe, unsigned flags,
 {
 	struct nv50_context *nv50 = (struct nv50_context *)pipe;
 	
-	FIRE_RING(nv50->screen->nvws->channel);
+	FIRE_RING(nv50->screen->base.channel);
 }
 
 static void
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
index 7b67a75439..9b8cc4d37d 100644
--- a/src/gallium/drivers/nv50/nv50_context.h
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -63,6 +63,11 @@ struct nv50_rasterizer_stateobj {
 	struct nouveau_stateobj *so;
 };
 
+struct nv50_sampler_stateobj { /* per-sampler state object; the sampler[] array of struct nv50_context holds pointers to these */
+	bool normalized; /* NOTE(review): presumably "texture coordinates are normalized" -- confirm against the code that fills this in */
+	unsigned tsc[8]; /* eight state words; NOTE(review): presumably the hardware TSC entry for this sampler -- confirm */
+};
+
 struct nv50_miptree_level {
 	int *image_offset;
 	unsigned pitch;
@@ -70,7 +75,8 @@ struct nv50_miptree_level {
 
 struct nv50_miptree {
 	struct pipe_texture base;
-	struct pipe_buffer *buffer;
+
+	struct nouveau_bo *bo;
 
 	struct nv50_miptree_level level[PIPE_MAX_TEXTURE_LEVELS];
 	int image_nr;
@@ -93,13 +99,6 @@ nv50_surface(struct pipe_surface *pt)
 	return (struct nv50_surface *)pt;
 }
 
-static INLINE struct pipe_buffer *
-nv50_surface_buffer(struct pipe_surface *surface)
-{
-	struct nv50_miptree *mt = (struct nv50_miptree *)surface->texture;
-	return mt->buffer;
-}
-
 struct nv50_state {
 	unsigned dirty;
 
@@ -115,6 +114,7 @@ struct nv50_state {
 	unsigned viewport_bypass;
 	struct nouveau_stateobj *tsc_upload;
 	struct nouveau_stateobj *tic_upload;
+	unsigned miptree_nr;
 	struct nouveau_stateobj *vertprog;
 	struct nouveau_stateobj *fragprog;
 	struct nouveau_stateobj *vtxfmt;
@@ -147,7 +147,7 @@ struct nv50_context {
 	unsigned vtxbuf_nr;
 	struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS];
 	unsigned vtxelt_nr;
-	unsigned *sampler[PIPE_MAX_SAMPLERS];
+	struct nv50_sampler_stateobj *sampler[PIPE_MAX_SAMPLERS];
 	unsigned sampler_nr;
 	struct nv50_miptree *miptree[PIPE_MAX_SAMPLERS];
 	unsigned miptree_nr;
diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c
index f79a7ca86c..6b605ba416 100644
--- a/src/gallium/drivers/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nv50/nv50_miptree.c
@@ -29,26 +29,35 @@
 static struct pipe_texture *
 nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 {
+	struct nouveau_device *dev = nouveau_screen(pscreen)->device;
 	struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
 	struct pipe_texture *pt = &mt->base;
-	unsigned usage, width = tmp->width[0], height = tmp->height[0];
+	unsigned width = tmp->width[0], height = tmp->height[0];
 	unsigned depth = tmp->depth[0];
-	int i, l;
+	uint32_t tile_mode, tile_flags, tile_h;
+	int ret, i, l;
 
 	mt->base = *tmp;
 	pipe_reference_init(&mt->base.reference, 1);
 	mt->base.screen = pscreen;
 
-	usage = PIPE_BUFFER_USAGE_PIXEL;
 	switch (pt->format) {
 	case PIPE_FORMAT_Z24S8_UNORM:
 	case PIPE_FORMAT_Z16_UNORM:
-		usage |= NOUVEAU_BUFFER_USAGE_ZETA;
+		tile_flags = 0x2800;
 		break;
 	default:
+		tile_flags = 0x7000;
 		break;
 	}
 
+	if      (pt->height[0] > 32) tile_mode = 4;
+	else if (pt->height[0] > 16) tile_mode = 3;
+	else if (pt->height[0] >  8) tile_mode = 2;
+	else if (pt->height[0] >  4) tile_mode = 1;
+	else                         tile_mode = 0;
+	tile_h = 1 << (tile_mode + 2);
+
 	switch (pt->target) {
 	case PIPE_TEXTURE_3D:
 		mt->image_nr = pt->depth[0];
@@ -85,7 +94,7 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 
 			size  = align(pt->width[l], 8) * pt->block.size;
 			size  = align(size, 64);
-			size *= align(pt->height[l], 8) * pt->block.size;
+			size *= align(pt->height[l], tile_h) * pt->block.size;
 
 			lvl->image_offset[i] = mt->total_size;
 
@@ -93,12 +102,13 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 		}
 	}
 
-	mt->buffer = pscreen->buffer_create(pscreen, 256, usage, mt->total_size);
-	if (!mt->buffer) {
+	ret = nouveau_bo_new_tile(dev, NOUVEAU_BO_VRAM, 256, mt->total_size,
+				  tile_mode, tile_flags, &mt->bo);
+	if (ret) {
 		FREE(mt);
 		return NULL;
 	}
-
+			     
 	return &mt->base;
 }
 
@@ -106,6 +116,7 @@ static struct pipe_texture *
 nv50_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
 		     const unsigned *stride, struct pipe_buffer *pb)
 {
+	struct nouveau_bo *bo = nouveau_bo(pb);
 	struct nv50_miptree *mt;
 
 	/* Only supports 2D, non-mipmapped textures for the moment */
@@ -124,7 +135,7 @@ nv50_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
 	mt->level[0].pitch = *stride;
 	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned));
 
-	pipe_buffer_reference(&mt->buffer, pb);
+	nouveau_bo_ref(bo, &mt->bo);
 	return &mt->base;
 }
 
@@ -133,7 +144,7 @@ nv50_miptree_destroy(struct pipe_texture *pt)
 {
 	struct nv50_miptree *mt = nv50_miptree(pt);
 
-        pipe_buffer_reference(&mt->buffer, NULL);
+	nouveau_bo_ref(NULL, &mt->bo);
         FREE(mt);
 }
 
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 2d15868ae8..5f7d06dbec 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -85,6 +85,9 @@ struct nv50_reg {
 
 	int hw;
 	int neg;
+
+	int rhw; /* result hw for FP outputs, or interpolant index */
+	int acc; /* instruction where this reg is last read (first insn == 1) */
 };
 
 struct nv50_pc {
@@ -108,12 +111,23 @@ struct nv50_pc {
 
 	struct nv50_reg *temp_temp[16];
 	unsigned temp_temp_nr;
+
+	unsigned interp_mode[32];
+	/* perspective interpolation registers */
+	struct nv50_reg *iv_p;
+	struct nv50_reg *iv_c;
+
+	/* current instruction and total number of insns */
+	unsigned insn_cur;
+	unsigned insn_nr;
+
+	boolean allow32;
 };
 
 static void
 alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
 {
-	int i;
+	int i = 0;
 
 	if (reg->type == P_RESULT) {
 		if (pc->p->cfg.high_result < (reg->hw + 1))
@@ -131,7 +145,22 @@ alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
 		return;
 	}
 
-	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
+	if (reg->rhw != -1) {
+		/* try to allocate temporary with index rhw first */
+		if (!(pc->r_temp[reg->rhw])) {
+			pc->r_temp[reg->rhw] = reg;
+			reg->hw = reg->rhw;
+			if (pc->p->cfg.high_temp < (reg->rhw + 1))
+				pc->p->cfg.high_temp = reg->rhw + 1;
+			return;
+		}
+		/* make sure we don't get things like $r0 needs to go
+		 * in $r1 and $r1 in $r0
+		 */
+		i = pc->result_nr * 4;
+	}
+
+	for (; i < NV50_SU_MAX_TEMP; i++) {
 		if (!(pc->r_temp[i])) {
 			pc->r_temp[i] = reg;
 			reg->hw = i;
@@ -159,6 +188,7 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
 			r->type = P_TEMP;
 			r->index = -1;
 			r->hw = i;
+			r->rhw = -1;
 			pc->r_temp[i] = r;
 			return r;
 		}
@@ -168,6 +198,38 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
 	return NULL;
 }
 
+/* Hand the hardware slot owned by the scratch temporary src (index == -1)
+ * over to dst, dropping any slot dst held before. src is freed: do not reuse it.
+ */
+static void
+assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	assert(src->index == -1 && src->hw != -1); /* src must be a scratch temp that actually owns a slot */
+
+	if (dst->hw != -1)
+		pc->r_temp[dst->hw] = NULL; /* give up the slot dst was using until now */
+	pc->r_temp[src->hw] = dst; /* the slot table now refers to dst ... */
+	dst->hw = src->hw; /* ... and dst refers to the slot */
+
+	FREE(src);
+}
+
+/* Release the r_temp[] slot held by temporary r; r itself is freed as well when it is a scratch temp (index == -1). */
+static void
+release_hw(struct nv50_pc *pc, struct nv50_reg *r)
+{
+	assert(r->type == P_TEMP);
+	if (r->hw == -1) /* no slot assigned, nothing to release */
+		return;
+
+	assert(pc->r_temp[r->hw] == r); /* the slot table must agree that r owns this slot */
+	pc->r_temp[r->hw] = NULL; /* NOTE(review): r->hw keeps its old value afterwards -- confirm callers cope with the stale index */
+
+	r->acc = 0; /* forget the recorded last-read instruction */
+	if (r->index == -1) /* scratch temp: the struct is no longer needed */
+		FREE(r);
+}
+
 static void
 free_temp(struct nv50_pc *pc, struct nv50_reg *r)
 {
@@ -250,7 +312,13 @@ alloc_immd(struct nv50_pc *pc, float f)
 	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
 	unsigned hw;
 
-	hw = ctor_immd(pc, f, 0, 0, 0) * 4;
+	for (hw = 0; hw < pc->immd_nr * 4; hw++)
+		if (pc->immd_buf[hw] == f)
+			break;
+
+	if (hw == pc->immd_nr * 4)
+		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
+
 	r->type = P_IMMD;
 	r->hw = hw;
 	r->index = -1;
@@ -341,7 +409,8 @@ set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
 static INLINE void
 set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
 {
-	unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
+	float f = pc->immd_buf[imm->hw];
+	unsigned val = fui(imm->neg ? -f : f);
 
 	set_long(pc, e);
 	/*XXX: can't be predicated - bits overlap.. catch cases where both
@@ -354,20 +423,35 @@ set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
 	e->inst[1] |= (val >> 6) << 2;
 }
 
+
+#define INTERP_LINEAR		0
+#define INTERP_FLAT			1
+#define INTERP_PERSPECTIVE	2
+#define INTERP_CENTROID		4
+
+/* interpolant index has been stored in dst->rhw */
 static void
-emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,
-	    struct nv50_reg *src, struct nv50_reg *iv)
+emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
+		unsigned mode)
 {
+	assert(dst->rhw != -1);
 	struct nv50_program_exec *e = exec(pc);
 
 	e->inst[0] |= 0x80000000;
 	set_dst(pc, dst, e);
-	alloc_reg(pc, src);
-	e->inst[0] |= (src->hw << 16);
-	if (iv) {
-		e->inst[0] |= (1 << 25);
-		alloc_reg(pc, iv);
-		e->inst[0] |= (iv->hw << 9);
+	e->inst[0] |= (dst->rhw << 16);
+
+	if (mode & INTERP_FLAT) {
+		e->inst[0] |= (1 << 8);
+	} else {
+		if (mode & INTERP_PERSPECTIVE) {
+			e->inst[0] |= (1 << 25);
+			alloc_reg(pc, iv);
+			e->inst[0] |= (iv->hw << 9);
+		}
+
+		if (mode & INTERP_CENTROID)
+			e->inst[0] |= (1 << 24);
 	}
 
 	emit(pc, e);
@@ -378,22 +462,12 @@ set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
 	 struct nv50_program_exec *e)
 {
 	set_long(pc, e);
-#if 1
-	e->inst[1] |= (1 << 22);
-#else
-	if (src->type == P_IMMD) {
-		e->inst[1] |= (NV50_CB_PMISC << 22);
-	} else {
-		if (pc->p->type == PIPE_SHADER_VERTEX)
-			e->inst[1] |= (NV50_CB_PVP << 22);
-		else
-			e->inst[1] |= (NV50_CB_PFP << 22);
-	}
-#endif
 
 	e->param.index = src->hw;
 	e->param.shift = s;
 	e->param.mask = m << (s % 32);
+
+	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
 }
 
 static void
@@ -405,12 +479,11 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 
 	set_dst(pc, dst, e);
 
-	if (0 && dst->type != P_RESULT && src->type == P_IMMD) {
+	if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
 		set_immd(pc, src, e);
 		/*XXX: 32-bit, but steals part of "half" reg space - need to
 		 *     catch and handle this case if/when we do half-regs
 		 */
-		e->inst[0] |= 0x00008000;
 	} else
 	if (src->type == P_IMMD || src->type == P_CONST) {
 		set_long(pc, e);
@@ -426,18 +499,25 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 		e->inst[0] |= (src->hw << 9);
 	}
 
-	/* We really should support "half" instructions here at some point,
-	 * but I don't feel confident enough about them yet.
-	 */
-	set_long(pc, e);
 	if (is_long(e) && !is_immd(e)) {
 		e->inst[1] |= 0x04000000; /* 32-bit */
-		e->inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
-	}
+		e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
+		if (!(e->inst[1] & 0x20000000))
+			e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
+	} else
+		e->inst[0] |= 0x00008000;
 
 	emit(pc, e);
 }
 
+static INLINE void /* emit a MOV that loads the float constant f into dst */
+emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
+{
+	struct nv50_reg *imm = alloc_immd(pc, f); /* heap-allocated nv50_reg describing the immediate */
+	emit_mov(pc, dst, imm);
+	FREE(imm); /* the descriptor is only needed while the MOV is being emitted */
+}
+
 static boolean
 check_swap_src_0_1(struct nv50_pc *pc,
 		   struct nv50_reg **s0, struct nv50_reg **s1)
@@ -541,12 +621,26 @@ emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	struct nv50_program_exec *e = exec(pc);
 
 	e->inst[0] |= 0xc0000000;
-	set_long(pc, e);
+
+	if (!pc->allow32)
+		set_long(pc, e);
 
 	check_swap_src_0_1(pc, &src0, &src1);
 	set_dst(pc, dst, e);
 	set_src_0(pc, src0, e);
-	set_src_1(pc, src1, e);
+	if (src1->type == P_IMMD && !is_long(e)) {
+		if (src0->neg)
+			e->inst[0] |= 0x00008000;
+		set_immd(pc, src1, e);
+	} else {
+		set_src_1(pc, src1, e);
+		if (src0->neg ^ src1->neg) {
+			if (is_long(e))
+				e->inst[1] |= 0x08000000;
+			else
+				e->inst[0] |= 0x00008000;
+		}
+	}
 
 	emit(pc, e);
 }
@@ -560,11 +654,20 @@ emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
 	e->inst[0] |= 0xb0000000;
 
 	check_swap_src_0_1(pc, &src0, &src1);
+
+	if (!pc->allow32 || src0->neg || src1->neg) {
+		set_long(pc, e);
+		e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
+	}
+
 	set_dst(pc, dst, e);
 	set_src_0(pc, src0, e);
-	if (is_long(e))
+	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
 		set_src_2(pc, src1, e);
 	else
+	if (src1->type == P_IMMD)
+		set_immd(pc, src1, e);
+	else
 		set_src_1(pc, src1, e);
 
 	emit(pc, e);
@@ -588,25 +691,13 @@ emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
 	emit(pc, e);
 }
 
-static void
+static INLINE void /* dst = src0 - src1, emitted as an ADD with src1's negate flag flipped */
 emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	 struct nv50_reg *src1)
 {
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] |= 0xb0000000;
-
-	set_long(pc, e);
-	if (check_swap_src_0_1(pc, &src0, &src1))
-		e->inst[1] |= 0x04000000;
-	else
-		e->inst[1] |= 0x08000000;
-
-	set_dst(pc, dst, e);
-	set_src_0(pc, src0, e);
-	set_src_2(pc, src1, e);
-
-	emit(pc, e);
+	src1->neg ^= 1; /* NOTE(review): if src0 and src1 are the same nv50_reg this flips both operands -- confirm callers never alias them */
+	emit_add(pc, dst, src0, src1);
+	src1->neg ^= 1; /* restore the caller's negate flag */
 }
 
 static void
@@ -623,26 +714,21 @@ emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	set_src_1(pc, src1, e);
 	set_src_2(pc, src2, e);
 
+	if (src0->neg ^ src1->neg)
+		e->inst[1] |= 0x04000000;
+	if (src2->neg)
+		e->inst[1] |= 0x08000000;
+
 	emit(pc, e);
 }
 
-static void
+static INLINE void /* dst = src0 * src1 - src2, emitted as a MAD with src2's negate flag flipped */
 emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	 struct nv50_reg *src1, struct nv50_reg *src2)
 {
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] |= 0xe0000000;
-	set_long(pc, e);
-	e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */
-
-	check_swap_src_0_1(pc, &src0, &src1);
-	set_dst(pc, dst, e);
-	set_src_0(pc, src0, e);
-	set_src_1(pc, src1, e);
-	set_src_2(pc, src2, e);
-
-	emit(pc, e);
+	src2->neg ^= 1; /* NOTE(review): if src2 is the same nv50_reg as src0 or src1 the product's sign changes too -- confirm callers never alias them */
+	emit_mad(pc, dst, src0, src1, src2);
+	src2->neg ^= 1; /* restore the caller's negate flag */
 }
 
 static void
@@ -693,6 +779,48 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 	emit(pc, e);
 }
 
+#define CVTOP_RN	0x01
+#define CVTOP_FLOOR	0x03
+#define CVTOP_CEIL	0x05
+#define CVTOP_TRUNC	0x07
+#define CVTOP_SAT	0x08
+#define CVTOP_ABS	0x10
+
+#define CVT_F32_F32 0xc4
+#define CVT_F32_S32 0x44
+#define CVT_F32_U32 0x64
+#define CVT_S32_F32 0x8c
+#define CVT_S32_S32 0x0c
+#define CVT_F32_F32_ROP 0xcc
+
+static void /* emit a long-form instruction with opcode nibble 0xa (cvt), reading src and writing dst */
+emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
+	 int wp, unsigned cop, unsigned fmt)
+{
+	struct nv50_program_exec *e;
+	/* wp: predicate register to write, negative for none; cop: CVTOP_* flags; fmt: one of the CVT_* format codes */
+	e = exec(pc);
+	set_long(pc, e);
+
+	e->inst[0] |= 0xa0000000;
+	e->inst[1] |= 0x00004000;
+	e->inst[1] |= (cop << 16); /* rounding/saturate/abs selection goes in inst[1] from bit 16 up */
+	e->inst[1] |= (fmt << 24); /* source/destination format code goes in inst[1] from bit 24 up */
+	set_src_0(pc, src, e);
+
+	if (wp >= 0)
+		set_pred_wr(pc, 1, wp, e);
+
+	if (dst)
+		set_dst(pc, dst, e);
+	else { /* NOTE(review): with no dst these bits presumably encode "discard the result" -- confirm against the ISA notes */
+		e->inst[0] |= 0x000001fc;
+		e->inst[1] |= 0x00000008;
+	}
+
+	emit(pc, e);
+}
+
 static void
 emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
 	 struct nv50_reg *src0, struct nv50_reg *src1)
@@ -736,22 +864,10 @@ emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
 		free_temp(pc, dst);
 }
 
-static void
+static INLINE void /* dst = floor(src), expressed through emit_cvt() */
 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0xa0000000; /* cvt */
-	set_long(pc, e);
-	e->inst[1] |= (6 << 29); /* cvt */
-	e->inst[1] |= 0x08000000; /* integer mode */
-	e->inst[1] |= 0x04000000; /* 32 bit */
-	e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */
-	e->inst[1] |= (1 << 14); /* src .f32 */
-	set_dst(pc, dst, e);
-	set_src_0(pc, src, e);
-
-	emit(pc, e);
+	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP); /* wp == -1: no predicate register is written */
 }
 
 static void
@@ -768,21 +884,10 @@ emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
 	free_temp(pc, temp);
 }
 
-static void
+static INLINE void /* dst = |src|, expressed through emit_cvt() */
 emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0xa0000000; /* cvt */
-	set_long(pc, e);
-	e->inst[1] |= (6 << 29); /* cvt */
-	e->inst[1] |= 0x04000000; /* 32 bit */
-	e->inst[1] |= (1 << 14); /* src .f32 */
-	e->inst[1] |= ((1 << 6) << 14); /* .abs */
-	set_dst(pc, dst, e);
-	set_src_0(pc, src, e);
-
-	emit(pc, e);
+	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32); /* wp == -1: no predicate register is written */
 }
 
 static void
@@ -794,18 +899,12 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
 	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
 	struct nv50_reg *tmp[4];
+	boolean allow32 = pc->allow32;
 
-	if (mask & (1 << 0))
-		emit_mov(pc, dst[0], one);
-
-	if (mask & (1 << 3))
-		emit_mov(pc, dst[3], one);
+	pc->allow32 = FALSE;
 
 	if (mask & (3 << 1)) {
-		if (mask & (1 << 1))
-			tmp[0] = dst[1];
-		else
-			tmp[0] = temp_temp(pc);
+		tmp[0] = alloc_temp(pc, NULL);
 		emit_minmax(pc, 4, tmp[0], src[0], zero);
 	}
 
@@ -823,6 +922,26 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 		emit_mov(pc, dst[2], zero);
 		set_pred(pc, 3, 0, pc->p->exec_tail);
 	}
+
+	if (mask & (1 << 1))
+		assimilate_temp(pc, dst[1], tmp[0]);
+	else
+	if (mask & (1 << 2))
+		free_temp(pc, tmp[0]);
+
+	pc->allow32 = allow32;
+
+	/* do this last, in case src[i,j] == dst[0,3] */
+	if (mask & (1 << 0))
+		emit_mov(pc, dst[0], one);
+
+	if (mask & (1 << 3))
+		emit_mov(pc, dst[3], one);
+
+	FREE(pos128);
+	FREE(neg128);
+	FREE(zero);
+	FREE(one);
 }
 
 static void
@@ -853,6 +972,8 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
 	e->inst[1] = 0xc4014788;
 	set_src_0(pc, src, e);
 	set_pred_wr(pc, 1, r_pred, e);
+	if (src->neg)
+		e->inst[1] |= 0x20000000;
 	emit(pc, e);
 
 	/* This is probably KILP */
@@ -863,6 +984,180 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
 	emit(pc, e);
 }
 
+static void /* emit a texture fetch from sampler "unit" and copy the components selected by mask into dst[] */
+emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
+	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
+{
+	struct nv50_reg *temp, *t[4];
+	struct nv50_program_exec *e;
+	/* type is a TGSI_TEXTURE_* value; proj asks for the coordinates to be scaled by 1/src[3] first */
+	unsigned c, mode, dim = 2; /* 2 matches the TGSI_TEXTURE_UNKNOWN case; keeps dim defined if assert(0) below is compiled out */
+	/* dim = number of coordinate components prepared for the fetch */
+	switch (type) {
+	case TGSI_TEXTURE_1D:
+		dim = 1;
+		break;
+	case TGSI_TEXTURE_UNKNOWN:
+	case TGSI_TEXTURE_2D:
+	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
+	case TGSI_TEXTURE_RECT:
+		dim = 2;
+		break;
+	case TGSI_TEXTURE_3D:
+	case TGSI_TEXTURE_CUBE:
+	case TGSI_TEXTURE_SHADOW2D:
+	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
+		dim = 3;
+		break;
+	default:
+		assert(0);
+		break;
+	}
+	/* t[] is a scratch block: the coordinates are written into it and the TEX result is read back from it */
+	alloc_temp4(pc, t, 0);
+
+	if (proj) {
+		if (src[0]->type == P_TEMP && src[0]->rhw != -1) { /* coordinates carry an interpolant index: re-interpolate instead of multiplying */
+			mode = pc->interp_mode[src[0]->index];
+
+			t[3]->rhw = src[3]->rhw;
+			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
+			emit_flop(pc, 0, t[3], t[3]); /* flop 0 is the op used for TGSI RCP: t[3] = 1 / t[3] */
+
+			for (c = 0; c < dim; c++) {
+				t[c]->rhw = src[c]->rhw;
+				emit_interp(pc, t[c], t[3],
+					    (mode | INTERP_PERSPECTIVE));
+			}
+		} else {
+			emit_flop(pc, 0, t[3], src[3]); /* t[3] = 1 / src[3] */
+			for (c = 0; c < dim; c++)
+				emit_mul(pc, t[c], src[c], t[3]);
+
+			/* XXX: for some reason the blob sometimes uses MAD:
+			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
+			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
+			 */
+		}
+	} else {
+		if (type == TGSI_TEXTURE_CUBE) {
+			temp = temp_temp(pc);
+			emit_minmax(pc, 4, temp, src[0], src[1]); /* NOTE(review): combines the raw components, not their absolute values -- confirm this is intended for cube maps */
+			emit_minmax(pc, 4, temp, temp, src[2]);
+			emit_flop(pc, 0, temp, temp);
+			for (c = 0; c < 3; c++)
+				emit_mul(pc, t[c], src[c], temp);
+		} else {
+			for (c = 0; c < dim; c++)
+				emit_mov(pc, t[c], src[c]);
+		}
+	}
+	/* build the TEX instruction itself; its destination is the scratch block starting at t[0] */
+	e = exec(pc);
+	set_long(pc, e);
+	e->inst[0] |= 0xf0000000;
+	e->inst[1] |= 0x00000004;
+	set_dst(pc, t[0], e);
+	e->inst[0] |= (unit << 9);
+
+	if (dim == 2)
+		e->inst[0] |= 0x00400000;
+	else
+	if (dim == 3)
+		e->inst[0] |= 0x00800000;
+	/* the low two write-mask bits go into inst[0], the high two into inst[1] */
+	e->inst[0] |= (mask & 0x3) << 25;
+	e->inst[1] |= (mask & 0xc) << 12;
+
+	emit(pc, e);
+
+#if 1
+	if (mask & 1) emit_mov(pc, dst[0], t[0]);
+	if (mask & 2) emit_mov(pc, dst[1], t[1]);
+	if (mask & 4) emit_mov(pc, dst[2], t[2]);
+	if (mask & 8) emit_mov(pc, dst[3], t[3]);
+
+	free_temp4(pc, t);
+#else
+	/* XXX: if p.e. MUL is used directly after TEX, it would still use
+	 * the texture coordinates, not the fetched values: latency ? */
+
+	for (c = 0; c < 4; c++) {
+		if (mask & (1 << c))
+			assimilate_temp(pc, dst[c], t[c]);
+		else
+			free_temp(pc, t[c]);
+	}
+#endif
+}
+
+static void /* rewrite the short-form instruction e in place as its long form */
+convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
+{
+	unsigned q = 0, m = ~0; /* q: bits OR-ed into inst[1]; m: mask AND-ed onto inst[0] */
+
+	assert(!is_long(e));
+	/* the top nibble of inst[0] selects the opcode */
+	switch (e->inst[0] >> 28) {
+	case 0x1:
+		/* MOV */
+		q = 0x0403c000; /* same 32-bit flag and subsubop 0xf bits that emit_mov() sets for long MOVs */
+		m = 0xffff7fff; /* drop bit 0x8000, which emit_mov() sets only on the short form */
+		break;
+	case 0x8:
+		/* INTERP */
+		m = ~0x02000000; /* bit 25 is cleared from inst[0] ... */
+		if (e->inst[0] & 0x02000000)
+			q = 0x00020000; /* ... and, if it was set, re-expressed as this bit in inst[1] */
+		break;
+	case 0x9:
+		/* RCP */
+		break;
+	case 0xB:
+		/* ADD */
+		m = ~(127 << 16); /* remove the 7-bit field at bits 16..22 of inst[0] */
+		q = ((e->inst[0] & (~m)) >> 2); /* and place it at bits 14..20 of inst[1]; NOTE(review): presumably src1 moving to the src2 slot -- confirm */
+		break;
+	case 0xC:
+		/* MUL */
+		m = ~0x00008000; /* the short-form negate bit 0x8000 ... */
+		q = ((e->inst[0] & (~m)) << 12); /* ... becomes the long-form negate bit 0x08000000 in inst[1] */
+		break;
+	case 0xE:
+		/* MAD (if src2 == dst) */
+		q = ((e->inst[0] & 0x1fc) << 12); /* copy bits 2..8 of inst[0] to bits 14..20 of inst[1]; inst[0] itself is left unchanged */
+		break;
+	default:
+		assert(0);
+		break;
+	}
+
+	set_long(pc, e);
+	pc->p->exec_size++; /* the program's recorded size grows by one for the converted instruction */
+
+	e->inst[0] &= m;
+	e->inst[1] |= q;
+}
+
+static boolean /* TRUE if source operand i of insn may be negated through the nv50_reg neg flag instead of an extra NEG */
+negate_supported(const struct tgsi_full_instruction *insn, int i)
+{
+	switch (insn->Instruction.Opcode) { /* i is the index of the source register within insn */
+	case TGSI_OPCODE_DP3:
+	case TGSI_OPCODE_DP4:
+	case TGSI_OPCODE_MUL:
+	case TGSI_OPCODE_KIL:
+	case TGSI_OPCODE_ADD:
+	case TGSI_OPCODE_SUB:
+	case TGSI_OPCODE_MAD:
+		return TRUE;
+	case TGSI_OPCODE_POW:
+		return (i == 1) ? TRUE : FALSE; /* only the second POW operand qualifies */
+	default:
+		return FALSE;
+	}
+}
+
 static struct nv50_reg *
 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
 {
@@ -881,11 +1176,14 @@ tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
 }
 
 static struct nv50_reg *
-tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
+tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
+	 boolean neg)
 {
 	struct nv50_reg *r = NULL;
 	struct nv50_reg *temp;
-	unsigned c;
+	unsigned sgn, c;
+
+	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
 
 	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
 	switch (c) {
@@ -915,16 +1213,17 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
 		break;
 	case TGSI_EXTSWIZZLE_ZERO:
 		r = alloc_immd(pc, 0.0);
-		break;
+		return r;
 	case TGSI_EXTSWIZZLE_ONE:
-		r = alloc_immd(pc, 1.0);
-		break;
+		if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
+			return alloc_immd(pc, -1.0);
+		return alloc_immd(pc, 1.0);
 	default:
 		assert(0);
 		break;
 	}
 
-	switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
+	switch (sgn) {
 	case TGSI_UTIL_SIGN_KEEP:
 		break;
 	case TGSI_UTIL_SIGN_CLEAR:
@@ -933,14 +1232,21 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
 		r = temp;
 		break;
 	case TGSI_UTIL_SIGN_TOGGLE:
-		temp = temp_temp(pc);
-		emit_neg(pc, temp, r);
-		r = temp;
+		if (neg)
+			r->neg = 1;
+		else {
+			temp = temp_temp(pc);
+			emit_neg(pc, temp, r);
+			r = temp;
+		}
 		break;
 	case TGSI_UTIL_SIGN_SET:
 		temp = temp_temp(pc);
 		emit_abs(pc, temp, r);
-		emit_neg(pc, temp, r);
+		if (neg)
+			temp->neg = 1;
+		else
+			emit_neg(pc, temp, temp);
 		r = temp;
 		break;
 	default:
@@ -951,12 +1257,40 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
 	return r;
 }
 
+/* Returns TRUE if insn writes its destination components directly, so an earlier write could clobber a source that is still to be read. */
+static boolean
+direct2dest_op(const struct tgsi_full_instruction *insn)
+{
+	if (insn->Instruction.Saturate) /* saturated results are staged in temporaries by the caller anyway */
+		return FALSE;
+	/* NOTE(review): the opcodes listed below presumably build their result in a temporary first or order their writes safely -- confirm per opcode */
+	switch (insn->Instruction.Opcode) {
+	case TGSI_OPCODE_COS:
+	case TGSI_OPCODE_DP3:
+	case TGSI_OPCODE_DP4:
+	case TGSI_OPCODE_DPH:
+	case TGSI_OPCODE_KIL:
+	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_POW:
+	case TGSI_OPCODE_RCP:
+	case TGSI_OPCODE_RSQ:
+	case TGSI_OPCODE_SCS:
+	case TGSI_OPCODE_SIN:
+	case TGSI_OPCODE_TEX:
+	case TGSI_OPCODE_TXP:
+		return FALSE;
+	default:
+		return TRUE;
+	}
+}
+
 static boolean
 nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 {
 	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
 	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
 	unsigned mask, sat, unit;
+	boolean assimilate = FALSE;
 	int i, c;
 
 	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
@@ -967,6 +1301,10 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
 		else
 			dst[c] = NULL;
+		rdst[c] = NULL;
+		src[0][c] = NULL;
+		src[1][c] = NULL;
+		src[2][c] = NULL;
 	}
 
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
@@ -976,7 +1314,8 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 			unit = fs->SrcRegister.Index;
 
 		for (c = 0; c < 4; c++)
-			src[i][c] = tgsi_src(pc, c, fs);
+			src[i][c] = tgsi_src(pc, c, fs,
+					     negate_supported(inst, i));
 	}
 
 	if (sat) {
@@ -984,6 +1323,25 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 			rdst[c] = dst[c];
 			dst[c] = temp_temp(pc);
 		}
+	} else
+	if (direct2dest_op(inst)) {
+		for (c = 0; c < 4; c++) {
+			if (!dst[c] || dst[c]->type != P_TEMP)
+				continue;
+
+			for (i = c + 1; i < 4; i++) {
+				if (dst[c] == src[0][i] ||
+				    dst[c] == src[1][i] ||
+				    dst[c] == src[2][i])
+					break;
+			}
+			if (i == 4)
+				continue;
+
+			assimilate = TRUE;
+			rdst[c] = dst[c];
+			dst[c] = alloc_temp(pc, NULL);
+		}
 	}
 
 	switch (inst->Instruction.Opcode) {
@@ -1002,7 +1360,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_COS:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_precossin(pc, temp, src[0][0]);
 		emit_flop(pc, 5, temp, temp);
 		for (c = 0; c < 4; c++) {
@@ -1012,7 +1370,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_DP3:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
 		emit_mad(pc, temp, src[0][2], src[1][2], temp);
@@ -1021,10 +1379,9 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 				continue;
 			emit_mov(pc, dst[c], temp);
 		}
-		free_temp(pc, temp);
 		break;
 	case TGSI_OPCODE_DP4:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
 		emit_mad(pc, temp, src[0][2], src[1][2], temp);
@@ -1034,10 +1391,9 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 				continue;
 			emit_mov(pc, dst[c], temp);
 		}
-		free_temp(pc, temp);
 		break;
 	case TGSI_OPCODE_DPH:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
 		emit_mad(pc, temp, src[0][2], src[1][2], temp);
@@ -1047,7 +1403,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 				continue;
 			emit_mov(pc, dst[c], temp);
 		}
-		free_temp(pc, temp);
 		break;
 	case TGSI_OPCODE_DST:
 	{
@@ -1064,7 +1419,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 	}
 		break;
 	case TGSI_OPCODE_EX2:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_preex2(pc, temp, src[0][0]);
 		emit_flop(pc, 6, temp, temp);
 		for (c = 0; c < 4; c++) {
@@ -1072,7 +1427,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 				continue;
 			emit_mov(pc, dst[c], temp);
 		}
-		free_temp(pc, temp);
 		break;
 	case TGSI_OPCODE_FLR:
 		for (c = 0; c < 4; c++) {
@@ -1082,26 +1436,26 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_FRC:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
 			emit_flr(pc, temp, src[0][c]);
 			emit_sub(pc, dst[c], src[0][c], temp);
 		}
-		free_temp(pc, temp);
 		break;
 	case TGSI_OPCODE_KIL:
 		emit_kil(pc, src[0][0]);
 		emit_kil(pc, src[0][1]);
 		emit_kil(pc, src[0][2]);
 		emit_kil(pc, src[0][3]);
+		pc->p->cfg.fp.regs[2] |= 0x00100000;
 		break;
 	case TGSI_OPCODE_LIT:
 		emit_lit(pc, &dst[0], mask, &src[0][0]);
 		break;
 	case TGSI_OPCODE_LG2:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_flop(pc, 3, temp, src[0][0]);
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
@@ -1110,15 +1464,12 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_LRP:
+		temp = temp_temp(pc);
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			/*XXX: we can do better than this */
-			temp = alloc_temp(pc, NULL);
-			emit_neg(pc, temp, src[0][c]);
-			emit_mad(pc, temp, temp, src[2][c], src[2][c]);
-			emit_mad(pc, dst[c], src[0][c], src[1][c], temp);
-			free_temp(pc, temp);
+			emit_sub(pc, temp, src[1][c], src[2][c]);
+			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
 		}
 		break;
 	case TGSI_OPCODE_MAD:
@@ -1157,36 +1508,39 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_POW:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_pow(pc, temp, src[0][0], src[1][0]);
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
 			emit_mov(pc, dst[c], temp);
 		}
-		free_temp(pc, temp);
 		break;
 	case TGSI_OPCODE_RCP:
-		for (c = 0; c < 4; c++) {
+		for (c = 3; c >= 0; c--) {
 			if (!(mask & (1 << c)))
 				continue;
 			emit_flop(pc, 0, dst[c], src[0][0]);
 		}
 		break;
 	case TGSI_OPCODE_RSQ:
-		for (c = 0; c < 4; c++) {
+		for (c = 3; c >= 0; c--) {
 			if (!(mask & (1 << c)))
 				continue;
 			emit_flop(pc, 2, dst[c], src[0][0]);
 		}
 		break;
 	case TGSI_OPCODE_SCS:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_precossin(pc, temp, src[0][0]);
 		if (mask & (1 << 0))
 			emit_flop(pc, 5, dst[0], temp);
 		if (mask & (1 << 1))
 			emit_flop(pc, 4, dst[1], temp);
+		if (mask & (1 << 2))
+			emit_mov_immdval(pc, dst[2], 0.0);
+		if (mask & (1 << 3))
+			emit_mov_immdval(pc, dst[3], 1.0);
 		break;
 	case TGSI_OPCODE_SGE:
 		for (c = 0; c < 4; c++) {
@@ -1196,7 +1550,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_SIN:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_precossin(pc, temp, src[0][0]);
 		emit_flop(pc, 4, temp, temp);
 		for (c = 0; c < 4; c++) {
@@ -1220,33 +1574,15 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_TEX:
+		emit_tex(pc, dst, mask, src[0], unit,
+			 inst->InstructionExtTexture.Texture, FALSE);
+		break;
 	case TGSI_OPCODE_TXP:
-	{
-		struct nv50_reg *t[4];
-		struct nv50_program_exec *e;
-
-		alloc_temp4(pc, t, 0);
-		emit_mov(pc, t[0], src[0][0]);
-		emit_mov(pc, t[1], src[0][1]);
-
-		e = exec(pc);
-		e->inst[0] = 0xf6400000;
-		e->inst[0] |= (unit << 9);
-		set_long(pc, e);
-		e->inst[1] |= 0x0000c004;
-		set_dst(pc, t[0], e);
-		emit(pc, e);
-
-		if (mask & (1 << 0)) emit_mov(pc, dst[0], t[0]);
-		if (mask & (1 << 1)) emit_mov(pc, dst[1], t[1]);
-		if (mask & (1 << 2)) emit_mov(pc, dst[2], t[2]);
-		if (mask & (1 << 3)) emit_mov(pc, dst[3], t[3]);
-
-		free_temp4(pc, t);
-	}
+		emit_tex(pc, dst, mask, src[0], unit,
+			 inst->InstructionExtTexture.Texture, TRUE);
 		break;
 	case TGSI_OPCODE_XPD:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		if (mask & (1 << 0)) {
 			emit_mul(pc, temp, src[0][2], src[1][1]);
 			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
@@ -1259,7 +1595,8 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 			emit_mul(pc, temp, src[0][1], src[1][0]);
 			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
 		}
-		free_temp(pc, temp);
+		if (mask & (1 << 3))
+			emit_mov_immdval(pc, dst[3], 1.0);
 		break;
 	case TGSI_OPCODE_END:
 		break;
@@ -1270,21 +1607,26 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 
 	if (sat) {
 		for (c = 0; c < 4; c++) {
-			struct nv50_program_exec *e;
-
 			if (!(mask & (1 << c)))
 				continue;
-			e = exec(pc);
+			emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT,
+				 CVT_F32_F32);
+		}
+	} else if (assimilate) {
+		for (c = 0; c < 4; c++)
+			if (rdst[c])
+				assimilate_temp(pc, rdst[c], dst[c]);
+	}
 
-			e->inst[0] = 0xa0000000; /* cvt */
-			set_long(pc, e);
-			e->inst[1] |= (6 << 29); /* cvt */
-			e->inst[1] |= 0x04000000; /* 32 bit */
-			e->inst[1] |= (1 << 14); /* src .f32 */
-			e->inst[1] |= ((1 << 5) << 14); /* .sat */
-			set_dst(pc, rdst[c], e);
-			set_src_0(pc, dst[c], e);
-			emit(pc, e);
+	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+		for (c = 0; c < 4; c++) {
+			if (!src[i][c])
+				continue;
+			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
+				FREE(src[i][c]);
+			else
+			if (src[i][c]->acc == pc->insn_cur)
+				release_hw(pc, src[i][c]);
 		}
 	}
 
@@ -1292,12 +1634,169 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 	return TRUE;
 }
 
+/* Adjust a bitmask that indicates what components of a source are used,
+ * we use this in tx_prep so we only load interpolants that are needed.
+ */
+static void
+insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)
+{
+	const struct tgsi_instruction_ext_texture *tex;
+
+	switch (insn->Instruction.Opcode) {
+	case TGSI_OPCODE_DP3:
+		*mask = 0x7;
+		break;
+	case TGSI_OPCODE_DP4:
+	case TGSI_OPCODE_DPH:
+		*mask = 0xF;
+		break;
+	case TGSI_OPCODE_LIT:
+		*mask = 0xB;
+		break;
+	case TGSI_OPCODE_RCP:
+	case TGSI_OPCODE_RSQ:
+		*mask = 0x1;
+		break;
+	case TGSI_OPCODE_TEX:
+	case TGSI_OPCODE_TXP:
+		assert(insn->Instruction.Extended);
+		tex = &insn->InstructionExtTexture;
+
+		*mask = 0x7;
+		if (tex->Texture == TGSI_TEXTURE_1D)
+			*mask = 0x1;
+		else
+		if (tex->Texture == TGSI_TEXTURE_2D)
+			*mask = 0x3;
+
+		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
+			*mask |= 0x8;
+		break;
+	default:
+		break;
+	}
+}
+
+static void
+prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
+		  unsigned *r_usage[2])
+{
+	const struct tgsi_full_instruction *insn;
+	const struct tgsi_full_src_register *src;
+	const struct tgsi_dst_register *dst;
+
+	unsigned i, c, k, n, mask, *acc_p;
+
+	insn = &tok->FullInstruction;
+	dst = &insn->FullDstRegisters[0].DstRegister;
+	mask = dst->WriteMask;
+
+	if (!r_usage[0])
+		r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
+	if (!r_usage[1])
+		r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
+
+	if (dst->File == TGSI_FILE_TEMPORARY) {
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			r_usage[0][dst->Index * 4 + c] = pc->insn_nr;
+		}
+	}
+
+	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
+		src = &insn->FullSrcRegisters[i];
+
+		switch (src->SrcRegister.File) {
+		case TGSI_FILE_TEMPORARY:
+			acc_p = r_usage[0];
+			break;
+		case TGSI_FILE_INPUT:
+			acc_p = r_usage[1];
+			break;
+		default:
+			continue;
+		}
+
+		insn_adjust_mask(insn, &mask);
+
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+
+			k = tgsi_util_get_full_src_register_extswizzle(src, c);
+			switch (k) {
+			case TGSI_EXTSWIZZLE_X:
+			case TGSI_EXTSWIZZLE_Y:
+			case TGSI_EXTSWIZZLE_Z:
+			case TGSI_EXTSWIZZLE_W:
+				n = src->SrcRegister.Index * 4 + k;
+				acc_p[n] = pc->insn_nr;
+				break;
+			default:
+				break;
+			}
+		}
+	}
+}
+
+static unsigned
+load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
+	       int *aid, int *p_oid)
+{
+	struct nv50_reg *iv;
+	int oid, c, n;
+	unsigned mask = 0;
+
+	iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
+
+	for (c = 0, n = i * 4; c < 4; c++, n++) {
+		oid = (*p_oid)++;
+		pc->attr[n].type = P_TEMP;
+		pc->attr[n].index = i;
+
+		if (pc->attr[n].acc == acc[n])
+			continue;
+		mask |= (1 << c);
+
+		pc->attr[n].acc = acc[n];
+		pc->attr[n].rhw = pc->attr[n].hw = -1;
+		alloc_reg(pc, &pc->attr[n]);
+
+		pc->attr[n].rhw = (*aid)++;
+		emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
+
+		pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
+		(*mid)++;
+		pc->p->cfg.fp.regs[1] += 0x00010001;
+	}
+
+	return mask;
+}
+
 static boolean
 nv50_program_tx_prep(struct nv50_pc *pc)
 {
 	struct tgsi_parse_context p;
 	boolean ret = FALSE;
 	unsigned i, c;
+	unsigned fcol, bcol, fcrd, depr;
+
+	/* count (centroid) perspective interpolations */
+	unsigned centroid_loads = 0;
+	unsigned perspect_loads = 0;
+
+	/* track register access for temps and attrs */
+	unsigned *r_usage[2];
+	r_usage[0] = NULL;
+	r_usage[1] = NULL;
+
+	depr = fcol = bcol = fcrd = 0xffff;
+
+	if (pc->p->type == PIPE_SHADER_FRAGMENT) {
+		pc->p->cfg.fp.regs[0] = 0x01000404;
+		pc->p->cfg.fp.regs[1] = 0x00000400;
+	}
 
 	tgsi_parse_init(&p, pc->p->pipe.tokens);
 	while (!tgsi_parse_end_of_tokens(&p)) {
@@ -1319,9 +1818,10 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 		case TGSI_TOKEN_TYPE_DECLARATION:
 		{
 			const struct tgsi_full_declaration *d;
-			unsigned last;
+			unsigned last, first, mode;
 
 			d = &p.FullToken.FullDeclaration;
+			first = d->DeclarationRange.First;
 			last = d->DeclarationRange.Last;
 
 			switch (d->Declaration.File) {
@@ -1332,10 +1832,69 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 			case TGSI_FILE_OUTPUT:
 				if (pc->result_nr < (last + 1))
 					pc->result_nr = last + 1;
+
+				if (!d->Declaration.Semantic)
+					break;
+
+				switch (d->Semantic.SemanticName) {
+				case TGSI_SEMANTIC_POSITION:
+					depr = first;
+					pc->p->cfg.fp.regs[2] |= 0x00000100;
+					pc->p->cfg.fp.regs[3] |= 0x00000011;
+					break;
+				default:
+					break;
+				}
+
 				break;
 			case TGSI_FILE_INPUT:
+			{
 				if (pc->attr_nr < (last + 1))
 					pc->attr_nr = last + 1;
+
+				if (pc->p->type != PIPE_SHADER_FRAGMENT)
+					break;
+
+				switch (d->Declaration.Interpolate) {
+				case TGSI_INTERPOLATE_CONSTANT:
+					mode = INTERP_FLAT;
+					break;
+				case TGSI_INTERPOLATE_PERSPECTIVE:
+					mode = INTERP_PERSPECTIVE;
+					break;
+				default:
+					mode = INTERP_LINEAR;
+					break;
+				}
+
+				if (d->Declaration.Semantic) {
+					switch (d->Semantic.SemanticName) {
+					case TGSI_SEMANTIC_POSITION:
+						fcrd = first;
+						break;
+					case TGSI_SEMANTIC_COLOR:
+						fcol = first;
+						mode = INTERP_PERSPECTIVE;
+						break;
+					case TGSI_SEMANTIC_BCOLOR:
+						bcol = first;
+						mode = INTERP_PERSPECTIVE;
+						break;
+					}
+				}
+
+				if (d->Declaration.Centroid) {
+					mode |= INTERP_CENTROID;
+					if (mode & INTERP_PERSPECTIVE)
+						centroid_loads++;
+				} else
+				if (mode & INTERP_PERSPECTIVE)
+					perspect_loads++;
+
+				assert(last < 32);
+				for (i = first; i <= last; i++)
+					pc->interp_mode[i] = mode;
+			}
 				break;
 			case TGSI_FILE_CONSTANT:
 				if (pc->param_nr < (last + 1))
@@ -1351,6 +1910,8 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 		}
 			break;
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
+			pc->insn_nr++;
+			prep_inspect_insn(pc, tok, r_usage);
 			break;
 		default:
 			break;
@@ -1366,56 +1927,95 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 			for (c = 0; c < 4; c++) {
 				pc->temp[i*4+c].type = P_TEMP;
 				pc->temp[i*4+c].hw = -1;
+				pc->temp[i*4+c].rhw = -1;
 				pc->temp[i*4+c].index = i;
+				pc->temp[i*4+c].acc = r_usage[0][i*4+c];
 			}
 		}
 	}
 
 	if (pc->attr_nr) {
-		struct nv50_reg *iv = NULL;
-		int aid = 0;
+		int oid = 4, mid = 4, aid = 0;
+		/* oid = VP output id
+		 * aid = FP attribute/interpolant id
+		 * mid = VP output mapping field ID
+		 */
 
 		pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
 		if (!pc->attr)
 			goto out_err;
 
 		if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-			iv = alloc_temp(pc, NULL);
-			emit_interp(pc, iv, iv, NULL);
-			emit_flop(pc, 0, iv, iv);
-			aid++;
-		}
+			/* position should be loaded first */
+			if (fcrd != 0xffff) {
+				unsigned mask;
+				mid = 0;
+				mask = load_fp_attrib(pc, fcrd, r_usage[1],
+						      &mid, &aid, &oid);
+				oid = 0;
+				pc->p->cfg.fp.regs[1] |= (mask << 24);
+				pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
+			}
+			pc->p->cfg.fp.map[0] += 0x03020100;
 
-		for (i = 0; i < pc->attr_nr; i++) {
-			struct nv50_reg *a = &pc->attr[i*4];
+			/* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
 
-			for (c = 0; c < 4; c++) {
-				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-					struct nv50_reg *at =
-						alloc_temp(pc, NULL);
-					pc->attr[i*4+c].type = at->type;
-					pc->attr[i*4+c].hw = at->hw;
-					pc->attr[i*4+c].index = at->index;
+			if (perspect_loads) {
+				pc->iv_p = alloc_temp(pc, NULL);
+
+				if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
+					pc->p->cfg.fp.regs[1] |= 0x08000000;
+					pc->iv_p->rhw = aid++;
+					emit_interp(pc, pc->iv_p, NULL,
+						    INTERP_LINEAR);
+					emit_flop(pc, 0, pc->iv_p, pc->iv_p);
 				} else {
-					pc->p->cfg.vp.attr[aid/32] |=
-						(1 << (aid % 32));
-					pc->attr[i*4+c].type = P_ATTR;
-					pc->attr[i*4+c].hw = aid++;
-					pc->attr[i*4+c].index = i;
+					pc->iv_p->rhw = aid - 1;
+					emit_flop(pc, 0, pc->iv_p,
+						  &pc->attr[fcrd * 4 + 3]);
 				}
 			}
 
-			if (pc->p->type != PIPE_SHADER_FRAGMENT)
-				continue;
+			if (centroid_loads) {
+				pc->iv_c = alloc_temp(pc, NULL);
+				pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
+				emit_interp(pc, pc->iv_c, NULL,
+					    INTERP_CENTROID);
+				emit_flop(pc, 0, pc->iv_c, pc->iv_c);
+				pc->p->cfg.fp.regs[1] |= 0x08000000;
+			}
 
-			emit_interp(pc, &a[0], &a[0], iv);
-			emit_interp(pc, &a[1], &a[1], iv);
-			emit_interp(pc, &a[2], &a[2], iv);
-			emit_interp(pc, &a[3], &a[3], iv);
-		}
+			for (c = 0; c < 4; c++) {
+				/* I don't know what these values do, but
+				 * let's set them like the blob does:
+				 */
+				if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
+					pc->p->cfg.fp.regs[0] += 0x00010000;
+				if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
+					pc->p->cfg.fp.regs[0] += 0x00010000;
+			}
 
-		if (iv)
-			free_temp(pc, iv);
+			for (i = 0; i < pc->attr_nr; i++)
+				load_fp_attrib(pc, i, r_usage[1],
+					       &mid, &aid, &oid);
+
+			if (pc->iv_p)
+				free_temp(pc, pc->iv_p);
+			if (pc->iv_c)
+				free_temp(pc, pc->iv_c);
+
+			pc->p->cfg.fp.high_map = (mid / 4);
+			pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
+		} else {
+			/* vertex program */
+			for (i = 0; i < pc->attr_nr * 4; i++) {
+				pc->p->cfg.vp.attr[aid / 32] |=
+					(1 << (aid % 32));
+				pc->attr[i].type = P_ATTR;
+				pc->attr[i].hw = aid++;
+				pc->attr[i].index = i / 4;
+			}
+		}
 	}
 
 	if (pc->result_nr) {
@@ -1430,12 +2030,20 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
 					pc->result[i*4+c].type = P_TEMP;
 					pc->result[i*4+c].hw = -1;
+					pc->result[i*4+c].rhw = (i == depr) ?
+						-1 : rid++;
 				} else {
 					pc->result[i*4+c].type = P_RESULT;
 					pc->result[i*4+c].hw = rid++;
 				}
 				pc->result[i*4+c].index = i;
 			}
+
+			if (pc->p->type == PIPE_SHADER_FRAGMENT &&
+			    depr != 0xffff) {
+				pc->result[depr * 4 + 2].rhw =
+					(pc->result_nr - 1) * 4;
+			}
 		}
 	}
 
@@ -1456,7 +2064,7 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 	}
 
 	if (pc->immd_nr) {
-		int rid = pc->param_nr * 4;
+		int rid = 0;
 
 		pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
 		if (!pc->immd)
@@ -1473,15 +2081,38 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 
 	ret = TRUE;
 out_err:
+	if (r_usage[0])
+		FREE(r_usage[0]);
+	if (r_usage[1])
+		FREE(r_usage[1]);
+
 	tgsi_parse_free(&p);
 	return ret;
 }
 
+static void
+free_nv50_pc(struct nv50_pc *pc)
+{
+	if (pc->immd)
+		FREE(pc->immd);
+	if (pc->param)
+		FREE(pc->param);
+	if (pc->result)
+		FREE(pc->result);
+	if (pc->attr)
+		FREE(pc->attr);
+	if (pc->temp)
+		FREE(pc->temp);
+
+	FREE(pc);
+}
+
 static boolean
 nv50_program_tx(struct nv50_program *p)
 {
 	struct tgsi_parse_context parse;
 	struct nv50_pc *pc;
+	unsigned k;
 	boolean ret;
 
 	pc = CALLOC_STRUCT(nv50_pc);
@@ -1498,10 +2129,16 @@ nv50_program_tx(struct nv50_program *p)
 	while (!tgsi_parse_end_of_tokens(&parse)) {
 		const union tgsi_full_token *tok = &parse.FullToken;
 
+		/* don't allow half insn/immd on first and last instruction */
+		pc->allow32 = TRUE;
+		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
+			pc->allow32 = FALSE;
+
 		tgsi_parse_token(&parse);
 
 		switch (tok->Token.Type) {
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
+			++pc->insn_cur;
 			ret = nv50_program_tx_insn(pc, tok);
 			if (ret == FALSE)
 				goto out_err;
@@ -1515,8 +2152,40 @@ nv50_program_tx(struct nv50_program *p)
 		struct nv50_reg out;
 
 		out.type = P_TEMP;
-		for (out.hw = 0; out.hw < pc->result_nr * 4; out.hw++)
-			emit_mov(pc, &out, &pc->result[out.hw]);
+		for (k = 0; k < pc->result_nr * 4; k++) {
+			if (pc->result[k].rhw == -1)
+				continue;
+			if (pc->result[k].hw != pc->result[k].rhw) {
+				out.hw = pc->result[k].rhw;
+				emit_mov(pc, &out, &pc->result[k]);
+			}
+			if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
+				pc->p->cfg.high_result = pc->result[k].rhw + 1;
+		}
+	}
+
+	/* look for single half instructions and make them long */
+	struct nv50_program_exec *e, *e_prev;
+
+	for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
+		if (!is_long(e))
+			k++;
+
+		if (!e->next || is_long(e->next)) {
+			if (k & 1)
+				convert_to_long(pc, e);
+			k = 0;
+		}
+
+		if (e->next)
+			e_prev = e;
+	}
+
+	if (!is_long(pc->p->exec_tail)) {
+		/* this may occur if moving FP results */
+		assert(e_prev && !is_long(e_prev));
+		convert_to_long(pc, e_prev);
+		convert_to_long(pc, pc->p->exec_tail);
 	}
 
 	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
@@ -1530,6 +2199,7 @@ out_err:
 	tgsi_parse_free(&parse);
 
 out_cleanup:
+	free_nv50_pc(pc);
 	return ret;
 }
 
@@ -1543,16 +2213,16 @@ nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
 
 static void
 nv50_program_upload_data(struct nv50_context *nv50, float *map,
-			 unsigned start, unsigned count)
+			unsigned start, unsigned count, unsigned cbuf)
 {
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->base.channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 
 	while (count) {
 		unsigned nr = count > 2047 ? 2047 : count;
 
 		BEGIN_RING(chan, tesla, 0x00000f00, 1);
-		OUT_RING  (chan, (NV50_CB_PMISC << 0) | (start << 8));
+		OUT_RING  (chan, (cbuf << 0) | (start << 8));
 		BEGIN_RING(chan, tesla, 0x40000f04, nr);
 		OUT_RINGp (chan, map, nr);
 
@@ -1565,70 +2235,93 @@ nv50_program_upload_data(struct nv50_context *nv50, float *map,
 static void
 nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
 {
-	struct nouveau_winsys *nvws = nv50->screen->nvws;
-	struct pipe_winsys *ws = nv50->pipe.winsys;
-	unsigned nr = p->param_nr + p->immd_nr;
+	struct pipe_screen *pscreen = nv50->pipe.screen;
 
-	if (!p->data && nr) {
-		struct nouveau_resource *heap = nv50->screen->vp_data_heap;
+	if (!p->data[0] && p->immd_nr) {
+		struct nouveau_resource *heap = nv50->screen->immd_heap[0];
 
-		if (nvws->res_alloc(heap, nr, p, &p->data)) {
-			while (heap->next && heap->size < nr) {
+		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
+			while (heap->next && heap->size < p->immd_nr) {
 				struct nv50_program *evict = heap->next->priv;
-				nvws->res_free(&evict->data);
+				nouveau_resource_free(&evict->data[0]);
 			}
 
-			if (nvws->res_alloc(heap, nr, p, &p->data))
+			if (nouveau_resource_alloc(heap, p->immd_nr, p,
+						   &p->data[0]))
 				assert(0);
 		}
+
+		/* immediates only need to be uploaded again when freed */
+		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
+					 p->immd_nr, NV50_CB_PMISC);
 	}
 
-	if (p->param_nr) {
-		float *map = ws->buffer_map(ws, nv50->constbuf[p->type],
-					    PIPE_BUFFER_USAGE_CPU_READ);
-		nv50_program_upload_data(nv50, map, p->data->start,
-					 p->param_nr);
-		ws->buffer_unmap(ws, nv50->constbuf[p->type]);
+	if (!p->data[1] && p->param_nr) {
+		struct nouveau_resource *heap =
+			nv50->screen->parm_heap[p->type];
+
+		if (nouveau_resource_alloc(heap, p->param_nr, p, &p->data[1])) {
+			while (heap->next && heap->size < p->param_nr) {
+				struct nv50_program *evict = heap->next->priv;
+				nouveau_resource_free(&evict->data[1]);
+			}
+
+			if (nouveau_resource_alloc(heap, p->param_nr, p,
+						   &p->data[1]))
+				assert(0);
+		}
 	}
 
-	if (p->immd_nr) {
-		nv50_program_upload_data(nv50, p->immd,
-					 p->data->start + p->param_nr,
-					 p->immd_nr);
+	if (p->param_nr) {
+		unsigned cbuf = NV50_CB_PVP;
+		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
+					     PIPE_BUFFER_USAGE_CPU_READ);
+		if (p->type == PIPE_SHADER_FRAGMENT)
+			cbuf = NV50_CB_PFP;
+		nv50_program_upload_data(nv50, map, p->data[1]->start,
+					 p->param_nr, cbuf);
+		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
 	}
 }
 
 static void
 nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
 {
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->base.channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
-	struct pipe_screen *screen = nv50->pipe.screen;
 	struct nv50_program_exec *e;
 	struct nouveau_stateobj *so;
 	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
 	unsigned start, count, *up, *ptr;
 	boolean upload = FALSE;
 
-	if (!p->buffer) {
-		p->buffer = screen->buffer_create(screen, 0x100, 0, p->exec_size * 4);
+	if (!p->bo) {
+		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
+			       p->exec_size * 4, &p->bo);
 		upload = TRUE;
 	}
 
-	if (p->data && p->data->start != p->data_start) {
+	if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
+		(p->data[1] && p->data[1]->start != p->data_start[1])) {
 		for (e = p->exec_head; e; e = e->next) {
-			unsigned ei, ci;
+			unsigned ei, ci, bs;
 
 			if (e->param.index < 0)
 				continue;
+			bs = (e->inst[1] >> 22) & 0x07;
+			assert(bs < 2);
 			ei = e->param.shift >> 5;
-			ci = e->param.index + p->data->start;
+			ci = e->param.index + p->data[bs]->start;
 
 			e->inst[ei] &= ~e->param.mask;
 			e->inst[ei] |= (ci << e->param.shift);
 		}
 
-		p->data_start = p->data->start;
+		if (p->data[0])
+			p->data_start[0] = p->data[0]->start;
+		if (p->data[1])
+			p->data_start[1] = p->data[1]->start;
+
 		upload = TRUE;
 	}
 
@@ -1637,13 +2330,11 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
 
 #ifdef NV50_PROGRAM_DUMP
 	NOUVEAU_ERR("-------\n");
-	up = ptr = MALLOC(p->exec_size * 4);
 	for (e = p->exec_head; e; e = e->next) {
 		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
 		if (is_long(e))
 			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
 	}
-
 #endif
 
 	up = ptr = MALLOC(p->exec_size * 4);
@@ -1655,20 +2346,20 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
 
 	so = so_new(4,2);
 	so_method(so, nv50->screen->tesla, 0x1280, 3);
-	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
-	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0);
+	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
 	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
 
 	start = 0; count = p->exec_size;
 	while (count) {
-		struct nouveau_winsys *nvws = nv50->screen->nvws;
+		struct nouveau_channel *chan = nv50->screen->base.channel;
 		unsigned nr;
 
-		so_emit(nvws, so);
+		so_emit(chan, so);
 
 		nr = MIN2(count, 2047);
-		nr = MIN2(nvws->channel->pushbuf->remaining, nr);
-		if (nvws->channel->pushbuf->remaining < (nr + 3)) {
+		nr = MIN2(chan->pushbuf->remaining, nr);
+		if (chan->pushbuf->remaining < (nr + 3)) {
 			FIRE_RING(chan);
 			continue;
 		}
@@ -1704,10 +2395,10 @@ nv50_vertprog_validate(struct nv50_context *nv50)
 
 	so = so_new(13, 2);
 	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
-	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
-		  NOUVEAU_BO_HIGH, 0, 0);
-	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
-		  NOUVEAU_BO_LOW, 0, 0);
+	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		      NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		      NOUVEAU_BO_LOW, 0, 0);
 	so_method(so, tesla, 0x1650, 2);
 	so_data  (so, p->cfg.vp.attr[0]);
 	so_data  (so, p->cfg.vp.attr[1]);
@@ -1728,6 +2419,7 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nv50_program *p = nv50->fragprog;
 	struct nouveau_stateobj *so;
+	unsigned i;
 
 	if (!p->translated) {
 		nv50_program_validate(nv50, p);
@@ -1740,22 +2432,27 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 
 	so = so_new(64, 2);
 	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
-	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
-		  NOUVEAU_BO_HIGH, 0, 0);
-	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
-		  NOUVEAU_BO_LOW, 0, 0);
+	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		      NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		      NOUVEAU_BO_LOW, 0, 0);
 	so_method(so, tesla, 0x1904, 4);
-	so_data  (so, 0x00040404); /* p: 0x01000404 */
+	so_data  (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
 	so_data  (so, 0x00000004);
 	so_data  (so, 0x00000000);
 	so_data  (so, 0x00000000);
-	so_method(so, tesla, 0x16bc, 3); /*XXX: fixme */
-	so_data  (so, 0x03020100);
-	so_data  (so, 0x07060504);
-	so_data  (so, 0x0b0a0908);
+	so_method(so, tesla, 0x16bc, p->cfg.fp.high_map);
+	for (i = 0; i < p->cfg.fp.high_map; i++)
+		so_data(so, p->cfg.fp.map[i]);
 	so_method(so, tesla, 0x1988, 2);
-	so_data  (so, 0x08080408); //0x08040404); /* p: 0x0f000401 */
+	so_data  (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
 	so_data  (so, p->cfg.high_temp);
+	so_method(so, tesla, 0x1298, 1);
+	so_data  (so, p->cfg.high_result);
+	so_method(so, tesla, 0x19a8, 1);
+	so_data  (so, p->cfg.fp.regs[2]);
+	so_method(so, tesla, 0x196c, 1);
+	so_data  (so, p->cfg.fp.regs[3]);
 	so_method(so, tesla, 0x1414, 1);
 	so_data  (so, 0); /* program start offset */
 	so_ref(so, &nv50->state.fragprog);
@@ -1765,8 +2462,6 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 void
 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
 {
-	struct pipe_screen *pscreen = nv50->pipe.screen;
-
 	while (p->exec_head) {
 		struct nv50_program_exec *e = p->exec_head;
 
@@ -1776,10 +2471,10 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
 	p->exec_tail = NULL;
 	p->exec_size = 0;
 
-	if (p->buffer)
-		pipe_buffer_reference(&p->buffer, NULL);
+	nouveau_bo_ref(NULL, &p->bo);
 
-	nv50->screen->nvws->res_free(&p->data);
+	nouveau_resource_free(&p->data[0]);
+	nouveau_resource_free(&p->data[1]);
 
 	p->translated = 0;
 }
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 78deed6a38..096e0476aa 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -24,10 +24,10 @@ struct nv50_program {
 	struct nv50_program_exec *exec_head;
 	struct nv50_program_exec *exec_tail;
 	unsigned exec_size;
-	struct nouveau_resource *data;
-	unsigned data_start;
+	struct nouveau_resource *data[2];
+	unsigned data_start[2];
 
-	struct pipe_buffer *buffer;
+	struct nouveau_bo *bo;
 
 	float *immd;
 	unsigned immd_nr;
@@ -39,6 +39,11 @@ struct nv50_program {
 		struct {
 			unsigned attr[2];
 		} vp;
+		struct {
+			unsigned regs[4];
+			unsigned map[5];
+			unsigned high_map;
+		} fp;
 	} cfg;
 };
 
diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c
index 35cebdbdc3..940e04365f 100644
--- a/src/gallium/drivers/nv50/nv50_query.c
+++ b/src/gallium/drivers/nv50/nv50_query.c
@@ -26,7 +26,7 @@
 #include "nv50_context.h"
 
 struct nv50_query {
-	struct pipe_buffer *buffer;
+	struct nouveau_bo *bo;
 	unsigned type;
 	boolean ready;
 	uint64_t result;
@@ -41,14 +41,16 @@ nv50_query(struct pipe_query *pipe)
 static struct pipe_query *
 nv50_query_create(struct pipe_context *pipe, unsigned type)
 {
-	struct pipe_screen *screen = pipe->screen;
+	struct nouveau_device *dev = nouveau_screen(pipe->screen)->device;
 	struct nv50_query *q = CALLOC_STRUCT(nv50_query);
+	int ret;
 
 	assert (q->type == PIPE_QUERY_OCCLUSION_COUNTER);
 	q->type = type;
 
-	q->buffer = screen->buffer_create(screen, 256, 0, 16);
-	if (!q->buffer) {
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM | NOUVEAU_BO_MAP, 256,
+			     16, &q->bo);
+	if (ret) {
 		FREE(q);
 		return NULL;
 	}
@@ -62,7 +64,7 @@ nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
 	struct nv50_query *q = nv50_query(pq);
 
 	if (q) {
-		pipe_buffer_reference(&q->buffer, NULL);
+		nouveau_bo_ref(NULL, &q->bo);
 		FREE(q);
 	}
 }
@@ -71,7 +73,7 @@ static void
 nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 {
 	struct nv50_context *nv50 = nv50_context(pipe);
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->base.channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nv50_query *q = nv50_query(pq);
 
@@ -87,15 +89,14 @@ static void
 nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
 {
 	struct nv50_context *nv50 = nv50_context(pipe);
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->base.channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nv50_query *q = nv50_query(pq);
-	struct nouveau_bo *bo = nv50->screen->nvws->get_bo(q->buffer);
 
 	WAIT_RING (chan, 5);
 	BEGIN_RING(chan, tesla, 0x1b00, 4);
-	OUT_RELOCh(chan, bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RELOCh(chan, q->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RELOCl(chan, q->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 	OUT_RING  (chan, 0x00000000);
 	OUT_RING  (chan, 0x0100f002);
 	FIRE_RING (chan);
@@ -105,7 +106,6 @@ static boolean
 nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
 		  boolean wait, uint64_t *result)
 {
-	struct pipe_winsys *ws = pipe->winsys;
 	struct nv50_query *q = nv50_query(pq);
 
 	/*XXX: Want to be able to return FALSE here instead of blocking
@@ -113,11 +113,10 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
 	 */
 
 	if (!q->ready) {
-		uint32_t *map = ws->buffer_map(ws, q->buffer,
-					       PIPE_BUFFER_USAGE_CPU_READ);
-		q->result = map[1];
+		nouveau_bo_map(q->bo, NOUVEAU_BO_RD);
+		q->result = ((uint32_t *)q->bo->map)[1];
 		q->ready = TRUE;
-		ws->buffer_unmap(ws, q->buffer);
+		nouveau_bo_unmap(q->bo);
 	}
 
 	*result = q->result;
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index 2980564594..fd39fa738b 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -22,8 +22,6 @@
 
 #include "pipe/p_screen.h"
 
-#include "util/u_simple_screen.h"
-
 #include "nv50_context.h"
 #include "nv50_screen.h"
 
@@ -68,23 +66,6 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
 	return FALSE;
 }
 
-static const char *
-nv50_screen_get_name(struct pipe_screen *pscreen)
-{
-	struct nv50_screen *screen = nv50_screen(pscreen);
-	struct nouveau_device *dev = screen->nvws->channel->device;
-	static char buffer[128];
-
-	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
-	return buffer;
-}
-
-static const char *
-nv50_screen_get_vendor(struct pipe_screen *pscreen)
-{
-	return "nouveau";
-}
-
 static int
 nv50_screen_get_param(struct pipe_screen *pscreen, int param)
 {
@@ -153,37 +134,64 @@ nv50_screen_get_paramf(struct pipe_screen *pscreen, int param)
 static void
 nv50_screen_destroy(struct pipe_screen *pscreen)
 {
-	FREE(pscreen);
+	struct nv50_screen *screen = nv50_screen(pscreen);
+
+	nouveau_notifier_free(&screen->sync);
+	nouveau_grobj_free(&screen->tesla);
+	nouveau_grobj_free(&screen->eng2d);
+	nouveau_grobj_free(&screen->m2mf);
+	nouveau_screen_fini(&screen->base);
+	FREE(screen);
 }
 
 struct pipe_screen *
-nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
+nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 {
 	struct nv50_screen *screen = CALLOC_STRUCT(nv50_screen);
+	struct nouveau_channel *chan;
+	struct pipe_screen *pscreen;
 	struct nouveau_stateobj *so;
-	unsigned tesla_class = 0, ret;
-	unsigned chipset = nvws->channel->device->chipset;
-	int i;
+	unsigned chipset = dev->chipset;
+	unsigned tesla_class = 0;
+	int ret, i;
 
 	if (!screen)
 		return NULL;
-	screen->nvws = nvws;
+	pscreen = &screen->base.base;
+
+	ret = nouveau_screen_init(&screen->base, dev);
+	if (ret) {
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+	chan = screen->base.channel;
+
+	pscreen->winsys = ws;
+	pscreen->destroy = nv50_screen_destroy;
+	pscreen->get_param = nv50_screen_get_param;
+	pscreen->get_paramf = nv50_screen_get_paramf;
+	pscreen->is_format_supported = nv50_screen_is_format_supported;
+
+	nv50_screen_init_miptree_functions(pscreen);
+	nv50_transfer_init_screen_functions(pscreen);
 
 	/* DMA engine object */
-	ret = nvws->grobj_alloc(nvws, 0x5039, &screen->m2mf);
+	ret = nouveau_grobj_alloc(chan, 0xbeef5039, 0x5039, &screen->m2mf);
 	if (ret) {
 		NOUVEAU_ERR("Error creating M2MF object: %d\n", ret);
-		nv50_screen_destroy(&screen->pipe);
+		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
+	BIND_RING(chan, screen->m2mf, 1);
 
 	/* 2D object */
-	ret = nvws->grobj_alloc(nvws, NV50_2D, &screen->eng2d);
+	ret = nouveau_grobj_alloc(chan, 0xbeef502d, 0x502d, &screen->eng2d);
 	if (ret) {
 		NOUVEAU_ERR("Error creating 2D object: %d\n", ret);
-		nv50_screen_destroy(&screen->pipe);
+		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
+	BIND_RING(chan, screen->eng2d, 2);
 
 	/* 3D object */
 	switch (chipset & 0xf0) {
@@ -199,70 +207,55 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
 		break;
 	default:
 		NOUVEAU_ERR("Not a known NV50 chipset: NV%02x\n", chipset);
-		nv50_screen_destroy(&screen->pipe);
+		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
 
 	if (tesla_class == 0) {
 		NOUVEAU_ERR("Unknown G8x chipset: NV%02x\n", chipset);
-		nv50_screen_destroy(&screen->pipe);
+		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
 
-	ret = nvws->grobj_alloc(nvws, tesla_class, &screen->tesla);
+	ret = nouveau_grobj_alloc(chan, 0xbeef5097, tesla_class, &screen->tesla);
 	if (ret) {
 		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
-		nv50_screen_destroy(&screen->pipe);
+		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
+	BIND_RING(chan, screen->tesla, 3);
 
 	/* Sync notifier */
-	ret = nvws->notifier_alloc(nvws, 1, &screen->sync);
+	ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync);
 	if (ret) {
 		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
-		nv50_screen_destroy(&screen->pipe);
+		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
 
-        /* Setup the pipe */
-	screen->pipe.winsys = ws;
-
-	screen->pipe.destroy = nv50_screen_destroy;
-
-	screen->pipe.get_name = nv50_screen_get_name;
-	screen->pipe.get_vendor = nv50_screen_get_vendor;
-	screen->pipe.get_param = nv50_screen_get_param;
-	screen->pipe.get_paramf = nv50_screen_get_paramf;
-
-	screen->pipe.is_format_supported = nv50_screen_is_format_supported;
-
-	nv50_screen_init_miptree_functions(&screen->pipe);
-	nv50_transfer_init_screen_functions(&screen->pipe);
-	u_simple_screen_init(&screen->pipe);
-
 	/* Static M2MF init */
 	so = so_new(32, 0);
 	so_method(so, screen->m2mf, 0x0180, 3);
 	so_data  (so, screen->sync->handle);
-	so_data  (so, screen->nvws->channel->vram->handle);
-	so_data  (so, screen->nvws->channel->vram->handle);
-	so_emit(nvws, so);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->vram->handle);
+	so_emit(chan, so);
 	so_ref (NULL, &so);
 
 	/* Static 2D init */
 	so = so_new(64, 0);
 	so_method(so, screen->eng2d, NV50_2D_DMA_NOTIFY, 4);
 	so_data  (so, screen->sync->handle);
-	so_data  (so, screen->nvws->channel->vram->handle);
-	so_data  (so, screen->nvws->channel->vram->handle);
-	so_data  (so, screen->nvws->channel->vram->handle);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->vram->handle);
 	so_method(so, screen->eng2d, NV50_2D_OPERATION, 1);
 	so_data  (so, NV50_2D_OPERATION_SRCCOPY);
 	so_method(so, screen->eng2d, 0x0290, 1);
 	so_data  (so, 0);
 	so_method(so, screen->eng2d, 0x0888, 1);
 	so_data  (so, 1);
-	so_emit(nvws, so);
+	so_emit(chan, so);
 	so_ref(NULL, &so);
 
 	/* Static tesla init */
@@ -275,11 +268,11 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
 	so_method(so, screen->tesla, NV50TCL_DMA_UNK0(0),
 				     NV50TCL_DMA_UNK0__SIZE);
 	for (i = 0; i < NV50TCL_DMA_UNK0__SIZE; i++)
-		so_data(so, nvws->channel->vram->handle);
+		so_data(so, chan->vram->handle);
 	so_method(so, screen->tesla, NV50TCL_DMA_UNK1(0),
 				     NV50TCL_DMA_UNK1__SIZE);
 	for (i = 0; i < NV50TCL_DMA_UNK1__SIZE; i++)
-		so_data(so, nvws->channel->vram->handle);
+		so_data(so, chan->vram->handle);
 	so_method(so, screen->tesla, 0x121c, 1);
 	so_data  (so, 1);
 
@@ -290,27 +283,81 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
 	so_method(so, screen->tesla, 0x16b8, 1);
 	so_data  (so, 8);
 
-	/* Shared constant buffer */
-	screen->constbuf = screen->pipe.buffer_create(&screen->pipe, 0, 0, 128 * 4 * 4);
-	if (nvws->res_init(&screen->vp_data_heap, 0, 128)) {
-		NOUVEAU_ERR("Error initialising constant buffer\n");
-		nv50_screen_destroy(&screen->pipe);
+	/* constant buffers for immediates and VP/FP parameters */
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 128*4*4,
+			     &screen->constbuf_misc[0]);
+	if (ret) {
+		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
 
+	for (i = 0; i < 2; i++) {
+		ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 128*4*4,
+				     &screen->constbuf_parm[i]);
+		if (ret) {
+			nv50_screen_destroy(pscreen);
+			return NULL;
+		}
+	}
+
+	if (nouveau_resource_init(&screen->immd_heap[0], 0, 128) ||
+		nouveau_resource_init(&screen->parm_heap[0], 0, 128) ||
+		nouveau_resource_init(&screen->parm_heap[1], 0, 128))
+	{
+		NOUVEAU_ERR("Error initialising constant buffers.\n");
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	/*
+	// map constant buffers:
+	//  B = buffer ID (maybe more than 1 byte)
+	//  N = CB index used in shader instruction
+	//  P = program type (0 = VP, 2 = GP, 3 = FP)
+	so_method(so, screen->tesla, 0x1694, 1);
+	so_data  (so, 0x000BBNP1);
+	*/
+
 	so_method(so, screen->tesla, 0x1280, 3);
-	so_reloc (so, screen->constbuf, 0, NOUVEAU_BO_VRAM |
+	so_reloc (so, screen->constbuf_misc[0], 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
-	so_reloc (so, screen->constbuf, 0, NOUVEAU_BO_VRAM |
+	so_reloc (so, screen->constbuf_misc[0], 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
-	so_data  (so, (NV50_CB_PMISC << 16) | 0x00001000);
+	so_data  (so, (NV50_CB_PMISC << 16) | 0x00000800);
+	so_method(so, screen->tesla, 0x1694, 1);
+	so_data  (so, 0x00000001 | (NV50_CB_PMISC << 12));
+	so_method(so, screen->tesla, 0x1694, 1);
+	so_data  (so, 0x00000031 | (NV50_CB_PMISC << 12));
+
+	so_method(so, screen->tesla, 0x1280, 3);
+	so_reloc (so, screen->constbuf_parm[0], 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, screen->constbuf_parm[0], 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+	so_data  (so, (NV50_CB_PVP << 16) | 0x00000800);
+	so_method(so, screen->tesla, 0x1694, 1);
+	so_data  (so, 0x00000101 | (NV50_CB_PVP << 12));
+
+	so_method(so, screen->tesla, 0x1280, 3);
+	so_reloc (so, screen->constbuf_parm[1], 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, screen->constbuf_parm[1], 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+	so_data  (so, (NV50_CB_PFP << 16) | 0x00000800);
+	so_method(so, screen->tesla, 0x1694, 1);
+	so_data  (so, 0x00000131 | (NV50_CB_PFP << 12));
 
 	/* Texture sampler/image unit setup - we abuse the constant buffer
 	 * upload mechanism for the moment to upload data to the tex config
 	 * blocks.  At some point we *may* want to go the NVIDIA way of doing
 	 * things?
 	 */
-	screen->tic = screen->pipe.buffer_create(&screen->pipe, 0, 0, 32 * 8 * 4);
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 32*8*4, &screen->tic);
+	if (ret) {
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+
 	so_method(so, screen->tesla, 0x1280, 3);
 	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
@@ -324,7 +371,12 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
 		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
 	so_data  (so, 0x00000800);
 
-	screen->tsc = screen->pipe.buffer_create(&screen->pipe, 0, 0, 32 * 8 * 4);
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 32*8*4, &screen->tsc);
+	if (ret) {
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+
 	so_method(so, screen->tesla, 0x1280, 3);
 	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
@@ -352,14 +404,12 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
 
 	so_method(so, screen->tesla, 0x1234, 1);
 	so_data  (so, 1);
-	so_method(so, screen->tesla, 0x1458, 1);
-	so_data  (so, 1);
 
-	so_emit(nvws, so);
+	so_emit(chan, so);
 	so_ref (so, &screen->static_init);
 	so_ref (NULL, &so);
-	nvws->push_flush(nvws, 0, NULL);
+	nouveau_pushbuf_flush(chan, 0);
 
-	return &screen->pipe;
+	return pscreen;
 }
 
diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h
index db567aaac8..61e24a5b57 100644
--- a/src/gallium/drivers/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nv50/nv50_screen.h
@@ -1,10 +1,10 @@
 #ifndef __NV50_SCREEN_H__
 #define __NV50_SCREEN_H__
 
-#include "pipe/p_screen.h"
+#include "nouveau/nouveau_screen.h"
 
 struct nv50_screen {
-	struct pipe_screen pipe;
+	struct nouveau_screen base;
 
 	struct nouveau_winsys *nvws;
 
@@ -15,11 +15,14 @@ struct nv50_screen {
 	struct nouveau_grobj *m2mf;
 	struct nouveau_notifier *sync;
 
-	struct pipe_buffer *constbuf;
-	struct nouveau_resource *vp_data_heap;
+	struct nouveau_bo *constbuf_misc[1];
+	struct nouveau_bo *constbuf_parm[2];
 
-	struct pipe_buffer *tic;
-	struct pipe_buffer *tsc;
+	struct nouveau_resource *immd_heap[1];
+	struct nouveau_resource *parm_heap[2];
+
+	struct nouveau_bo *tic;
+	struct nouveau_bo *tsc;
 
 	struct nouveau_stateobj *static_init;
 };
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index ba852194cd..116866a8e7 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -136,9 +136,11 @@ static void *
 nv50_sampler_state_create(struct pipe_context *pipe,
 			  const struct pipe_sampler_state *cso)
 {
-	unsigned *tsc = CALLOC(8, sizeof(unsigned));
+	struct nv50_sampler_stateobj *sso = CALLOC(1, sizeof(*sso));
+	unsigned *tsc = sso->tsc;
+	float limit;
 
-	tsc[0] = (0x00024000 |
+	tsc[0] = (0x00026000 |
 		  (wrap_mode(cso->wrap_s) << 0) |
 		  (wrap_mode(cso->wrap_t) << 3) |
 		  (wrap_mode(cso->wrap_r) << 6));
@@ -202,7 +204,14 @@ nv50_sampler_state_create(struct pipe_context *pipe,
 		tsc[0] |= (nvgl_comparison_op(cso->compare_func) & 0x7);
 	}
 
-	return (void *)tsc;
+	limit = CLAMP(cso->lod_bias, -16.0, 15.0);
+	tsc[1] |= ((int)(limit * 256.0) & 0x1fff) << 11;
+
+	tsc[2] |= ((int)CLAMP(cso->max_lod, 0.0, 15.0) << 20) |
+		  ((int)CLAMP(cso->min_lod, 0.0, 15.0) << 8);
+
+	sso->normalized = cso->normalized_coords;
+	return (void *)sso;
 }
 
 static void
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index c13d3de1cb..0caf4b4e91 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -32,6 +32,9 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 	unsigned i, w, h, gw = 0;
 
 	for (i = 0; i < fb->nr_cbufs; i++) {
+		struct pipe_texture *pt = fb->cbufs[i]->texture;
+		struct nouveau_bo *bo = nv50_miptree(pt)->bo;
+
 		if (!gw) {
 			w = fb->cbufs[i]->width;
 			h = fb->cbufs[i]->height;
@@ -46,12 +49,10 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 		so_data  (so, fb->cbufs[i]->height);
 
 		so_method(so, tesla, NV50TCL_RT_ADDRESS_HIGH(i), 5);
-		so_reloc (so, nv50_surface_buffer(fb->cbufs[i]), fb->cbufs[i]->offset,
-			  NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH |
-			  NOUVEAU_BO_RDWR, 0, 0);
-		so_reloc (so, nv50_surface_buffer(fb->cbufs[i]), fb->cbufs[i]->offset,
-			  NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW |
-			  NOUVEAU_BO_RDWR, 0, 0);
+		so_reloc (so, bo, fb->cbufs[i]->offset, NOUVEAU_BO_VRAM |
+			      NOUVEAU_BO_HIGH | NOUVEAU_BO_RDWR, 0, 0);
+		so_reloc (so, bo, fb->cbufs[i]->offset, NOUVEAU_BO_VRAM |
+			      NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0);
 		switch (fb->cbufs[i]->format) {
 		case PIPE_FORMAT_A8R8G8B8_UNORM:
 			so_data(so, 0xcf);
@@ -65,7 +66,7 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 			so_data(so, 0xe6);
 			break;
 		}
-		so_data(so, 0x00000000);
+		so_data(so, bo->tile_mode << 4);
 		so_data(so, 0x00000000);
 
 		so_method(so, tesla, 0x1224, 1);
@@ -73,6 +74,9 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 	}
 
 	if (fb->zsbuf) {
+		struct pipe_texture *pt = fb->zsbuf->texture;
+		struct nouveau_bo *bo = nv50_miptree(pt)->bo;
+
 		if (!gw) {
 			w = fb->zsbuf->width;
 			h = fb->zsbuf->height;
@@ -83,12 +87,10 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 		}
 
 		so_method(so, tesla, NV50TCL_ZETA_ADDRESS_HIGH, 5);
-		so_reloc (so, nv50_surface_buffer(fb->zsbuf), fb->zsbuf->offset,
-			  NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH |
-			  NOUVEAU_BO_RDWR, 0, 0);
-		so_reloc (so, nv50_surface_buffer(fb->zsbuf), fb->zsbuf->offset,
-			  NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW |
-			  NOUVEAU_BO_RDWR, 0, 0);
+		so_reloc (so, bo, fb->zsbuf->offset, NOUVEAU_BO_VRAM |
+			      NOUVEAU_BO_HIGH | NOUVEAU_BO_RDWR, 0, 0);
+		so_reloc (so, bo, fb->zsbuf->offset, NOUVEAU_BO_VRAM |
+			      NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0);
 		switch (fb->zsbuf->format) {
 		case PIPE_FORMAT_Z24S8_UNORM:
 			so_data(so, 0x16);
@@ -102,7 +104,7 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 			so_data(so, 0x16);
 			break;
 		}
-		so_data(so, 0x00000000);
+		so_data(so, bo->tile_mode << 4);
 		so_data(so, 0x00000000);
 
 		so_method(so, tesla, 0x1538, 1);
@@ -131,7 +133,7 @@ static void
 nv50_state_emit(struct nv50_context *nv50)
 {
 	struct nv50_screen *screen = nv50->screen;
-	struct nouveau_winsys *nvws = screen->nvws;
+	struct nouveau_channel *chan = screen->base.channel;
 
 	if (nv50->pctx_id != screen->cur_pctx) {
 		nv50->state.dirty |= 0xffffffff;
@@ -139,40 +141,40 @@ nv50_state_emit(struct nv50_context *nv50)
 	}
 
 	if (nv50->state.dirty & NV50_NEW_FRAMEBUFFER)
-		so_emit(nvws, nv50->state.fb);
+		so_emit(chan, nv50->state.fb);
 	if (nv50->state.dirty & NV50_NEW_BLEND)
-		so_emit(nvws, nv50->state.blend);
+		so_emit(chan, nv50->state.blend);
 	if (nv50->state.dirty & NV50_NEW_ZSA)
-		so_emit(nvws, nv50->state.zsa);
+		so_emit(chan, nv50->state.zsa);
 	if (nv50->state.dirty & NV50_NEW_VERTPROG)
-		so_emit(nvws, nv50->state.vertprog);
+		so_emit(chan, nv50->state.vertprog);
 	if (nv50->state.dirty & NV50_NEW_FRAGPROG)
-		so_emit(nvws, nv50->state.fragprog);
+		so_emit(chan, nv50->state.fragprog);
 	if (nv50->state.dirty & NV50_NEW_RASTERIZER)
-		so_emit(nvws, nv50->state.rast);
+		so_emit(chan, nv50->state.rast);
 	if (nv50->state.dirty & NV50_NEW_BLEND_COLOUR)
-		so_emit(nvws, nv50->state.blend_colour);
+		so_emit(chan, nv50->state.blend_colour);
 	if (nv50->state.dirty & NV50_NEW_STIPPLE)
-		so_emit(nvws, nv50->state.stipple);
+		so_emit(chan, nv50->state.stipple);
 	if (nv50->state.dirty & NV50_NEW_SCISSOR)
-		so_emit(nvws, nv50->state.scissor);
+		so_emit(chan, nv50->state.scissor);
 	if (nv50->state.dirty & NV50_NEW_VIEWPORT)
-		so_emit(nvws, nv50->state.viewport);
+		so_emit(chan, nv50->state.viewport);
 	if (nv50->state.dirty & NV50_NEW_SAMPLER)
-		so_emit(nvws, nv50->state.tsc_upload);
+		so_emit(chan, nv50->state.tsc_upload);
 	if (nv50->state.dirty & NV50_NEW_TEXTURE)
-		so_emit(nvws, nv50->state.tic_upload);
+		so_emit(chan, nv50->state.tic_upload);
 	if (nv50->state.dirty & NV50_NEW_ARRAYS) {
-		so_emit(nvws, nv50->state.vtxfmt);
-		so_emit(nvws, nv50->state.vtxbuf);
+		so_emit(chan, nv50->state.vtxfmt);
+		so_emit(chan, nv50->state.vtxbuf);
 	}
 	nv50->state.dirty = 0;
 
-	so_emit_reloc_markers(nvws, nv50->state.fb);
-	so_emit_reloc_markers(nvws, nv50->state.vertprog);
-	so_emit_reloc_markers(nvws, nv50->state.fragprog);
-	so_emit_reloc_markers(nvws, nv50->state.vtxbuf);
-	so_emit_reloc_markers(nvws, nv50->screen->static_init);
+	so_emit_reloc_markers(chan, nv50->state.fb);
+	so_emit_reloc_markers(chan, nv50->state.vertprog);
+	so_emit_reloc_markers(chan, nv50->state.fragprog);
+	so_emit_reloc_markers(chan, nv50->state.vtxbuf);
+	so_emit_reloc_markers(chan, nv50->screen->static_init);
 }
 
 boolean
@@ -293,12 +295,12 @@ viewport_uptodate:
 		so_data  (so, NV50_CB_TSC);
 		so_method(so, tesla, 0x40000f04, nv50->sampler_nr * 8);
 		for (i = 0; i < nv50->sampler_nr; i++)
-			so_datap (so, nv50->sampler[i], 8);
+			so_datap (so, nv50->sampler[i]->tsc, 8);
 		so_ref(so, &nv50->state.tsc_upload);
 		so_ref(NULL, &so);
 	}
 
-	if (nv50->dirty & NV50_NEW_TEXTURE)
+	if (nv50->dirty & (NV50_NEW_TEXTURE | NV50_NEW_SAMPLER))
 		nv50_tex_validate(nv50);
 
 	if (nv50->dirty & NV50_NEW_ARRAYS)
diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c
index 0cc5168144..8db3b6d344 100644
--- a/src/gallium/drivers/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nv50/nv50_surface.c
@@ -52,21 +52,17 @@ static int
 nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
 {
 	struct nv50_miptree *mt = nv50_miptree(ps->texture);
-	struct nouveau_channel *chan = screen->nvws->channel;
+	struct nouveau_channel *chan = screen->eng2d->channel;
 	struct nouveau_grobj *eng2d = screen->eng2d;
-	struct nouveau_bo *bo;
+	struct nouveau_bo *bo = nv50_miptree(ps->texture)->bo;
  	int format, mthd = dst ? NV50_2D_DST_FORMAT : NV50_2D_SRC_FORMAT;
  	int flags = NOUVEAU_BO_VRAM | (dst ? NOUVEAU_BO_WR : NOUVEAU_BO_RD);
- 
-	bo = screen->nvws->get_bo(nv50_miptree(ps->texture)->buffer);
-	if (!bo)
-		return 1;
 
  	format = nv50_format(ps->format);
  	if (format < 0)
  		return 1;
   
- 	if (!bo->tiled) {
+ 	if (!bo->tile_flags) {
  		BEGIN_RING(chan, eng2d, mthd, 2);
  		OUT_RING  (chan, format);
  		OUT_RING  (chan, 1);
@@ -80,7 +76,7 @@ nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
  		BEGIN_RING(chan, eng2d, mthd, 5);
  		OUT_RING  (chan, format);
  		OUT_RING  (chan, 0);
- 		OUT_RING  (chan, 0);
+ 		OUT_RING  (chan, bo->tile_mode << 4);
  		OUT_RING  (chan, 1);
  		OUT_RING  (chan, 0);
  		BEGIN_RING(chan, eng2d, mthd + 0x18, 4);
@@ -108,7 +104,7 @@ nv50_surface_do_copy(struct nv50_screen *screen, struct pipe_surface *dst,
 		     int dx, int dy, struct pipe_surface *src, int sx, int sy,
 		     int w, int h)
 {
-	struct nouveau_channel *chan = screen->nvws->channel;
+	struct nouveau_channel *chan = screen->eng2d->channel;
 	struct nouveau_grobj *eng2d = screen->eng2d;
 	int ret;
 
@@ -165,7 +161,7 @@ nv50_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest,
 {
 	struct nv50_context *nv50 = (struct nv50_context *)pipe;
 	struct nv50_screen *screen = nv50->screen;
-	struct nouveau_channel *chan = screen->nvws->channel;
+	struct nouveau_channel *chan = screen->eng2d->channel;
 	struct nouveau_grobj *eng2d = screen->eng2d;
 	int format, ret;
 
diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c
index 223c8a3a45..ff40c2ad81 100644
--- a/src/gallium/drivers/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nv50/nv50_tex.c
@@ -26,7 +26,8 @@
 #include "nouveau/nouveau_stateobj.h"
 
 static int
-nv50_tex_construct(struct nouveau_stateobj *so, struct nv50_miptree *mt)
+nv50_tex_construct(struct nv50_context *nv50, struct nouveau_stateobj *so,
+		   struct nv50_miptree *mt, int unit)
 {
 	switch (mt->base.format) {
 	case PIPE_FORMAT_A8R8G8B8_UNORM:
@@ -117,15 +118,18 @@ nv50_tex_construct(struct nouveau_stateobj *so, struct nv50_miptree *mt)
 		return 1;
 	}
 
-	so_reloc(so, mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW |
+	so_reloc(so, mt->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW |
 		     NOUVEAU_BO_RD, 0, 0);
-	so_data (so, 0xd0005000);
+	if (nv50->sampler[unit]->normalized)
+		so_data (so, 0xd0005000 | mt->bo->tile_mode << 22);
+	else
+		so_data (so, 0x5001d000 | mt->bo->tile_mode << 22);
 	so_data (so, 0x00300000);
 	so_data (so, mt->base.width[0]);
-	so_data (so, (mt->base.depth[0] << 16) | mt->base.height[0]);
+	so_data (so, (mt->base.last_level << 28) |
+		     (mt->base.depth[0] << 16) | mt->base.height[0]);
 	so_data (so, 0x03000000);
-	so_reloc(so, mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH |
-		     NOUVEAU_BO_RD, 0, 0);
+	so_data (so, mt->base.last_level << 4);
 
 	return 0;
 }
@@ -135,23 +139,35 @@ nv50_tex_validate(struct nv50_context *nv50)
 {
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nouveau_stateobj *so;
-	int unit;
+	int unit, push;
+
+	push  = nv50->miptree_nr * 9 + 2;
+	push += MAX2(nv50->miptree_nr, nv50->state.miptree_nr) * 2;
 
-	so = so_new(nv50->miptree_nr * 8 + 3, nv50->miptree_nr * 2);
+	so = so_new(push, nv50->miptree_nr * 2);
 	so_method(so, tesla, 0x0f00, 1);
 	so_data  (so, NV50_CB_TIC);
-	so_method(so, tesla, 0x40000f04, nv50->miptree_nr * 8);
 	for (unit = 0; unit < nv50->miptree_nr; unit++) {
 		struct nv50_miptree *mt = nv50->miptree[unit];
 
-		if (nv50_tex_construct(so, mt)) {
+		so_method(so, tesla, 0x40000f04, 8);
+		if (nv50_tex_construct(nv50, so, mt, unit)) {
 			NOUVEAU_ERR("failed tex validate\n");
 			so_ref(NULL, &so);
 			return;
 		}
+
+		so_method(so, tesla, 0x1458, 1);
+		so_data  (so, (unit << 9) | (unit << 1) | 1);
+	}
+
+	for (; unit < nv50->state.miptree_nr; unit++) {
+		so_method(so, tesla, 0x1458, 1);
+		so_data  (so, (unit << 1) | 0);
 	}
 
 	so_ref(so, &nv50->state.tic_upload);
 	so_ref(NULL, &so);
+	nv50->state.miptree_nr = nv50->miptree_nr;
 }
 
diff --git a/src/gallium/drivers/nv50/nv50_transfer.c b/src/gallium/drivers/nv50/nv50_transfer.c
index 747195b4f6..d0b7f0bef4 100644
--- a/src/gallium/drivers/nv50/nv50_transfer.c
+++ b/src/gallium/drivers/nv50/nv50_transfer.c
@@ -6,8 +6,8 @@
 
 struct nv50_transfer {
 	struct pipe_transfer base;
-	struct pipe_buffer *buffer;
-	struct nv50_miptree_level *level;
+	struct nouveau_bo *bo;
+	unsigned level_offset;
 	int level_pitch;
 	int level_width;
 	int level_height;
@@ -16,51 +16,48 @@ struct nv50_transfer {
 };
 
 static void
-nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, struct pipe_buffer *src,
-			int src_pitch, int sx, int sy, int sw, int sh,
-			struct pipe_buffer *dst, int dst_pitch, int dx, int dy,
+nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, struct nouveau_bo *src_bo,
+			unsigned src_offset, int src_pitch, int sx, int sy,
+			int sw, int sh, struct nouveau_bo *dst_bo,
+			unsigned dst_offset, int dst_pitch, int dx, int dy,
 			int dw, int dh, int cpp, int width, int height,
 			unsigned src_reloc, unsigned dst_reloc)
 {
 	struct nv50_screen *screen = nv50_screen(pscreen);
-	struct nouveau_winsys *nvws = screen->nvws;
-	struct nouveau_channel *chan = nvws->channel;
+	struct nouveau_channel *chan = screen->m2mf->channel;
 	struct nouveau_grobj *m2mf = screen->m2mf;
-	struct nouveau_bo *src_bo = nvws->get_bo(src);
-	struct nouveau_bo *dst_bo = nvws->get_bo(dst);
-	unsigned src_offset = 0, dst_offset = 0;
 
 	src_reloc |= NOUVEAU_BO_RD;
 	dst_reloc |= NOUVEAU_BO_WR;
 
 	WAIT_RING (chan, 14);
 
-	if (!src_bo->tiled) {
+	if (!src_bo->tile_flags) {
 		BEGIN_RING(chan, m2mf, 0x0200, 1);
 		OUT_RING  (chan, 1);
 		BEGIN_RING(chan, m2mf, 0x0314, 1);
 		OUT_RING  (chan, src_pitch);
-		src_offset = (sy * src_pitch) + (sx * cpp);
+		src_offset += (sy * src_pitch) + (sx * cpp);
 	} else {
 		BEGIN_RING(chan, m2mf, 0x0200, 6);
 		OUT_RING  (chan, 0);
-		OUT_RING  (chan, 0);
+		OUT_RING  (chan, src_bo->tile_mode << 4);
 		OUT_RING  (chan, sw * cpp);
 		OUT_RING  (chan, sh);
 		OUT_RING  (chan, 1);
 		OUT_RING  (chan, 0);
 	}
 
-	if (!dst_bo->tiled) {
+	if (!dst_bo->tile_flags) {
 		BEGIN_RING(chan, m2mf, 0x021c, 1);
 		OUT_RING  (chan, 1);
 		BEGIN_RING(chan, m2mf, 0x0318, 1);
 		OUT_RING  (chan, dst_pitch);
-		dst_offset = (dy * dst_pitch) + (dx * cpp);
+		dst_offset += (dy * dst_pitch) + (dx * cpp);
 	} else {
 		BEGIN_RING(chan, m2mf, 0x021c, 6);
 		OUT_RING  (chan, 0);
-		OUT_RING  (chan, 0);
+		OUT_RING  (chan, dst_bo->tile_mode << 4);
 		OUT_RING  (chan, dw * cpp);
 		OUT_RING  (chan, dh);
 		OUT_RING  (chan, 1);
@@ -77,13 +74,13 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, struct pipe_buffer *src,
 		BEGIN_RING(chan, m2mf, 0x030c, 2);
 		OUT_RELOCl(chan, src_bo, src_offset, src_reloc);
 		OUT_RELOCl(chan, dst_bo, dst_offset, dst_reloc);
-		if (src_bo->tiled) {
+		if (src_bo->tile_flags) {
 			BEGIN_RING(chan, m2mf, 0x0218, 1);
 			OUT_RING  (chan, (dy << 16) | sx);
 		} else {
 			src_offset += (line_count * src_pitch);
 		}
-		if (dst_bo->tiled) {
+		if (dst_bo->tile_flags) {
 			BEGIN_RING(chan, m2mf, 0x0234, 1);
 			OUT_RING  (chan, (sy << 16) | dx);
 		} else {
@@ -108,10 +105,12 @@ nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		  enum pipe_transfer_usage usage,
 		  unsigned x, unsigned y, unsigned w, unsigned h)
 {
+	struct nouveau_device *dev = nouveau_screen(pscreen)->device;
 	struct nv50_miptree *mt = nv50_miptree(pt);
 	struct nv50_miptree_level *lvl = &mt->level[level];
 	struct nv50_transfer *tx;
 	unsigned image = 0;
+	int ret;
 
 	if (pt->target == PIPE_TEXTURE_CUBE)
 		image = face;
@@ -133,20 +132,24 @@ nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 	tx->base.stride = (w * pt->block.size);
 	tx->base.usage = usage;
 
-	tx->level = lvl;
 	tx->level_pitch = lvl->pitch;
 	tx->level_width = mt->base.width[level];
 	tx->level_height = mt->base.height[level];
+	tx->level_offset = lvl->image_offset[image];
 	tx->level_x = x;
 	tx->level_y = y;
-	tx->buffer =
-		pipe_buffer_create(pscreen, 0, NOUVEAU_BUFFER_USAGE_TRANSFER,
-				   w * tx->base.block.size * h);
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0,
+			     w * pt->block.size * h, &tx->bo);
+	if (ret) {
+		FREE(tx);
+		return NULL;
+	}
 
 	if (usage != PIPE_TRANSFER_WRITE) {
-		nv50_transfer_rect_m2mf(pscreen, mt->buffer, tx->level_pitch,
-					x, y, tx->level_width, tx->level_height,
-					tx->buffer, tx->base.stride, 0, 0,
+		nv50_transfer_rect_m2mf(pscreen, mt->bo, tx->level_offset,
+					tx->level_pitch, x, y, tx->level_width,
+					tx->level_height, tx->bo, 0,
+					tx->base.stride, 0, 0,
 					tx->base.width, tx->base.height,
 					tx->base.block.size, w, h,
 					NOUVEAU_BO_VRAM | NOUVEAU_BO_GART,
@@ -164,17 +167,18 @@ nv50_transfer_del(struct pipe_transfer *ptx)
 
 	if (ptx->usage != PIPE_TRANSFER_READ) {
 		struct pipe_screen *pscreen = ptx->texture->screen;
-		nv50_transfer_rect_m2mf(pscreen, tx->buffer, tx->base.stride,
+		nv50_transfer_rect_m2mf(pscreen, tx->bo, 0, tx->base.stride,
 					0, 0, tx->base.width, tx->base.height,
-					mt->buffer, tx->level_pitch,
-					tx->level_x, tx->level_y,
-					tx->level_width, tx->level_height,
-					tx->base.block.size, tx->base.width,
-					tx->base.height, NOUVEAU_BO_GART,
-					NOUVEAU_BO_VRAM | NOUVEAU_BO_GART);
+					mt->bo, tx->level_offset,
+					tx->level_pitch, tx->level_x,
+					tx->level_y, tx->level_width,
+					tx->level_height, tx->base.block.size,
+					tx->base.width, tx->base.height,
+					NOUVEAU_BO_GART, NOUVEAU_BO_VRAM |
+					NOUVEAU_BO_GART);
 	}
 
-	pipe_buffer_reference(&tx->buffer, NULL);
+	nouveau_bo_ref(NULL, &tx->bo);
 	pipe_texture_reference(&ptx->texture, NULL);
 	FREE(ptx);
 }
@@ -184,13 +188,17 @@ nv50_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 {
 	struct nv50_transfer *tx = (struct nv50_transfer *)ptx;
 	unsigned flags = 0;
+	int ret;
 
 	if (ptx->usage & PIPE_TRANSFER_WRITE)
-		flags |= PIPE_BUFFER_USAGE_CPU_WRITE;
+		flags |= NOUVEAU_BO_WR;
 	if (ptx->usage & PIPE_TRANSFER_READ)
-		flags |= PIPE_BUFFER_USAGE_CPU_READ;
+		flags |= NOUVEAU_BO_RD;
 
-	return pipe_buffer_map(pscreen, tx->buffer, flags);
+	ret = nouveau_bo_map(tx->bo, flags);
+	if (ret)
+		return NULL;
+	return tx->bo->map;
 }
 
 static void
@@ -198,7 +206,7 @@ nv50_transfer_unmap(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 {
 	struct nv50_transfer *tx = (struct nv50_transfer *)ptx;
 
-	pipe_buffer_unmap(pscreen, tx->buffer);
+	nouveau_bo_unmap(tx->bo);
 }
 
 void
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index 0749c90691..f81929f238 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -22,6 +22,7 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
+#include "pipe/p_inlines.h"
 
 #include "nv50_context.h"
 
@@ -53,7 +54,7 @@ nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
 		 unsigned count)
 {
 	struct nv50_context *nv50 = nv50_context(pipe);
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->tesla->channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 
 	nv50_state_validate(nv50);
@@ -83,7 +84,7 @@ static INLINE void
 nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map,
 			      unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->tesla->channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 
 	map += start;
@@ -112,7 +113,7 @@ static INLINE void
 nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map,
 			      unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->tesla->channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 
 	map += start;
@@ -141,7 +142,7 @@ static INLINE void
 nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint8_t *map,
 			      unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->tesla->channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 
 	map += start;
@@ -163,10 +164,12 @@ nv50_draw_elements(struct pipe_context *pipe,
 		   unsigned mode, unsigned start, unsigned count)
 {
 	struct nv50_context *nv50 = nv50_context(pipe);
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->tesla->channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
-	struct pipe_winsys *ws = pipe->winsys;
-	void *map = ws->buffer_map(ws, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ);
+	struct pipe_screen *pscreen = pipe->screen;
+	void *map;
+	
+	map = pipe_buffer_map(pscreen, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ);
 
 	nv50_state_validate(nv50);
 
@@ -193,6 +196,7 @@ nv50_draw_elements(struct pipe_context *pipe,
 	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
 	OUT_RING  (chan, 0);
 
+	pipe_buffer_unmap(pscreen, indexBuffer);
 	pipe->flush(pipe, 0, NULL);
 	return TRUE;
 }
@@ -212,6 +216,7 @@ nv50_vbo_validate(struct nv50_context *nv50)
 		struct pipe_vertex_element *ve = &nv50->vtxelt[i];
 		struct pipe_vertex_buffer *vb =
 			&nv50->vtxbuf[ve->vertex_buffer_index];
+		struct nouveau_bo *bo = nouveau_bo(vb->buffer);
 
 		switch (ve->src_format) {
 		case PIPE_FORMAT_R32G32B32A32_FLOAT:
@@ -240,10 +245,10 @@ nv50_vbo_validate(struct nv50_context *nv50)
 
 		so_method(vtxbuf, tesla, 0x900 + (i * 16), 3);
 		so_data  (vtxbuf, 0x20000000 | vb->stride);
-		so_reloc (vtxbuf, vb->buffer, vb->buffer_offset +
+		so_reloc (vtxbuf, bo, vb->buffer_offset +
 			  ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
 			  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
-		so_reloc (vtxbuf, vb->buffer, vb->buffer_offset +
+		so_reloc (vtxbuf, bo, vb->buffer_offset +
 			  ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
 			  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
 	}
diff --git a/src/gallium/drivers/r300/r300_chipset.c b/src/gallium/drivers/r300/r300_chipset.c
index 758f706c51..00fae8d26f 100644
--- a/src/gallium/drivers/r300/r300_chipset.c
+++ b/src/gallium/drivers/r300/r300_chipset.c
@@ -149,6 +149,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
             caps->num_vert_fpus = 6;
             break;
 
+        case 0x4B48:
         case 0x4B49:
         case 0x4B4A:
         case 0x4B4B:
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index 58f1fa0e2e..27bc7fd1a9 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -117,23 +117,24 @@ struct r300_viewport_state {
     uint32_t vte_control; /* R300_VAP_VTE_CNTL:      0x20b0 */
 };
 
-#define R300_NEW_BLEND           0x0000001
-#define R300_NEW_BLEND_COLOR     0x0000002
-#define R300_NEW_CONSTANTS       0x0000004
-#define R300_NEW_DSA             0x0000008
-#define R300_NEW_FRAMEBUFFERS    0x0000010
-#define R300_NEW_FRAGMENT_SHADER 0x0000020
-#define R300_NEW_RASTERIZER      0x0000040
-#define R300_NEW_RS_BLOCK        0x0000080
-#define R300_NEW_SAMPLER         0x0000100
-#define R300_ANY_NEW_SAMPLERS    0x000ff00
-#define R300_NEW_SCISSOR         0x0010000
-#define R300_NEW_TEXTURE         0x0020000
-#define R300_ANY_NEW_TEXTURES    0x1fe0000
-#define R300_NEW_VERTEX_FORMAT   0x2000000
-#define R300_NEW_VERTEX_SHADER   0x4000000
-#define R300_NEW_VIEWPORT        0x8000000
-#define R300_NEW_KITCHEN_SINK    0xfffffff
+#define R300_NEW_BLEND           0x00000001 /* flags tested against r300->dirty_state */
+#define R300_NEW_BLEND_COLOR     0x00000002
+#define R300_NEW_CLIP            0x00000004 /* set by r300_set_clip_state() */
+#define R300_NEW_CONSTANTS       0x00000008
+#define R300_NEW_DSA             0x00000010
+#define R300_NEW_FRAMEBUFFERS    0x00000020
+#define R300_NEW_FRAGMENT_SHADER 0x00000040
+#define R300_NEW_RASTERIZER      0x00000080
+#define R300_NEW_RS_BLOCK        0x00000100
+#define R300_NEW_SAMPLER         0x00000200 /* bit 9: lowest bit of R300_ANY_NEW_SAMPLERS */
+#define R300_ANY_NEW_SAMPLERS    0x0001fe00 /* bits 9-16; presumably one bit per sampler -- TODO confirm */
+#define R300_NEW_SCISSOR         0x00020000
+#define R300_NEW_TEXTURE         0x00040000 /* bit 18: lowest bit of R300_ANY_NEW_TEXTURES */
+#define R300_ANY_NEW_TEXTURES    0x03fc0000 /* bits 18-25; presumably one bit per texture -- TODO confirm */
+#define R300_NEW_VERTEX_FORMAT   0x04000000
+#define R300_NEW_VERTEX_SHADER   0x08000000
+#define R300_NEW_VIEWPORT        0x10000000
+#define R300_NEW_KITCHEN_SINK    0x1fffffff /* bits 0-28: every flag above */
 
 /* The next several objects are not pure Radeon state; they inherit from
  * various Gallium classes. */
@@ -141,11 +142,11 @@ struct r300_viewport_state {
 struct r300_constant_buffer {
     /* Buffer of constants */
     /* XXX first number should be raised */
-    float constants[8][4];
+    float constants[32][4];
     /* Number of user-defined constants */
-    int user_count;
+    unsigned user_count;
     /* Total number of constants */
-    int count;
+    unsigned count;
 };
 
 struct r3xx_fragment_shader {
@@ -158,6 +159,10 @@ struct r3xx_fragment_shader {
 
     /* Pixel stack size */
     int stack_size;
+
+    /* Are there immediates in this shader?
+     * If not, we can heavily optimize recompilation. */
+    boolean uses_imms;
 };
 
 struct r300_fragment_shader {
@@ -248,6 +253,10 @@ struct r300_vertex_shader {
     /* Has this shader been translated yet? */
     boolean translated;
 
+    /* Are there immediates in this shader?
+     * If not, we can heavily optimize recompilation. */
+    boolean uses_imms;
+
     /* Number of used instructions */
     int instruction_count;
 
@@ -284,6 +293,8 @@ struct r300_context {
     struct r300_blend_state* blend_state;
     /* Blend color state. */
     struct r300_blend_color_state* blend_color_state;
+    /* User clip planes. */
+    struct pipe_clip_state clip_state;
     /* Shader constants. */
     struct r300_constant_buffer shader_constants[PIPE_SHADER_TYPES];
     /* Depth, stencil, and alpha state. */
diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h
index 82a3942248..71b142c0db 100644
--- a/src/gallium/drivers/r300/r300_cs.h
+++ b/src/gallium/drivers/r300/r300_cs.h
@@ -34,6 +34,7 @@
 
 #define MAX_CS_SIZE 64 * 1024 / 4
 
+#define VERY_VERBOSE_CS 0
 #define VERY_VERBOSE_REGISTERS 0
 
 /* XXX stolen from radeon_drm.h */
@@ -56,8 +57,10 @@
 
 #define BEGIN_CS(size) do { \
     CHECK_CS(size); \
-    debug_printf("r300: BEGIN_CS, count %d, in %s (%s:%d)\n", \
-        size, __FUNCTION__, __FILE__, __LINE__); \
+    if (VERY_VERBOSE_CS) { \
+        debug_printf("r300: BEGIN_CS, count %d, in %s (%s:%d)\n", \
+                size, __FUNCTION__, __FILE__, __LINE__); \
+    } \
     cs_winsys->begin_cs(cs_winsys, (size), \
             __FILE__, __FUNCTION__, __LINE__); \
     cs_count = size; \
@@ -93,8 +96,9 @@
 } while (0)
 
 #define OUT_CS_RELOC(bo, offset, rd, wd, flags) do { \
-    debug_printf("r300: writing relocation for buffer %p, offset %d\n", \
-        bo, offset); \
+    debug_printf("r300: writing relocation for buffer %p, offset %d, " \
+            "domains (%d, %d, %d)\n", \
+        bo, offset, rd, wd, flags); \
     assert(bo); \
     OUT_CS(offset); \
     cs_winsys->write_cs_reloc(cs_winsys, bo, rd, wd, flags); \
@@ -102,16 +106,20 @@
 } while (0)
 
 #define END_CS do { \
-    debug_printf("r300: END_CS in %s (%s:%d)\n", __FUNCTION__, __FILE__, \
-        __LINE__); \
+    if (VERY_VERBOSE_CS) { \
+        debug_printf("r300: END_CS in %s (%s:%d)\n", __FUNCTION__, \
+                __FILE__, __LINE__); \
+    } \
     if (cs_count != 0) \
         debug_printf("r300: Warning: cs_count off by %d\n", cs_count); \
     cs_winsys->end_cs(cs_winsys, __FILE__, __FUNCTION__, __LINE__); \
 } while (0)
 
 #define FLUSH_CS do { \
-    debug_printf("r300: FLUSH_CS in %s (%s:%d)\n\n", __FUNCTION__, __FILE__, \
-        __LINE__); \
+    if (VERY_VERBOSE_CS) { \
+        debug_printf("r300: FLUSH_CS in %s (%s:%d)\n\n", __FUNCTION__, \
+                __FILE__, __LINE__); \
+    } \
     cs_winsys->flush_cs(cs_winsys); \
 } while (0)
 
diff --git a/src/gallium/drivers/r300/r300_debug.c b/src/gallium/drivers/r300/r300_debug.c
index ffc93eb591..678cd2b812 100644
--- a/src/gallium/drivers/r300/r300_debug.c
+++ b/src/gallium/drivers/r300/r300_debug.c
@@ -152,6 +152,8 @@ void r500_fs_dump(struct r500_fragment_shader* fs)
 
 static void r300_vs_op_dump(uint32_t op)
 {
+    debug_printf(" dst: %d%s op: ",
+            (op >> 13) & 0x7f, r300_vs_dst_debug[(op >> 8) & 0x7]);
     if (op & 0x80) {
         if (op & 0x1) {
             debug_printf("PVS_MACRO_OP_2CLK_M2X_ADD\n");
@@ -165,14 +167,32 @@ static void r300_vs_op_dump(uint32_t op)
     }
 }
 
+void r300_vs_src_dump(uint32_t src) /* print one decoded source word (inst1..inst3 in r300_vs_dump) */
+{
+    debug_printf(" reg: %d%s swiz: %s%s/%s%s/%s%s/%s%s\n",
+            (src >> 5) & 0x7f, r300_vs_src_debug[src & 0x3], /* bits 5-11: index; bits 0-1: suffix */
+            src & (1 << 25) ? "-" : " ", /* bit 25: "-" prefix, presumably negate -- TODO confirm */
+            r300_vs_swiz_debug[(src >> 13) & 0x7], /* bits 13-15: first selector */
+            src & (1 << 26) ? "-" : " ", /* bit 26: prefix for the second selector */
+            r300_vs_swiz_debug[(src >> 16) & 0x7], /* bits 16-18: second selector */
+            src & (1 << 27) ? "-" : " ", /* bit 27: prefix for the third selector */
+            r300_vs_swiz_debug[(src >> 19) & 0x7], /* bits 19-21: third selector */
+            src & (1 << 28) ? "-" : " ", /* bit 28: prefix for the fourth selector */
+            r300_vs_swiz_debug[(src >> 22) & 0x7]); /* bits 22-24: fourth selector */
+}
+
 void r300_vs_dump(struct r300_vertex_shader* vs)
 {
     int i;
 
     for (i = 0; i < vs->instruction_count; i++) {
+        debug_printf("%d: op: 0x%08x", i, vs->instructions[i].inst0);
         r300_vs_op_dump(vs->instructions[i].inst0);
-        debug_printf("inst1: 0x%x\n", vs->instructions[i].inst1);
-        debug_printf("inst2: 0x%x\n", vs->instructions[i].inst2);
-        debug_printf("inst3: 0x%x\n", vs->instructions[i].inst3);
+        debug_printf(" src0: 0x%08x", vs->instructions[i].inst1);
+        r300_vs_src_dump(vs->instructions[i].inst1);
+        debug_printf(" src1: 0x%08x", vs->instructions[i].inst2);
+        r300_vs_src_dump(vs->instructions[i].inst2);
+        debug_printf(" src2: 0x%08x", vs->instructions[i].inst3);
+        r300_vs_src_dump(vs->instructions[i].inst3);
     }
 }
diff --git a/src/gallium/drivers/r300/r300_debug.h b/src/gallium/drivers/r300/r300_debug.h
index 6306594099..c86410ec0a 100644
--- a/src/gallium/drivers/r300/r300_debug.h
+++ b/src/gallium/drivers/r300/r300_debug.h
@@ -173,6 +173,36 @@ static char* r300_vs_me_ops[] = {
     "               (reserved)",
 };
 
+/* XXX refactor to avoid clashing symbols */
+static char* r300_vs_src_debug[] = { /* suffix table indexed by (src & 0x3) in r300_vs_src_dump() */
+    "t", /* presumably "temporary" -- TODO confirm against the PVS docs */
+    "i", /* presumably "input" -- TODO confirm */
+    "c", /* presumably "constant" -- TODO confirm */
+    "a", /* presumably "address" -- TODO confirm */
+};
+
+static char* r300_vs_dst_debug[] = { /* suffix table indexed by ((op >> 8) & 0x7) in r300_vs_op_dump() */
+    "t", /* presumably "temporary" -- TODO confirm against the PVS docs */
+    "a0",
+    "o", /* presumably "output" -- TODO confirm */
+    "ox",
+    "a",
+    "i",
+    "u", /* last two entries presumably pad unused encodings of the 3-bit field -- TODO confirm */
+    "u",
+};
+
+static char* r300_vs_swiz_debug[] = { /* indexed by the 3-bit selector fields read in r300_vs_src_dump() */
+    "X",
+    "Y",
+    "Z",
+    "W",
+    "0", /* presumably a constant-zero selector -- TODO confirm */
+    "1", /* presumably a constant-one selector -- TODO confirm */
+    "U", /* last two entries presumably pad unused encodings of the 3-bit field -- TODO confirm */
+    "U",
+};
+
 void r500_fs_dump(struct r500_fragment_shader* fs);
 
 void r300_vs_dump(struct r300_vertex_shader* vs);
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index 0cb0507fc8..d81abe4d0b 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -56,6 +56,30 @@ void r300_emit_blend_color_state(struct r300_context* r300,
     }
 }
 
+void r300_emit_clip_state(struct r300_context* r300,
+                          struct pipe_clip_state* clip)
+{ /* NOTE(review): not gated on caps->has_tcl, unlike the VAP_CLIP_CNTL write dropped from r300_emit_invariant_state() -- confirm */
+    int i;
+    struct r300_screen* r300screen = r300_screen(r300->context.screen);
+    CS_LOCALS(r300);
+    /* Upload all six clip->ucp planes (four floats each), then write R300_VAP_CLIP_CNTL. */
+    BEGIN_CS(5 + (6 * 4)); /* 24 plane floats + 5; presumably 2 + 1 + 2 dwords for the register writes -- TODO confirm */
+    OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG,
+            (r300screen->caps->is_r500 ?
+             R500_PVS_UCP_START : R300_PVS_UCP_START));
+    OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, 6 * 4);
+    for (i = 0; i < 6; i++) { /* always six planes, independent of clip->nr */
+        OUT_CS_32F(clip->ucp[i][0]);
+        OUT_CS_32F(clip->ucp[i][1]);
+        OUT_CS_32F(clip->ucp[i][2]);
+        OUT_CS_32F(clip->ucp[i][3]);
+    }
+    /* (1 << clip->nr) - 1 is a mask with the low clip->nr bits set. */
+    OUT_CS_REG(R300_VAP_CLIP_CNTL, ((1 << clip->nr) - 1) |
+            R300_PS_UCP_MODE_CLIP_AS_TRIFAN);
+    END_CS;
+}
+
 void r300_emit_dsa_state(struct r300_context* r300,
                            struct r300_dsa_state* dsa)
 {
@@ -372,17 +396,22 @@ void r300_emit_vertex_shader(struct r300_context* r300,
     }
 
     if (constants->count) {
-        BEGIN_CS(16 + (vs->instruction_count * 4) + (constants->count * 4));
+        BEGIN_CS(14 + (vs->instruction_count * 4) + (constants->count * 4));
     } else {
-        BEGIN_CS(13 + (vs->instruction_count * 4) + (constants->count * 4));
+        BEGIN_CS(11 + (vs->instruction_count * 4));
     }
 
-    OUT_CS_REG(R300_VAP_PVS_CODE_CNTL_0, R300_PVS_FIRST_INST(0) |
+    /* R300_VAP_PVS_CODE_CNTL_0
+     * R300_VAP_PVS_CONST_CNTL
+     * R300_VAP_PVS_CODE_CNTL_1
+     * See the r5xx docs for instructions on how to use these.
+     * XXX these could be optimized to select better values... */
+    OUT_CS_REG_SEQ(R300_VAP_PVS_CODE_CNTL_0, 3);
+    OUT_CS(R300_PVS_FIRST_INST(0) |
+            R300_PVS_XYZW_VALID_INST(vs->instruction_count - 1) |
             R300_PVS_LAST_INST(vs->instruction_count - 1));
-    OUT_CS_REG(R300_VAP_PVS_CODE_CNTL_1, vs->instruction_count - 1);
-
-    /* XXX */
-    OUT_CS_REG(R300_VAP_PVS_CONST_CNTL, 0x0);
+    OUT_CS(R300_PVS_MAX_CONST_ADDR(constants->count - 1));
+    OUT_CS(vs->instruction_count - 1);
 
     OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG, 0);
     OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, vs->instruction_count * 4);
@@ -412,7 +441,6 @@ void r300_emit_vertex_shader(struct r300_context* r300,
             R300_PVS_VF_MAX_VTX_NUM(12));
     OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0x0);
     END_CS;
-
 }
 
 void r300_emit_viewport_state(struct r300_context* r300,
@@ -452,8 +480,8 @@ void r300_emit_dirty_state(struct r300_context* r300)
 {
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
     struct r300_texture* tex;
-    int i;
-    int dirty_tex = 0;
+    int i, dirty_tex = 0;
+    boolean invalid = FALSE;
 
     if (!(r300->dirty_state)) {
         return;
@@ -462,38 +490,55 @@ void r300_emit_dirty_state(struct r300_context* r300)
     r300_update_derived_state(r300);
 
     /* XXX check size */
+validate:
     /* Color buffers... */
     for (i = 0; i < r300->framebuffer_state.nr_cbufs; i++) {
         tex = (struct r300_texture*)r300->framebuffer_state.cbufs[i]->texture;
         assert(tex && tex->buffer && "cbuf is marked, but NULL!");
-        r300->winsys->add_buffer(r300->winsys, tex->buffer,
-                0, RADEON_GEM_DOMAIN_VRAM);
+        if (!r300->winsys->add_buffer(r300->winsys, tex->buffer,
+                    0, RADEON_GEM_DOMAIN_VRAM)) {
+            r300->context.flush(&r300->context, 0, NULL);
+            goto validate;
+        }
     }
     /* ...depth buffer... */
     if (r300->framebuffer_state.zsbuf) {
         tex = (struct r300_texture*)r300->framebuffer_state.zsbuf->texture;
         assert(tex && tex->buffer && "zsbuf is marked, but NULL!");
-        r300->winsys->add_buffer(r300->winsys, tex->buffer,
-                0, RADEON_GEM_DOMAIN_VRAM);
+        if (!r300->winsys->add_buffer(r300->winsys, tex->buffer,
+                    0, RADEON_GEM_DOMAIN_VRAM)) {
+            r300->context.flush(&r300->context, 0, NULL);
+            goto validate;
+        }
     }
     /* ...textures... */
     for (i = 0; i < r300->texture_count; i++) {
         tex = r300->textures[i];
         assert(tex && tex->buffer && "texture is marked, but NULL!");
-        r300->winsys->add_buffer(r300->winsys, tex->buffer,
-                RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
+        if (!r300->winsys->add_buffer(r300->winsys, tex->buffer,
+                    RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0)) {
+            r300->context.flush(&r300->context, 0, NULL);
+            goto validate;
+        }
     }
     /* ...and vertex buffer. */
     if (r300->vbo) {
-        r300->winsys->add_buffer(r300->winsys, r300->vbo,
-                RADEON_GEM_DOMAIN_GTT, 0);
+        if (!r300->winsys->add_buffer(r300->winsys, r300->vbo,
+                    RADEON_GEM_DOMAIN_GTT, 0)) {
+            r300->context.flush(&r300->context, 0, NULL);
+            goto validate;
+        }
     } else {
         debug_printf("No VBO while emitting dirty state!\n");
     }
-
     if (r300->winsys->validate(r300->winsys)) {
-        /* XXX */
         r300->context.flush(&r300->context, 0, NULL);
+        if (invalid) {
+            /* Well, hell. */
+            exit(1);
+        }
+        invalid = TRUE;
+        goto validate;
     }
 
     if (r300->dirty_state & R300_NEW_BLEND) {
@@ -506,6 +551,11 @@ void r300_emit_dirty_state(struct r300_context* r300)
         r300->dirty_state &= ~R300_NEW_BLEND_COLOR;
     }
 
+    if (r300->dirty_state & R300_NEW_CLIP) {
+        r300_emit_clip_state(r300, &r300->clip_state);
+        r300->dirty_state &= ~R300_NEW_CLIP;
+    }
+
     if (r300->dirty_state & R300_NEW_DSA) {
         r300_emit_dsa_state(r300, r300->dsa_state);
         r300->dirty_state &= ~R300_NEW_DSA;
diff --git a/src/gallium/drivers/r300/r300_emit.h b/src/gallium/drivers/r300/r300_emit.h
index 36e14f69f7..946f625bd8 100644
--- a/src/gallium/drivers/r300/r300_emit.h
+++ b/src/gallium/drivers/r300/r300_emit.h
@@ -36,6 +36,9 @@ void r300_emit_blend_state(struct r300_context* r300,
 void r300_emit_blend_color_state(struct r300_context* r300,
                                  struct r300_blend_color_state* bc);
 
+void r300_emit_clip_state(struct r300_context* r300,
+                          struct pipe_clip_state* clip);
+
 void r300_emit_dsa_state(struct r300_context* r300,
                          struct r300_dsa_state* dsa);
 
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index 660816e1da..3bb9bc47b5 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -511,11 +511,13 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_PVS_XYZW_VALID_INST_SHIFT    10
 #       define R300_PVS_LAST_INST_SHIFT          20
 #       define R300_PVS_FIRST_INST(x)            ((x) << 0)
+#       define R300_PVS_XYZW_VALID_INST(x)       ((x) << 10)
 #       define R300_PVS_LAST_INST(x)             ((x) << 20)
 /* Addresses are relative the the vertex program parameters area. */
 #define R300_VAP_PVS_CONST_CNTL             0x22D4
 #       define R300_PVS_CONST_BASE_OFFSET_SHIFT  0
 #       define R300_PVS_MAX_CONST_ADDR_SHIFT     16
+#       define R300_PVS_MAX_CONST_ADDR(x)        ((x) << 16)
 #define R300_VAP_PVS_CODE_CNTL_1	    0x22D8
 #       define R300_PVS_LAST_VTX_SRC_INST_SHIFT  0
 #define R300_VAP_PVS_FLOW_CNTL_OPC          0x22DC
@@ -3040,6 +3042,7 @@ enum {
 #   define R500_INST_RGB_WMASK_R			(1 << 11)
 #   define R500_INST_RGB_WMASK_G			(1 << 12)
 #   define R500_INST_RGB_WMASK_B			(1 << 13)
+#   define R500_INST_RGB_WMASK_RGB			(7 << 11)
 #   define R500_INST_ALPHA_WMASK			(1 << 14)
 #   define R500_INST_RGB_OMASK_R			(1 << 15)
 #   define R500_INST_RGB_OMASK_G			(1 << 16)
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 29b66cee7e..cd458d019a 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -45,11 +45,7 @@ struct r300_render {
 
     /* VBO */
     struct pipe_buffer* vbo;
-    size_t vbo_size;
-    size_t vbo_offset;
-    void* vbo_map;
     size_t vbo_alloc_size;
-    size_t vbo_max_used;
 };
 
 static INLINE struct r300_render*
@@ -78,24 +74,21 @@ static boolean r300_render_allocate_vertices(struct vbuf_render* render,
     struct pipe_screen* screen = r300->context.screen;
     size_t size = (size_t)vertex_size * (size_t)count;
 
-    if (r300render->vbo) {
+    if (r300render->vbo && (size > r300render->vbo_alloc_size)) {
         pipe_buffer_reference(&r300render->vbo, NULL);
     }
+    
+    if (!r300render->vbo) {
+        r300render->vbo = pipe_buffer_create(screen,
+                                             64,
+                                             PIPE_BUFFER_USAGE_VERTEX,
+                                             size);
+    }
 
-    r300render->vbo_size = MAX2(size, r300render->vbo_alloc_size);
-    r300render->vbo_offset = 0;
-    r300render->vbo = pipe_buffer_create(screen,
-                                         64,
-                                         PIPE_BUFFER_USAGE_VERTEX,
-                                         r300render->vbo_size);
-
+    r300render->vbo_alloc_size = MAX2(size, r300render->vbo_alloc_size);
     r300render->vertex_size = vertex_size;
 
-    if (r300render->vbo) {
-        return TRUE;
-    } else {
-        return FALSE;
-    }
+    return (r300render->vbo) ? TRUE : FALSE;
 }
 
 static void* r300_render_map_vertices(struct vbuf_render* render)
@@ -103,10 +96,8 @@ static void* r300_render_map_vertices(struct vbuf_render* render)
     struct r300_render* r300render = r300_render(render);
     struct pipe_screen* screen = r300render->r300->context.screen;
 
-    r300render->vbo_map = pipe_buffer_map(screen, r300render->vbo,
-                                          PIPE_BUFFER_USAGE_CPU_WRITE);
-
-    return (unsigned char*)r300render->vbo_map + r300render->vbo_offset;
+    return (unsigned char*)pipe_buffer_map(screen, r300render->vbo,
+                                           PIPE_BUFFER_USAGE_CPU_WRITE);
 }
 
 static void r300_render_unmap_vertices(struct vbuf_render* render,
@@ -116,9 +107,6 @@ static void r300_render_unmap_vertices(struct vbuf_render* render,
     struct r300_render* r300render = r300_render(render);
     struct pipe_screen* screen = r300render->r300->context.screen;
 
-    r300render->vbo_max_used = MAX2(r300render->vbo_max_used,
-             r300render->vertex_size * (max + 1));
-
     pipe_buffer_unmap(screen, r300render->vbo);
 }
 
@@ -181,7 +169,6 @@ static void prepare_render(struct r300_render* render, unsigned count)
     CS_LOCALS(r300);
 
     r300->vbo = render->vbo;
-    r300->vbo_offset = render->vbo_offset;
 
     r300_emit_dirty_state(r300);
 }
@@ -195,8 +182,6 @@ static void r300_render_draw_arrays(struct vbuf_render* render,
 
     CS_LOCALS(r300);
 
-    r300render->vbo_offset = start;
-
     prepare_render(r300render, count);
 
     debug_printf("r300: Doing vbuf render, count %d\n", count);
@@ -231,13 +216,14 @@ static void r300_render_draw(struct vbuf_render* render,
         return;
     }
 
+/*
     index_map = pipe_buffer_map(screen, index_buffer,
                                 PIPE_BUFFER_USAGE_CPU_WRITE);
     memcpy(index_map, indices, count);
     pipe_buffer_unmap(screen, index_buffer);
 
     debug_printf("r300: Doing indexbuf render, count %d\n", count);
-/*
+
     BEGIN_CS(8);
     OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, 0);
     OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
@@ -247,13 +233,15 @@ static void r300_render_draw(struct vbuf_render* render,
     OUT_CS_INDEX_RELOC(index_buffer, 0, count, RADEON_GEM_DOMAIN_GTT, 0, 0);
     END_CS; */
 
-    BEGIN_CS(2 + count);
-    OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, count);
+    BEGIN_CS(2 + (count+1)/2);
+    OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, (count+1)/2);
     OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
-           r300render->hwprim | R300_VAP_VF_CNTL__INDEX_SIZE_32bit);
-    for (i = 0; i < count; i++) {
-        index = indices[i];
-        OUT_CS(index);
+           r300render->hwprim);
+    for (i = 0; i < count-1; i += 2) {
+        OUT_CS(indices[i+1] << 16 | indices[i]);
+    }
+    if (count % 2) {
+        OUT_CS(indices[count-1]);
     }
     END_CS;
 }
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index 4e65fbbabe..29e721984f 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -119,9 +119,10 @@ static void r300_set_clip_state(struct pipe_context* pipe,
                                 const struct pipe_clip_state* state)
 {
     struct r300_context* r300 = r300_context(pipe);
-    /* XXX add HW TCL clipping setup */
-    draw_flush(r300->draw);
-    draw_set_clip_state(r300->draw, state);
+
+    r300->clip_state = *state;
+
+    r300->dirty_state |= R300_NEW_CLIP;
 }
 
 static void
@@ -151,10 +152,12 @@ static void
 
     /* If the number of constants have changed, invalidate the shader. */
     if (r300->shader_constants[shader].user_count != i) {
-        if (shader == PIPE_SHADER_FRAGMENT && r300->fs) {
+        if (shader == PIPE_SHADER_FRAGMENT && r300->fs &&
+                r300->fs->uses_imms) {
             r300->fs->translated = FALSE;
             r300_translate_fragment_shader(r300, r300->fs);
-        } else if (shader == PIPE_SHADER_VERTEX && r300->vs) {
+        } else if (shader == PIPE_SHADER_VERTEX && r300->vs &&
+                r300->vs->uses_imms) {
             r300->vs->translated = FALSE;
             r300_translate_vertex_shader(r300, r300->vs);
         }
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index caa5f3b543..2477b30822 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -64,6 +64,7 @@ static void r300_vs_tab_routes(struct r300_context* r300,
                     break;
                 case TGSI_SEMANTIC_FOG:
                     fog = TRUE;
+                    /* Fall through */
                 case TGSI_SEMANTIC_GENERIC:
                     texs++;
                     break;
@@ -103,6 +104,9 @@ static void r300_vs_tab_routes(struct r300_context* r300,
         }
     }
 
+    /* XXX magic */
+    assert(texs <= 8);
+
     /* Do the actual vertex_info setup.
      *
      * vertex_info has four uints of hardware-specific data in it.
@@ -140,21 +144,32 @@ static void r300_vs_tab_routes(struct r300_context* r300,
         vinfo->hwfmt[2] |= (R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT << i);
     }
 
-    for (i = 0; i < texs; i++) {
+    /* Init i right here, increment it if fog is enabled.
+     * This gets around a double-increment problem. */
+    i = 0;
+
+    if (fog) {
+        i++;
         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE,
-            draw_find_vs_output(r300->draw, TGSI_SEMANTIC_GENERIC, i));
+            draw_find_vs_output(r300->draw, TGSI_SEMANTIC_FOG, 0));
         vinfo->hwfmt[1] |= (R300_INPUT_CNTL_TC0 << i);
         vinfo->hwfmt[3] |= (4 << (3 * i));
     }
 
-    if (fog) {
-        i++;
+    for (i; i < texs; i++) {
         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE,
-            draw_find_vs_output(r300->draw, TGSI_SEMANTIC_FOG, 0));
+            draw_find_vs_output(r300->draw, TGSI_SEMANTIC_GENERIC, i));
         vinfo->hwfmt[1] |= (R300_INPUT_CNTL_TC0 << i);
         vinfo->hwfmt[3] |= (4 << (3 * i));
     }
 
+    /* Handle the case where the vertex shader will be generating some of
+     * the attribs based on its inputs. */
+    if (r300screen->caps->has_tcl &&
+            info->num_inputs < info->num_outputs) {
+        vinfo->num_attribs = info->num_inputs;
+    }
+
     draw_compute_vertex_size(vinfo);
 }
 
diff --git a/src/gallium/drivers/r300/r300_state_invariant.c b/src/gallium/drivers/r300/r300_state_invariant.c
index 9dde662802..60eff08f2e 100644
--- a/src/gallium/drivers/r300/r300_state_invariant.c
+++ b/src/gallium/drivers/r300/r300_state_invariant.c
@@ -69,7 +69,7 @@ void r300_emit_invariant_state(struct r300_context* r300)
     END_CS;
 
     /* XXX unsorted stuff from surface_fill */
-    BEGIN_CS(77 + (caps->has_tcl ? 7 : 0));
+    BEGIN_CS(75 + (caps->has_tcl ? 7 : 0));
     /* Flush PVS. */
     OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0x0);
 
@@ -80,8 +80,6 @@ void r300_emit_invariant_state(struct r300_context* r300)
     /* XXX endian */
     if (caps->has_tcl) {
         OUT_CS_REG(R300_VAP_CNTL_STATUS, R300_VC_NO_SWAP);
-        OUT_CS_REG(R300_VAP_CLIP_CNTL, R300_CLIP_DISABLE |
-            R300_PS_UCP_MODE_CLIP_AS_TRIFAN);
         OUT_CS_REG_SEQ(R300_VAP_GB_VERT_CLIP_ADJ, 4);
         OUT_CS_32F(1.0);
         OUT_CS_32F(1.0);
diff --git a/src/gallium/drivers/r300/r300_state_shader.c b/src/gallium/drivers/r300/r300_state_shader.c
index ed99c76c15..cc7f6a7c4b 100644
--- a/src/gallium/drivers/r300/r300_state_shader.c
+++ b/src/gallium/drivers/r300/r300_state_shader.c
@@ -22,24 +22,6 @@
 
 #include "r300_state_shader.h"
 
-static void r300_copy_passthrough_shader(struct r300_fragment_shader* fs)
-{
-    struct r300_fragment_shader* pt = &r300_passthrough_fragment_shader;
-    fs->shader.stack_size = pt->shader.stack_size;
-    fs->alu_instruction_count = pt->alu_instruction_count;
-    fs->tex_instruction_count = pt->tex_instruction_count;
-    fs->indirections = pt->indirections;
-    fs->instructions[0] = pt->instructions[0];
-}
-
-static void r500_copy_passthrough_shader(struct r500_fragment_shader* fs)
-{
-    struct r500_fragment_shader* pt = &r500_passthrough_fragment_shader;
-    fs->shader.stack_size = pt->shader.stack_size;
-    fs->instruction_count = pt->instruction_count;
-    fs->instructions[0] = pt->instructions[0];
-}
-
 static void r300_fs_declare(struct r300_fs_asm* assembler,
                             struct tgsi_full_declaration* decl)
 {
@@ -49,6 +31,7 @@ static void r300_fs_declare(struct r300_fs_asm* assembler,
                 case TGSI_SEMANTIC_COLOR:
                     assembler->color_count++;
                     break;
+                case TGSI_SEMANTIC_FOG:
                 case TGSI_SEMANTIC_GENERIC:
                     assembler->tex_count++;
                     break;
@@ -323,9 +306,12 @@ static INLINE void r300_emit_maths(struct r300_fragment_shader* fs,
 }
 
 /* Setup an ALU operation. */
-static INLINE void r500_emit_alu(struct r500_fragment_shader* fs,
-                                 struct r300_fs_asm* assembler,
-                                 struct tgsi_full_dst_register* dst)
+static INLINE void r500_emit_maths(struct r500_fragment_shader* fs,
+                                   struct r300_fs_asm* assembler,
+                                   struct tgsi_full_src_register* src,
+                                   struct tgsi_full_dst_register* dst,
+                                   unsigned op,
+                                   unsigned count)
 {
     int i = fs->instruction_count;
 
@@ -348,18 +334,6 @@ static INLINE void r500_emit_alu(struct r500_fragment_shader* fs,
         R500_ALPHA_ADDRD(r300_fs_dst(assembler, &dst->DstRegister));
     fs->instructions[i].inst5 =
         R500_ALU_RGBA_ADDRD(r300_fs_dst(assembler, &dst->DstRegister));
-}
-
-static INLINE void r500_emit_maths(struct r500_fragment_shader* fs,
-                                   struct r300_fs_asm* assembler,
-                                   struct tgsi_full_src_register* src,
-                                   struct tgsi_full_dst_register* dst,
-                                   unsigned op,
-                                   unsigned count)
-{
-    int i = fs->instruction_count;
-
-    r500_emit_alu(fs, assembler, dst);
 
     switch (count) {
         case 3:
@@ -381,8 +355,8 @@ static INLINE void r500_emit_maths(struct r500_fragment_shader* fs,
                 R500_ALU_RGB_SEL_B_SRC1 |
                 R500_SWIZ_RGB_B(r500_rgb_swiz(&src[1]));
             fs->instructions[i].inst4 |=
-                R500_SWIZ_ALPHA_B(r500_alpha_swiz(&src[1])) |
-                R500_ALPHA_SEL_B_SRC1;
+                R500_ALPHA_SEL_B_SRC1 |
+                R500_SWIZ_ALPHA_B(r500_alpha_swiz(&src[1]));
         case 1:
         case 0:
         default:
@@ -394,8 +368,8 @@ static INLINE void r500_emit_maths(struct r500_fragment_shader* fs,
                 R500_ALU_RGB_SEL_A_SRC0 |
                 R500_SWIZ_RGB_A(r500_rgb_swiz(&src[0]));
             fs->instructions[i].inst4 |=
-                R500_SWIZ_ALPHA_A(r500_alpha_swiz(&src[0])) |
-                R500_ALPHA_SEL_A_SRC0;
+                R500_ALPHA_SEL_A_SRC0 |
+                R500_SWIZ_ALPHA_A(r500_alpha_swiz(&src[0]));
             break;
     }
 
@@ -565,6 +539,60 @@ static void r500_fs_instruction(struct r500_fragment_shader* fs,
                     &inst->FullDstRegisters[0], inst->Instruction.Opcode, 3);
             break;
 
+        /* The compound and hybrid insts. */
+        case TGSI_OPCODE_LRP:
+            /* LRP DST A, B, C -> MAD TMP -A, C, C; MAD DST A, B, TMP */
+            inst->FullSrcRegisters[3] = inst->FullSrcRegisters[1];
+            inst->FullSrcRegisters[1] = inst->FullSrcRegisters[2];
+            inst->FullSrcRegisters[0].SrcRegister.Negate =
+                !(inst->FullSrcRegisters[0].SrcRegister.Negate);
+            inst->FullDstRegisters[1] = inst->FullDstRegisters[0];
+            inst->FullDstRegisters[0].DstRegister.Index =
+                assembler->temp_count;
+            inst->FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY;
+            r500_emit_maths(fs, assembler, inst->FullSrcRegisters,
+                    &inst->FullDstRegisters[0], TGSI_OPCODE_MAD, 3);
+            inst->FullSrcRegisters[2].SrcRegister.Index =
+                assembler->temp_count;
+            inst->FullSrcRegisters[2].SrcRegister.File = TGSI_FILE_TEMPORARY;
+            inst->FullSrcRegisters[2].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+            inst->FullSrcRegisters[2].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
+            inst->FullSrcRegisters[2].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Z;
+            inst->FullSrcRegisters[2].SrcRegister.SwizzleW = TGSI_SWIZZLE_W;
+            inst->FullSrcRegisters[1] = inst->FullSrcRegisters[3];
+            inst->FullSrcRegisters[0].SrcRegister.Negate =
+                !(inst->FullSrcRegisters[0].SrcRegister.Negate);
+            inst->FullDstRegisters[0] = inst->FullDstRegisters[1];
+            r500_emit_maths(fs, assembler, inst->FullSrcRegisters,
+                    &inst->FullDstRegisters[0], TGSI_OPCODE_MAD, 3);
+            break;
+        case TGSI_OPCODE_POW:
+            /* POW DST A, B -> LG2 TMP A; MUL TMP TMP, B; EX2 DST TMP */
+            inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtSwizzleW =
+                inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtSwizzleX;
+            inst->FullSrcRegisters[0].SrcRegister.SwizzleW =
+                inst->FullSrcRegisters[0].SrcRegister.SwizzleX;
+            inst->FullDstRegisters[1] = inst->FullDstRegisters[0];
+            inst->FullDstRegisters[0].DstRegister.Index =
+                assembler->temp_count;
+            inst->FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY;
+            r500_emit_maths(fs, assembler, inst->FullSrcRegisters,
+                    &inst->FullDstRegisters[0], TGSI_OPCODE_LG2, 1);
+            inst->FullSrcRegisters[0].SrcRegister.Index =
+                assembler->temp_count;
+            inst->FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY;
+            inst->FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+            inst->FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
+            inst->FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Z;
+            inst->FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_W;
+            inst->FullSrcRegisters[2] = r500_constant_zero;
+            r500_emit_maths(fs, assembler, inst->FullSrcRegisters,
+                    &inst->FullDstRegisters[0], TGSI_OPCODE_MUL, 3);
+            inst->FullDstRegisters[0] = inst->FullDstRegisters[1];
+            r500_emit_maths(fs, assembler, inst->FullSrcRegisters,
+                    &inst->FullDstRegisters[0], TGSI_OPCODE_EX2, 1);
+            break;
+
         /* The texture instruction set. */
         case TGSI_OPCODE_KIL:
         case TGSI_OPCODE_TEX:
@@ -593,7 +621,7 @@ static void r500_fs_instruction(struct r500_fragment_shader* fs,
 static void r300_fs_finalize(struct r3xx_fragment_shader* fs,
                              struct r300_fs_asm* assembler)
 {
-    fs->stack_size = assembler->temp_count + assembler->temp_offset;
+    fs->stack_size = assembler->temp_count + assembler->temp_offset + 1;
 }
 
 static void r500_fs_finalize(struct r500_fragment_shader* fs,
@@ -670,6 +698,7 @@ void r300_translate_fragment_shader(struct r300_context* r300,
             assembler->tex_count + assembler->color_count);
 
     consts->count = consts->user_count + assembler->imm_count;
+    fs->uses_imms = assembler->imm_count;
     debug_printf("r300: fs: %d total constants, "
             "%d from user and %d from immediates\n", consts->count,
             consts->user_count, assembler->imm_count);
diff --git a/src/gallium/drivers/r300/r300_state_shader.h b/src/gallium/drivers/r300/r300_state_shader.h
index 06260e61fe..b6087404ce 100644
--- a/src/gallium/drivers/r300/r300_state_shader.h
+++ b/src/gallium/drivers/r300/r300_state_shader.h
@@ -181,7 +181,7 @@ static struct r500_fragment_shader r500_texture_fragment_shader = {
     .instruction_count = 2,
     .instructions[0].inst0 = R500_INST_TYPE_TEX |
         R500_INST_TEX_SEM_WAIT |
-        R500_INST_RGB_OMASK_RGB | R500_INST_ALPHA_OMASK |
+        R500_INST_RGB_WMASK_RGB | R500_INST_ALPHA_WMASK |
         R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP,
     .instructions[0].inst1 = R500_TEX_ID(0) | R500_TEX_INST_LD |
         R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED,
diff --git a/src/gallium/drivers/r300/r300_state_tcl.c b/src/gallium/drivers/r300/r300_state_tcl.c
index fdbcbf3db8..8cf8250425 100644
--- a/src/gallium/drivers/r300/r300_state_tcl.c
+++ b/src/gallium/drivers/r300/r300_state_tcl.c
@@ -144,6 +144,7 @@ static uint32_t r300_vs_op(unsigned op)
             return R300_VE_MULTIPLY;
         case TGSI_OPCODE_ADD:
         case TGSI_OPCODE_MOV:
+        case TGSI_OPCODE_SUB:
         case TGSI_OPCODE_SWZ:
             return R300_VE_ADD;
         case TGSI_OPCODE_MAX:
@@ -163,12 +164,14 @@ static uint32_t r300_vs_op(unsigned op)
 static uint32_t r300_vs_swiz(struct tgsi_full_src_register* reg)
 {
     if (reg->SrcRegister.Extended) {
-        return reg->SrcRegisterExtSwz.ExtSwizzleX |
+        return (reg->SrcRegister.Negate ? (0xf << 12) : 0) |
+            reg->SrcRegisterExtSwz.ExtSwizzleX |
             (reg->SrcRegisterExtSwz.ExtSwizzleY << 3) |
             (reg->SrcRegisterExtSwz.ExtSwizzleZ << 6) |
             (reg->SrcRegisterExtSwz.ExtSwizzleW << 9);
     } else {
-        return reg->SrcRegister.SwizzleX |
+        return (reg->SrcRegister.Negate ? (0xf << 12) : 0) |
+            reg->SrcRegister.SwizzleX |
             (reg->SrcRegister.SwizzleY << 3) |
             (reg->SrcRegister.SwizzleZ << 6) |
             (reg->SrcRegister.SwizzleW << 9);
@@ -179,12 +182,14 @@ static uint32_t r300_vs_swiz(struct tgsi_full_src_register* reg)
 static uint32_t r300_vs_scalar_swiz(struct tgsi_full_src_register* reg)
 {
     if (reg->SrcRegister.Extended) {
-        return reg->SrcRegisterExtSwz.ExtSwizzleX |
+        return (reg->SrcRegister.Negate ? (0xf << 12) : 0) |
+            reg->SrcRegisterExtSwz.ExtSwizzleX |
             (reg->SrcRegisterExtSwz.ExtSwizzleX << 3) |
             (reg->SrcRegisterExtSwz.ExtSwizzleX << 6) |
             (reg->SrcRegisterExtSwz.ExtSwizzleX << 9);
     } else {
-        return reg->SrcRegister.SwizzleX |
+        return (reg->SrcRegister.Negate ? (0xf << 12) : 0) |
+            reg->SrcRegister.SwizzleX |
             (reg->SrcRegister.SwizzleX << 3) |
             (reg->SrcRegister.SwizzleX << 6) |
             (reg->SrcRegister.SwizzleX << 9);
@@ -246,6 +251,10 @@ static void r300_vs_instruction(struct r300_vertex_shader* vs,
                     &inst->FullDstRegisters[0], inst->Instruction.Opcode,
                     1, TRUE);
             break;
+        case TGSI_OPCODE_SUB:
+            inst->FullSrcRegisters[1].SrcRegister.Negate =
+                !inst->FullSrcRegisters[1].SrcRegister.Negate;
+            /* Fall through */
         case TGSI_OPCODE_ADD:
         case TGSI_OPCODE_MUL:
         case TGSI_OPCODE_MAX:
@@ -386,6 +395,7 @@ void r300_translate_vertex_shader(struct r300_context* r300,
             assembler->tex_count + assembler->color_count);
 
     consts->count = consts->user_count + assembler->imm_count;
+    vs->uses_imms = assembler->imm_count;
     debug_printf("r300: vs: %d total constants, "
             "%d from user and %d from immediates\n", consts->count,
             consts->user_count, assembler->imm_count);
diff --git a/src/gallium/drivers/r300/r300_state_tcl.h b/src/gallium/drivers/r300/r300_state_tcl.h
index d5d425e9d6..2c8b586c2f 100644
--- a/src/gallium/drivers/r300/r300_state_tcl.h
+++ b/src/gallium/drivers/r300/r300_state_tcl.h
@@ -76,6 +76,13 @@
     ((R300_PVS_SRC_SELECT_FORCE_1 | (R300_PVS_SRC_SELECT_FORCE_1 << 3) | \
      (R300_PVS_SRC_SELECT_FORCE_1 << 6) | \
       (R300_PVS_SRC_SELECT_FORCE_1 << 9)) << 13)
+#define R300_PVS_MODIFIER_X        (1 << 25)
+#define R300_PVS_MODIFIER_Y        (1 << 26)
+#define R300_PVS_MODIFIER_Z        (1 << 27)
+#define R300_PVS_MODIFIER_W        (1 << 28)
+#define R300_PVS_NEGATE_XYZW \
+    (R300_PVS_MODIFIER_X | R300_PVS_MODIFIER_Y | \
+     R300_PVS_MODIFIER_Z | R300_PVS_MODIFIER_W)
 
 static const struct tgsi_full_src_register r300_constant_zero = {
     .SrcRegister.Extended = TRUE,
diff --git a/src/gallium/drivers/r300/r300_surface.c b/src/gallium/drivers/r300/r300_surface.c
index acb6192492..c9e2dff14e 100644
--- a/src/gallium/drivers/r300/r300_surface.c
+++ b/src/gallium/drivers/r300/r300_surface.c
@@ -32,13 +32,6 @@ static void r300_surface_setup(struct r300_context* r300,
     unsigned pixpitch = dest->stride / dest->tex.block.size;
     CS_LOCALS(r300);
 
-    /* Make sure our target BO is okay. */
-    r300->winsys->add_buffer(r300->winsys, dest->buffer,
-            0, RADEON_GEM_DOMAIN_VRAM);
-    if (r300->winsys->validate(r300->winsys)) {
-        r300->context.flush(&r300->context, 0, NULL);
-    }
-
     r300_emit_blend_state(r300, &blend_clear_state);
     r300_emit_blend_color_state(r300, &blend_color_clear_state);
     r300_emit_dsa_state(r300, &dsa_clear_state);
@@ -106,6 +99,7 @@ static void r300_surface_fill(struct pipe_context* pipe,
     struct r300_capabilities* caps = r300_screen(pipe->screen)->caps;
     struct r300_texture* tex = (struct r300_texture*)dest->texture;
     unsigned pixpitch = tex->stride / tex->tex.block.size;
+    boolean invalid = FALSE;
     CS_LOCALS(r300);
 
     a = (float)((color >> 24) & 0xff) / 255.0f;
@@ -118,11 +112,28 @@ static void r300_surface_fill(struct pipe_context* pipe,
 
     /* Fallback? */
     if (FALSE) {
+fallback:
         debug_printf("r300: Falling back on surface clear...");
         util_surface_fill(pipe, dest, x, y, w, h, color);
         return;
     }
 
+    /* Make sure our target BO is okay. */
+validate:
+    if (!r300->winsys->add_buffer(r300->winsys, tex->buffer,
+                0, RADEON_GEM_DOMAIN_VRAM)) {
+        r300->context.flush(&r300->context, 0, NULL);
+        goto validate;
+    }
+    if (r300->winsys->validate(r300->winsys)) {
+        r300->context.flush(&r300->context, 0, NULL);
+        if (invalid) {
+            goto fallback;
+        }
+        invalid = TRUE;
+        goto validate;
+    }
+
     r300_surface_setup(r300, tex, x, y, w, h);
 
     /* Vertex shader setup */
@@ -216,6 +227,7 @@ static void r300_surface_copy(struct pipe_context* pipe,
     struct r300_texture* srctex = (struct r300_texture*)src->texture;
     struct r300_texture* desttex = (struct r300_texture*)dest->texture;
     unsigned pixpitch = srctex->stride / srctex->tex.block.size;
+    boolean invalid = FALSE;
     CS_LOCALS(r300);
 
     debug_printf("r300: Copying surface %p at (%d,%d) to %p at (%d, %d),"
@@ -225,21 +237,44 @@ static void r300_surface_copy(struct pipe_context* pipe,
     if ((srctex == desttex) &&
             ((destx < srcx + w) || (srcx < destx + w)) &&
             ((desty < srcy + h) || (srcy < desty + h))) {
+fallback:
         debug_printf("r300: Falling back on surface_copy\n");
         util_surface_copy(pipe, FALSE, dest, destx, desty, src,
                 srcx, srcy, w, h);
     }
 
-    /* Add our source texture to the BO list before emitting anything.
-     * r300_surface_setup will flush if needed for us. */
-    r300->winsys->add_buffer(r300->winsys, srctex->buffer,
-            RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
+    /* Add our target BOs to the list. */
+validate:
+    if (!r300->winsys->add_buffer(r300->winsys, srctex->buffer,
+                RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0)) {
+        r300->context.flush(&r300->context, 0, NULL);
+        goto validate;
+    }
+    if (!r300->winsys->add_buffer(r300->winsys, desttex->buffer,
+                0, RADEON_GEM_DOMAIN_VRAM)) {
+        r300->context.flush(&r300->context, 0, NULL);
+        goto validate;
+    }
+    if (r300->winsys->validate(r300->winsys)) {
+        r300->context.flush(&r300->context, 0, NULL);
+        if (invalid) {
+            goto fallback;
+        }
+        invalid = TRUE;
+        goto validate;
+    }
 
     r300_surface_setup(r300, desttex, destx, desty, w, h);
 
+    /* Setup the texture. */
     r300_emit_sampler(r300, &r300_sampler_copy_state, 0);
     r300_emit_texture(r300, srctex, 0);
-    r300_flush_textures(r300);
+
+    /* Flush and enable. */
+    BEGIN_CS(4);
+    OUT_CS_REG(R300_TX_INVALTAGS, 0);
+    OUT_CS_REG(R300_TX_ENABLE, 0x1);
+    END_CS;
 
     /* Vertex shader setup */
     if (caps->has_tcl) {
@@ -263,7 +298,7 @@ static void r300_surface_copy(struct pipe_context* pipe,
         r300_emit_rs_block_state(r300, &r300_rs_block_copy_state);
     }
 
-    BEGIN_CS(28);
+    BEGIN_CS(30);
     /* VAP stream control, mapping from input memory to PVS/RS memory */
     if (caps->has_tcl) {
         OUT_CS_REG(R300_VAP_PROG_STREAM_CNTL_0,
@@ -287,7 +322,7 @@ static void r300_surface_copy(struct pipe_context* pipe,
     OUT_CS_REG(R300_VAP_OUTPUT_VTX_FMT_1, 0x2);
 
     /* Vertex size. */
-    OUT_CS_REG(R300_VAP_VTX_SIZE, 0x8);
+    OUT_CS_REG(R300_VAP_VTX_SIZE, 0x4);
 
     /* Packet3 with our texcoords */
     OUT_CS_PKT3(R200_3D_DRAW_IMMD_2, 16);
diff --git a/src/gallium/drivers/r300/r300_surface.h b/src/gallium/drivers/r300/r300_surface.h
index 894def07aa..9a4c39f58b 100644
--- a/src/gallium/drivers/r300/r300_surface.h
+++ b/src/gallium/drivers/r300/r300_surface.h
@@ -101,7 +101,7 @@ static struct r300_rs_block r300_rs_block_copy_state = {
         R500_RS_SEL_Q(R300_RS_SEL_K1),
     .inst[0] = R300_RS_INST_COL_CN_WRITE,
     .count = R300_IT_COUNT(2) | R300_IC_COUNT(0) | R300_HIRES_EN,
-    .inst_count = R300_RS_TX_OFFSET(6),
+    .inst_count = R300_RS_TX_OFFSET(0),
 };
 
 static struct r300_rs_block r500_rs_block_copy_state = {
@@ -111,7 +111,7 @@ static struct r300_rs_block r500_rs_block_copy_state = {
         R500_RS_SEL_Q(R500_RS_IP_PTR_K1),
     .inst[0] = R500_RS_INST_TEX_CN_WRITE,
     .count = R300_IT_COUNT(2) | R300_IC_COUNT(0) | R300_HIRES_EN,
-    .inst_count = R300_RS_TX_OFFSET(6),
+    .inst_count = R300_RS_TX_OFFSET(0),
 };
 
 static struct r300_sampler_state r300_sampler_copy_state = {
diff --git a/src/gallium/drivers/r300/r300_winsys.h b/src/gallium/drivers/r300/r300_winsys.h
index a5ced8041c..d2893c3b9d 100644
--- a/src/gallium/drivers/r300/r300_winsys.h
+++ b/src/gallium/drivers/r300/r300_winsys.h
@@ -52,10 +52,10 @@ struct r300_winsys {
     uint32_t vram_size;
 
     /* Add a pipe_buffer to the list of buffer objects to validate. */
-    void (*add_buffer)(struct r300_winsys* winsys,
-                       struct pipe_buffer* pbuffer,
-                       uint32_t rd,
-                       uint32_t wd);
+    boolean (*add_buffer)(struct r300_winsys* winsys,
+                          struct pipe_buffer* pbuffer,
+                          uint32_t rd,
+                          uint32_t wd);
 
     /* Revalidate all currently setup pipe_buffers.
      * Returns TRUE if a flush is required. */
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index eef6e5806c..06725fd09b 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -236,7 +236,6 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       }
       break;
 
-
    case PIPE_PRIM_TRIANGLES:
       for (i = 2; i < nr; i += 3) {
          setup_tri( setup_ctx,
@@ -256,7 +255,6 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_FAN:
-   case PIPE_PRIM_POLYGON:
       for (i = 2; i < nr; i += 1) {
          setup_tri( setup_ctx,
                     get_vert(vertex_buffer, indices[0], stride),
@@ -264,6 +262,7 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
                     get_vert(vertex_buffer, indices[i-0], stride));
       }
       break;
+
    case PIPE_PRIM_QUADS:
       for (i = 3; i < nr; i += 4) {
          setup_tri( setup_ctx,
@@ -277,6 +276,7 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
                     get_vert(vertex_buffer, indices[i-0], stride));
       }
       break;
+
    case PIPE_PRIM_QUAD_STRIP:
       for (i = 3; i < nr; i += 2) {
          setup_tri( setup_ctx,
@@ -290,6 +290,16 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
                     get_vert(vertex_buffer, indices[i-0], stride));
       }
       break;
+
+   case PIPE_PRIM_POLYGON: /* like a fan, but polygon vertex 0 is passed last */
+      for (i = 2; i < nr; i += 1) {
+         setup_tri( setup_ctx,
+                    get_vert(vertex_buffer, indices[i-1], stride),
+                    get_vert(vertex_buffer, indices[i-0], stride),
+                    get_vert(vertex_buffer, indices[0], stride));
+      }
+      break;
+
    default:
       assert(0);
    }
@@ -378,7 +388,6 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_FAN:
-   case PIPE_PRIM_POLYGON:
       for (i = 2; i < nr; i += 1) {
          setup_tri( setup_ctx,
                     get_vert(vertex_buffer, 0, stride),
@@ -386,6 +395,7 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
                     get_vert(vertex_buffer, i-0, stride));
       }
       break;
+
    case PIPE_PRIM_QUADS:
       for (i = 3; i < nr; i += 4) {
          setup_tri( setup_ctx,
@@ -412,6 +422,20 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
                     get_vert(vertex_buffer, i-0, stride));
       }
       break;
+
+   case PIPE_PRIM_POLYGON:
+      /* Almost same as tri fan but the _first_ vertex specifies the flat
+       * shading color.  Note that the first polygon vertex is passed as
+       * the last triangle vertex here.
+       */
+      for (i = 2; i < nr; i += 1) {
+         setup_tri( setup_ctx,
+                    get_vert(vertex_buffer, i-1, stride),
+                    get_vert(vertex_buffer, i-0, stride),
+                    get_vert(vertex_buffer, 0, stride));
+      }
+      break;
+
    default:
       assert(0);
    }
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index a32fd3a1ba..ce6d8ebd12 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -82,11 +82,11 @@ softpipe_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_TEXTURE_SHADOW_MAP:
       return 1;
    case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
-      return 12; /* max 2Kx2K */
+      return 13; /* max 4Kx4K */
    case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
       return 8;  /* max 128x128x128 */
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
-      return 12; /* max 2Kx2K */
+      return 13; /* max 4Kx4K */
    default:
       return 0;
    }
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index accc692b66..c6844a2649 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -504,6 +504,8 @@ static void print_vertex(const struct setup_context *setup,
 #endif
 
 /**
+ * Sort the vertices from top to bottom order, setting up the triangle
+ * edge fields (ebot, emaj, etop).
  * \return FALSE if coords are inf/nan (cull the tri), TRUE otherwise
  */
 static boolean setup_sort_vertices( struct setup_context *setup,
diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c
index 9e19745889..7a533dad9f 100644
--- a/src/gallium/drivers/softpipe/sp_texture.c
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -42,6 +42,7 @@
 #include "sp_texture.h"
 #include "sp_tile_cache.h"
 #include "sp_screen.h"
+#include "sp_winsys.h"
 
 
 /* Simple, maximally packed layout.
diff --git a/src/gallium/drivers/trace/Makefile b/src/gallium/drivers/trace/Makefile
index ecb69fb996..4aeb8e3d7e 100644
--- a/src/gallium/drivers/trace/Makefile
+++ b/src/gallium/drivers/trace/Makefile
@@ -10,6 +10,7 @@ C_SOURCES = \
 	tr_dump_state.c \
 	tr_screen.c \
 	tr_state.c \
+	tr_rbug.c \
 	tr_texture.c
 
 include ../../Makefile.template
diff --git a/src/gallium/drivers/trace/README b/src/gallium/drivers/trace/README
index 73dce20372..1000c31e49 100644
--- a/src/gallium/drivers/trace/README
+++ b/src/gallium/drivers/trace/README
@@ -3,7 +3,8 @@
 
 = About =
 
-This directory contains a Gallium3D pipe driver which traces all incoming calls.
+This directory contains a Gallium3D debugger pipe driver.
+It can trace all incoming calls and/or provide remote debugging functionality.
 
 
 = Build Instructions =
@@ -23,7 +24,9 @@ ensure the right libGL.so is being picked by doing
 
  ldd progs/trivial/tri 
 
-and then try running
+== Tracing ==
+
+For tracing, do
 
  export XMESA_TRACE=y
  GALLIUM_TRACE=tri.trace progs/trivial/tri
@@ -32,6 +35,16 @@ which should create a tri.trace file, which is an XML file. You can view copying
 trace.xsl to the same directory, and opening with a XSLT capable browser such as 
 Firefox or Internet Explorer.
 
+== Remote debugging ==
+
+For remote debugging
+
+ export XMESA_TRACE=y
+ GALLIUM_RBUG=true progs/trivial/tri
+
+which should open a Gallium remote debugging session. While the program is running,
+you can launch the small remote debugging application from progs/rbug. More
+information is in that directory.
 
 = Integrating =
 
@@ -62,3 +75,4 @@ trace_screen with real_screen when creating them.
 
 --
 Jose Fonseca <jrfonseca@tungstengraphics.com>
+Jakob Bornecrantz <jakob@vmware.com>
diff --git a/src/gallium/drivers/trace/SConscript b/src/gallium/drivers/trace/SConscript
index 9b5af0d86f..e635fed77d 100644
--- a/src/gallium/drivers/trace/SConscript
+++ b/src/gallium/drivers/trace/SConscript
@@ -11,6 +11,7 @@ trace = env.ConvenienceLibrary(
         'tr_dump_state.c',
         'tr_screen.c',
         'tr_state.c',
+        'tr_rbug.c',
         'tr_texture.c',
     ])
 
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index 2ad5ca4998..dd5cca58dd 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -113,6 +113,32 @@ trace_context_set_edgeflags(struct pipe_context *_pipe,
 }
 
 
+static INLINE void
+trace_context_draw_block(struct trace_context *tr_ctx, int flag)
+{
+   pipe_mutex_lock(tr_ctx->draw_mutex);
+   /* If draw_blocker has this flag set, mark the flag as blocked and notify rbug. */
+   if (tr_ctx->draw_blocker & flag) {
+      tr_ctx->draw_blocked |= flag;
+
+      trace_rbug_notify_draw_blocked(tr_ctx);
+   }
+
+   /* Wait until the flag is no longer set in draw_blocked (rbug is expected to clear it). */
+   while (tr_ctx->draw_blocked & flag) {
+      tr_ctx->draw_blocked |= flag;
+#ifdef PIPE_THREAD_HAVE_CONDVAR
+      pipe_condvar_wait(tr_ctx->draw_cond, tr_ctx->draw_mutex);
+#else
+      pipe_mutex_unlock(tr_ctx->draw_mutex);
+      /* TODO: sleep or use a condition variable; this path just drops and re-takes the mutex. */
+      pipe_mutex_lock(tr_ctx->draw_mutex);
+#endif
+   }
+
+   pipe_mutex_unlock(tr_ctx->draw_mutex);
+}
+
 static INLINE boolean
 trace_context_draw_arrays(struct pipe_context *_pipe,
                           unsigned mode, unsigned start, unsigned count)
@@ -124,6 +150,8 @@ trace_context_draw_arrays(struct pipe_context *_pipe,
    if (tr_ctx->curr.fs->disabled || tr_ctx->curr.vs->disabled)
       return 0;
 
+   trace_context_draw_block(tr_ctx, 1);
+
    trace_dump_call_begin("pipe_context", "draw_arrays");
 
    trace_dump_arg(ptr, pipe);
@@ -137,6 +165,8 @@ trace_context_draw_arrays(struct pipe_context *_pipe,
 
    trace_dump_call_end();
 
+   trace_context_draw_block(tr_ctx, 2);
+
    return result;
 }
 
@@ -156,6 +186,8 @@ trace_context_draw_elements(struct pipe_context *_pipe,
    if (tr_ctx->curr.fs->disabled || tr_ctx->curr.vs->disabled)
       return 0;
 
+   trace_context_draw_block(tr_ctx, 1);
+
    trace_screen_user_buffer_update(_pipe->screen, indexBuffer);
 
    trace_dump_call_begin("pipe_context", "draw_elements");
@@ -173,6 +205,8 @@ trace_context_draw_elements(struct pipe_context *_pipe,
 
    trace_dump_call_end();
 
+   trace_context_draw_block(tr_ctx, 2);
+
    return result;
 }
 
@@ -196,6 +230,8 @@ trace_context_draw_range_elements(struct pipe_context *_pipe,
    if (tr_ctx->curr.fs->disabled || tr_ctx->curr.vs->disabled)
       return 0;
 
+   trace_context_draw_block(tr_ctx, 1);
+
    trace_screen_user_buffer_update(_pipe->screen, indexBuffer);
 
    trace_dump_call_begin("pipe_context", "draw_range_elements");
@@ -218,6 +254,8 @@ trace_context_draw_range_elements(struct pipe_context *_pipe,
 
    trace_dump_call_end();
 
+   trace_context_draw_block(tr_ctx, 2);
+
    return result;
 }
 
@@ -782,6 +820,19 @@ trace_context_set_framebuffer_state(struct pipe_context *_pipe,
    struct pipe_framebuffer_state unwrapped_state;
    unsigned i;
 
+   {
+      tr_ctx->curr.nr_cbufs = state->nr_cbufs;
+      for (i = 0; i < state->nr_cbufs; i++)
+         if (state->cbufs[i])
+            tr_ctx->curr.cbufs[i] = trace_texture(state->cbufs[i]->texture);
+         else
+            tr_ctx->curr.cbufs[i] = NULL;
+      if (state->zsbuf)
+         tr_ctx->curr.zsbuf = trace_texture(state->zsbuf->texture);
+      else
+         tr_ctx->curr.zsbuf = NULL;
+   }
+
    /* Unwrap the input state */
    memcpy(&unwrapped_state, state, sizeof(unwrapped_state));
    for(i = 0; i < state->nr_cbufs; ++i)
@@ -1113,6 +1164,12 @@ trace_is_buffer_referenced( struct pipe_context *_pipe,
    return referenced;
 }
 
+static const struct debug_named_value rbug_blocker_flags[] = { /* names for the RBUG_BLOCK option */
+   {"before", 1}, /* flag the draw wrappers pass to trace_context_draw_block() on entry */
+   {"after", 2},  /* flag the draw wrappers pass to trace_context_draw_block() before returning */
+   {NULL, 0},     /* presumably the end-of-table sentinel -- confirm against debug_get_flags_option */
+};
+
 struct pipe_context *
 trace_context_create(struct pipe_screen *_screen,
                      struct pipe_context *pipe)
@@ -1134,6 +1191,11 @@ trace_context_create(struct pipe_screen *_screen,
    if(!tr_ctx)
       goto error1;
 
+   tr_ctx->draw_blocker = debug_get_flags_option("RBUG_BLOCK",
+                                                 rbug_blocker_flags,
+                                                 0);
+   pipe_mutex_init(tr_ctx->draw_mutex);
+   pipe_condvar_init(tr_ctx->draw_cond);
    pipe_mutex_init(tr_ctx->list_mutex);
    make_empty_list(&tr_ctx->shaders);
 
diff --git a/src/gallium/drivers/trace/tr_context.h b/src/gallium/drivers/trace/tr_context.h
index 86827f97b2..0c2bf27689 100644
--- a/src/gallium/drivers/trace/tr_context.h
+++ b/src/gallium/drivers/trace/tr_context.h
@@ -50,8 +50,17 @@ struct trace_context
    struct {
       struct trace_shader *fs;
       struct trace_shader *vs;
+
+      unsigned nr_cbufs;
+      struct trace_texture *cbufs[PIPE_MAX_COLOR_BUFS];
+      struct trace_texture *zsbuf;
    } curr;
 
+   pipe_condvar draw_cond;
+   pipe_mutex draw_mutex;
+   int draw_blocker;
+   int draw_blocked;
+
    /* for list on screen */
    struct tr_list list;
 
@@ -75,6 +84,9 @@ struct pipe_context *
 trace_context_create(struct pipe_screen *screen,
                      struct pipe_context *pipe);
 
+void
+trace_rbug_notify_draw_blocked(struct trace_context *tr_ctx);
+
 
 #ifdef __cplusplus
 }
diff --git a/src/gallium/drivers/trace/tr_rbug.c b/src/gallium/drivers/trace/tr_rbug.c
new file mode 100644
index 0000000000..e2de108009
--- /dev/null
+++ b/src/gallium/drivers/trace/tr_rbug.c
@@ -0,0 +1,811 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+#include "util/u_network.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "tr_dump.h"
+#include "tr_state.h"
+#include "tr_buffer.h"
+#include "tr_texture.h"
+
+#include "rbug/rbug.h"
+
+#include <errno.h>
+
+#if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+#  define sleep Sleep /* sleep(x) in this file expands to Sleep(x) */
+#elif defined(PIPE_OS_LINUX)
+void usleep(int); /* local prototype for usleep */
+#  define sleep usleep /* NOTE(review): sleep(x) is usleep(x) here, so x is not in seconds - confirm the intended delay */
+#else
+#  warning "No socket implementation"
+#endif
+
+#define U642VOID(x) ((void *)(uintptr_t)(x)) /* 64-bit rbug handle -> pointer, via a pointer-sized integer */
+#define VOID2U64(x) ((uint64_t)(uintptr_t)(x)) /* pointer -> 64-bit rbug handle, without truncation on LLP64 */
+
+struct trace_rbug /* state of the remote-debugging server started for a trace screen */
+{
+   struct trace_screen *tr_scr; /* screen whose object lists are served */
+   struct rbug_connection *con; /* connection being served; reset to NULL after disconnect */
+   pipe_thread thread;          /* handle returned by pipe_thread_create() */
+   boolean running;             /* the server loops run while this is set */
+};
+
+PIPE_THREAD_ROUTINE(trace_rbug_thread, void_tr_rbug); /* forward declaration; the routine is defined further down */
+
+
+/**********************************************************
+ * Helper functions
+ */
+
+
+/* Look up the context whose handle is ctx on the screen's context list.
+ * Returns NULL when no entry matches; callers here hold tr_scr->list_mutex. */
+static struct trace_context *
+trace_rbug_get_context_locked(struct trace_screen *tr_scr, rbug_context_t ctx)
+{
+   struct tr_list *node;
+
+   foreach(node, &tr_scr->contexts) {
+      struct trace_context *candidate =
+         (struct trace_context *)((char*)node - offsetof(struct trace_context, list));
+      if (VOID2U64(candidate) == ctx)
+         return candidate;
+   }
+   return NULL;
+}
+
+/* Look up the shader whose handle is shdr on the context's shader list.
+ * Returns NULL when no entry matches; callers here hold tr_ctx->list_mutex. */
+static struct trace_shader *
+trace_rbug_get_shader_locked(struct trace_context *tr_ctx, rbug_shader_t shdr)
+{
+   struct tr_list *node;
+
+   foreach(node, &tr_ctx->shaders) {
+      struct trace_shader *candidate =
+         (struct trace_shader *)((char*)node - offsetof(struct trace_shader, list));
+      if (VOID2U64(candidate) == shdr)
+         return candidate;
+   }
+   return NULL;
+}
+
+/* Create driver state for tokens via the entry point matching tr_shdr->type. */
+static void *
+trace_shader_create_locked(struct pipe_context *pipe,
+                           struct trace_shader *tr_shdr,
+                           struct tgsi_token *tokens)
+{
+   struct pipe_shader_state templ = { 0 };
+
+   templ.tokens = tokens;
+   if (tr_shdr->type == TRACE_SHADER_FRAGMENT)
+      return pipe->create_fs_state(pipe, &templ);
+   if (tr_shdr->type == TRACE_SHADER_VERTEX)
+      return pipe->create_vs_state(pipe, &templ);
+
+   /* neither fragment nor vertex: nothing is created */
+   assert(0);
+   return NULL;
+}
+
+/* Bind state through the fs or vs entry point selected by tr_shdr->type. */
+static void
+trace_shader_bind_locked(struct pipe_context *pipe,
+                         struct trace_shader *tr_shdr, void *state)
+{
+   if (tr_shdr->type == TRACE_SHADER_VERTEX)
+      pipe->bind_vs_state(pipe, state);
+   else if (tr_shdr->type == TRACE_SHADER_FRAGMENT)
+      pipe->bind_fs_state(pipe, state);
+   else
+      assert(0);
+}
+
+/* Delete state through the fs or vs entry point selected by tr_shdr->type. */
+static void
+trace_shader_delete_locked(struct pipe_context *pipe,
+                           struct trace_shader *tr_shdr, void *state)
+{
+   if (tr_shdr->type == TRACE_SHADER_VERTEX)
+      pipe->delete_vs_state(pipe, state);
+   else if (tr_shdr->type == TRACE_SHADER_FRAGMENT)
+      pipe->delete_fs_state(pipe, state);
+   else
+      assert(0);
+}
+
+/************************************************
+ * Request handler functions
+ */
+
+
+/* Send the handle of every texture on the screen's list; -ENOMEM if the handle array cannot be allocated. */
+static int
+trace_rbug_texture_list(struct trace_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct trace_screen *tr_scr = tr_rbug->tr_scr;
+   struct tr_list *ptr;
+   rbug_texture_t *texs;
+   int i = 0;
+
+   pipe_mutex_lock(tr_scr->list_mutex);
+   texs = MALLOC(tr_scr->num_textures * sizeof(rbug_texture_t));
+   if (!texs && tr_scr->num_textures > 0) {
+      pipe_mutex_unlock(tr_scr->list_mutex);
+      return -ENOMEM; /* the dispatcher turns a non-zero result into an error reply */
+   }
+   foreach(ptr, &tr_scr->textures)
+      texs[i++] = VOID2U64((char*)ptr - offsetof(struct trace_texture, list));
+   pipe_mutex_unlock(tr_scr->list_mutex);
+   rbug_send_texture_list_reply(tr_rbug->con, serial, texs, i, NULL);
+   FREE(texs);
+   return 0;
+}
+
+/* Reply with the description of the texture named in the request.
+ * Returns 0, or -ESRCH when the handle is not on the screen's texture list. */
+static int
+trace_rbug_texture_info(struct trace_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct trace_screen *tr_scr = tr_rbug->tr_scr;
+   struct trace_texture *tr_tex = NULL; /* must read as NULL when the list is empty */
+   struct rbug_proto_texture_info *gpti = (struct rbug_proto_texture_info *)header;
+   struct tr_list *ptr;
+   struct pipe_texture *t;
+
+   pipe_mutex_lock(tr_scr->list_mutex);
+   foreach(ptr, &tr_scr->textures) {
+      tr_tex = (struct trace_texture *)((char*)ptr - offsetof(struct trace_texture, list));
+      if (gpti->texture == VOID2U64(tr_tex))
+         break;
+      tr_tex = NULL;
+   }
+   if (!tr_tex) {
+      pipe_mutex_unlock(tr_scr->list_mutex);
+      return -ESRCH;
+   }
+
+   t = tr_tex->texture;
+   rbug_send_texture_info_reply(tr_rbug->con, serial,
+                               t->target, t->format,
+                               t->width, t->last_level + 1,
+                               t->height, t->last_level + 1,
+                               t->depth, t->last_level + 1,
+                               t->block.width, t->block.height, t->block.size,
+                               t->last_level,
+                               t->nr_samples,
+                               t->tex_usage,
+                               NULL);
+
+   pipe_mutex_unlock(tr_scr->list_mutex);
+   return 0;
+}
+
+/* Map the requested region of a texture and send its contents to the client.
+ * Returns 0 on success, -ESRCH for an unknown texture handle, or -EINVAL
+ * when the transfer cannot be created or mapped. */
+static int
+trace_rbug_texture_read(struct trace_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_texture_read *gptr = (struct rbug_proto_texture_read *)header;
+   struct trace_screen *tr_scr = tr_rbug->tr_scr;
+   struct trace_texture *tr_tex = NULL; /* must read as NULL when the list is empty */
+   struct tr_list *ptr;
+
+   struct pipe_screen *screen = tr_scr->screen;
+   struct pipe_transfer *t = NULL;
+   void *map;
+   int ret = -ESRCH;
+
+   pipe_mutex_lock(tr_scr->list_mutex);
+   foreach(ptr, &tr_scr->textures) {
+      tr_tex = (struct trace_texture *)((char*)ptr - offsetof(struct trace_texture, list));
+      if (gptr->texture == VOID2U64(tr_tex))
+         break;
+      tr_tex = NULL;
+   }
+   if (!tr_tex)
+      goto out;
+
+   ret = -EINVAL;
+   t = screen->get_tex_transfer(tr_scr->screen, tr_tex->texture,
+                                gptr->face, gptr->level, gptr->zslice,
+                                PIPE_TRANSFER_READ,
+                                gptr->x, gptr->y, gptr->w, gptr->h);
+   map = t ? screen->transfer_map(screen, t) : NULL;
+   if (!map)
+      goto out;
+
+   rbug_send_texture_read_reply(tr_rbug->con, serial,
+                                t->format,
+                                t->block.width, t->block.height, t->block.size,
+                                (uint8_t*)map, t->stride * t->nblocksy,
+                                t->stride,
+                                NULL);
+   screen->transfer_unmap(screen, t);
+   ret = 0;
+
+out:
+   if (t)
+      screen->tex_transfer_destroy(t);
+   pipe_mutex_unlock(tr_scr->list_mutex);
+   return ret;
+}
+
+/* Send the handle of every context on the screen's list; -ENOMEM if the handle array cannot be allocated. */
+static int
+trace_rbug_context_list(struct trace_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct trace_screen *tr_scr = tr_rbug->tr_scr;
+   struct tr_list *ptr;
+   rbug_context_t *ctxs;
+   int i = 0;
+
+   pipe_mutex_lock(tr_scr->list_mutex);
+   ctxs = MALLOC(tr_scr->num_contexts * sizeof(rbug_context_t));
+   if (!ctxs && tr_scr->num_contexts > 0) {
+      pipe_mutex_unlock(tr_scr->list_mutex);
+      return -ENOMEM; /* the dispatcher turns a non-zero result into an error reply */
+   }
+   foreach(ptr, &tr_scr->contexts)
+      ctxs[i++] = VOID2U64((char*)ptr - offsetof(struct trace_context, list));
+   pipe_mutex_unlock(tr_scr->list_mutex);
+   rbug_send_context_list_reply(tr_rbug->con, serial, ctxs, i, NULL);
+   FREE(ctxs);
+   return 0;
+}
+
+/* Report a context's current vs/fs shader handles, its cbufs and zsbuf
+ * handles and its draw_blocker/draw_blocked values to the debugger.
+ * Returns 0, or -ESRCH when the context handle is unknown. */
+static int
+trace_rbug_context_info(struct trace_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_context_info *info = (struct rbug_proto_context_info *)header;
+
+   struct trace_screen *tr_scr = tr_rbug->tr_scr;
+   struct trace_context *tr_ctx;
+   rbug_texture_t cbufs[PIPE_MAX_COLOR_BUFS];
+   int ret = -ESRCH;
+   int i;
+
+   pipe_mutex_lock(tr_scr->list_mutex);
+   tr_ctx = trace_rbug_get_context_locked(tr_scr, info->context);
+   if (tr_ctx) {
+      /* protect the pipe context */
+      pipe_mutex_lock(tr_ctx->draw_mutex);
+      trace_dump_call_lock();
+
+      for (i = 0; i < tr_ctx->curr.nr_cbufs; i++)
+         cbufs[i] = VOID2U64(tr_ctx->curr.cbufs[i]);
+
+      rbug_send_context_info_reply(tr_rbug->con, serial,
+                                   VOID2U64(tr_ctx->curr.vs), VOID2U64(tr_ctx->curr.fs),
+                                   cbufs, tr_ctx->curr.nr_cbufs, VOID2U64(tr_ctx->curr.zsbuf),
+                                   tr_ctx->draw_blocker, tr_ctx->draw_blocked, NULL);
+
+      trace_dump_call_unlock();
+      pipe_mutex_unlock(tr_ctx->draw_mutex);
+      ret = 0;
+   }
+   pipe_mutex_unlock(tr_scr->list_mutex);
+
+   return ret;
+}
+
+/* OR the requested bits into the context's draw_blocker value.
+ * Returns 0, or -ESRCH when the context handle is unknown; this function
+ * itself sends no reply. */
+static int
+trace_rbug_context_draw_block(struct trace_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_context_draw_block *block = (struct rbug_proto_context_draw_block *)header;
+
+   struct trace_screen *tr_scr = tr_rbug->tr_scr;
+   struct trace_context *tr_ctx;
+   int ret = -ESRCH;
+
+   pipe_mutex_lock(tr_scr->list_mutex);
+   tr_ctx = trace_rbug_get_context_locked(tr_scr, block->context);
+   if (tr_ctx) {
+      pipe_mutex_lock(tr_ctx->draw_mutex);
+      tr_ctx->draw_blocker |= block->block;
+      pipe_mutex_unlock(tr_ctx->draw_mutex);
+      ret = 0;
+   }
+   pipe_mutex_unlock(tr_scr->list_mutex);
+
+   return ret;
+}
+
+/* Clear the requested bits from the context's draw_blocked value and, where
+ * condition variables are available, wake everything waiting on draw_cond.
+ * Returns 0, or -ESRCH when the context handle is unknown. */
+static int
+trace_rbug_context_draw_step(struct trace_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_context_draw_step *step = (struct rbug_proto_context_draw_step *)header;
+
+   struct trace_screen *tr_scr = tr_rbug->tr_scr;
+   struct trace_context *tr_ctx;
+   int ret = -ESRCH;
+
+   pipe_mutex_lock(tr_scr->list_mutex);
+   tr_ctx = trace_rbug_get_context_locked(tr_scr, step->context);
+   if (tr_ctx) {
+      pipe_mutex_lock(tr_ctx->draw_mutex);
+      tr_ctx->draw_blocked &= ~step->step;
+      pipe_mutex_unlock(tr_ctx->draw_mutex);
+
+#ifdef PIPE_THREAD_HAVE_CONDVAR
+      pipe_condvar_broadcast(tr_ctx->draw_cond);
+#endif
+      ret = 0;
+   }
+   pipe_mutex_unlock(tr_scr->list_mutex);
+
+   return ret;
+}
+
+/* Clear the requested bits from both draw_blocked and draw_blocker and, where
+ * condition variables are available, wake everything waiting on draw_cond.
+ * Returns 0, or -ESRCH when the context handle is unknown. */
+static int
+trace_rbug_context_draw_unblock(struct trace_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_context_draw_unblock *unblock = (struct rbug_proto_context_draw_unblock *)header;
+
+   struct trace_screen *tr_scr = tr_rbug->tr_scr;
+   struct trace_context *tr_ctx;
+   int ret = -ESRCH;
+
+   pipe_mutex_lock(tr_scr->list_mutex);
+   tr_ctx = trace_rbug_get_context_locked(tr_scr, unblock->context);
+   if (tr_ctx) {
+      pipe_mutex_lock(tr_ctx->draw_mutex);
+      tr_ctx->draw_blocked &= ~unblock->unblock;
+      tr_ctx->draw_blocker &= ~unblock->unblock;
+      pipe_mutex_unlock(tr_ctx->draw_mutex);
+
+#ifdef PIPE_THREAD_HAVE_CONDVAR
+      pipe_condvar_broadcast(tr_ctx->draw_cond);
+#endif
+      ret = 0;
+   }
+   pipe_mutex_unlock(tr_scr->list_mutex);
+
+   return ret;
+}
+
+/* Call the wrapped pipe context's flush() with the flags from the request
+ * and no fence pointer, under the trace dump call lock.
+ * Returns 0, or -ESRCH when the context handle is unknown. */
+static int
+trace_rbug_context_flush(struct trace_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_context_flush *flush = (struct rbug_proto_context_flush *)header;
+
+   struct trace_screen *tr_scr = tr_rbug->tr_scr;
+   struct trace_context *tr_ctx;
+   int ret = -ESRCH;
+
+   pipe_mutex_lock(tr_scr->list_mutex);
+   tr_ctx = trace_rbug_get_context_locked(tr_scr, flush->context);
+   if (tr_ctx) {
+      /* protect the pipe context */
+      trace_dump_call_lock();
+      tr_ctx->pipe->flush(tr_ctx->pipe, flush->flags, NULL);
+      trace_dump_call_unlock();
+
+      ret = 0;
+   }
+   pipe_mutex_unlock(tr_scr->list_mutex);
+
+   return ret;
+}
+
+/* Send the handle of every shader on the named context's shader list.
+ * Returns 0, -ESRCH for an unknown context, or -ENOMEM on allocation failure. */
+static int
+trace_rbug_shader_list(struct trace_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_shader_list *list = (struct rbug_proto_shader_list *)header;
+   struct trace_screen *tr_scr = tr_rbug->tr_scr;
+   struct trace_context *tr_ctx = NULL;
+   struct tr_list *ptr;
+   rbug_shader_t *shdrs;
+   int i = 0;
+
+   pipe_mutex_lock(tr_scr->list_mutex);
+   tr_ctx = trace_rbug_get_context_locked(tr_scr, list->context);
+   if (!tr_ctx) {
+      pipe_mutex_unlock(tr_scr->list_mutex);
+      return -ESRCH;
+   }
+
+   pipe_mutex_lock(tr_ctx->list_mutex);
+   shdrs = MALLOC(tr_ctx->num_shaders * sizeof(rbug_shader_t));
+   if (!shdrs && tr_ctx->num_shaders > 0) {
+      pipe_mutex_unlock(tr_ctx->list_mutex);
+      pipe_mutex_unlock(tr_scr->list_mutex);
+      return -ENOMEM; /* the dispatcher turns a non-zero result into an error reply */
+   }
+   foreach(ptr, &tr_ctx->shaders)
+      shdrs[i++] = VOID2U64((char*)ptr - offsetof(struct trace_shader, list));
+   pipe_mutex_unlock(tr_ctx->list_mutex);
+   pipe_mutex_unlock(tr_scr->list_mutex);
+
+   rbug_send_shader_list_reply(tr_rbug->con, serial, shdrs, i, NULL);
+   FREE(shdrs);
+   return 0;
+}
+
+/* Reply with a shader's original tokens, its replacement tokens (length 0
+ * when there are none) and its disabled flag.
+ * Returns 0, or -ESRCH when either the context or the shader handle is
+ * unknown. */
+static int
+trace_rbug_shader_info(struct trace_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_shader_info *info = (struct rbug_proto_shader_info *)header;
+
+   struct trace_screen *tr_scr = tr_rbug->tr_scr;
+   struct trace_context *tr_ctx;
+   struct trace_shader *tr_shdr;
+   unsigned original_len;
+   unsigned replaced_len = 0;
+   int ret = -ESRCH;
+
+   pipe_mutex_lock(tr_scr->list_mutex);
+   tr_ctx = trace_rbug_get_context_locked(tr_scr, info->context);
+   if (!tr_ctx)
+      goto unlock_screen;
+
+   pipe_mutex_lock(tr_ctx->list_mutex);
+   tr_shdr = trace_rbug_get_shader_locked(tr_ctx, info->shader);
+   if (!tr_shdr)
+      goto unlock_context;
+
+   /* just in case */
+   assert(sizeof(struct tgsi_token) == 4);
+
+   original_len = tgsi_num_tokens(tr_shdr->tokens);
+   if (tr_shdr->replaced_tokens)
+      replaced_len = tgsi_num_tokens(tr_shdr->replaced_tokens);
+
+   rbug_send_shader_info_reply(tr_rbug->con, serial,
+                               (uint32_t*)tr_shdr->tokens, original_len,
+                               (uint32_t*)tr_shdr->replaced_tokens, replaced_len,
+                               tr_shdr->disabled,
+                               NULL);
+
+   ret = 0;
+
+   /* release the locks in the reverse order of acquisition */
+unlock_context:
+   pipe_mutex_unlock(tr_ctx->list_mutex);
+unlock_screen:
+   pipe_mutex_unlock(tr_scr->list_mutex);
+
+   return ret;
+}
+
+/* Store the requested value in the shader's disabled flag.
+ * Returns 0, or -ESRCH when either the context or the shader handle is
+ * unknown. */
+static int
+trace_rbug_shader_disable(struct trace_rbug *tr_rbug, struct rbug_header *header)
+{
+   struct rbug_proto_shader_disable *dis = (struct rbug_proto_shader_disable *)header;
+
+   struct trace_screen *tr_scr = tr_rbug->tr_scr;
+   struct trace_context *tr_ctx;
+   struct trace_shader *tr_shdr;
+   int ret = -ESRCH;
+
+   pipe_mutex_lock(tr_scr->list_mutex);
+   tr_ctx = trace_rbug_get_context_locked(tr_scr, dis->context);
+   if (!tr_ctx)
+      goto unlock_screen;
+
+   pipe_mutex_lock(tr_ctx->list_mutex);
+   tr_shdr = trace_rbug_get_shader_locked(tr_ctx, dis->shader);
+   if (!tr_shdr)
+      goto unlock_context;
+
+   /* only the flag is stored here; nothing is rebound */
+   tr_shdr->disabled = dis->disable;
+   ret = 0;
+
+unlock_context:
+   pipe_mutex_unlock(tr_ctx->list_mutex);
+unlock_screen:
+   pipe_mutex_unlock(tr_scr->list_mutex);
+
+   return ret;
+}
+
+/* Replace a shader's program with tokens supplied by the debugger, or restore
+ * the original when rep->tokens_len is 0.  Any earlier replacement is removed
+ * first.  Returns 0 on success, -ESRCH for an unknown context or shader, and
+ * -EINVAL when the tokens cannot be duplicated or no driver state is created. */
+static int
+trace_rbug_shader_replace(struct trace_rbug *tr_rbug, struct rbug_header *header)
+{
+   struct rbug_proto_shader_replace *rep = (struct rbug_proto_shader_replace *)header;
+   struct trace_screen *tr_scr = tr_rbug->tr_scr;
+   struct trace_context *tr_ctx = NULL;
+   struct trace_shader *tr_shdr = NULL;
+   struct pipe_context *pipe = NULL;
+   void *state;
+
+   pipe_mutex_lock(tr_scr->list_mutex);
+   tr_ctx = trace_rbug_get_context_locked(tr_scr, rep->context);
+
+   if (!tr_ctx) {
+      pipe_mutex_unlock(tr_scr->list_mutex);
+      return -ESRCH;
+   }
+
+   pipe_mutex_lock(tr_ctx->list_mutex);
+   tr_shdr = trace_rbug_get_shader_locked(tr_ctx, rep->shader);
+
+   if (!tr_shdr) {
+      pipe_mutex_unlock(tr_ctx->list_mutex);
+      pipe_mutex_unlock(tr_scr->list_mutex);
+      return -ESRCH;
+   }
+
+   /* protect the pipe context */
+   trace_dump_call_lock();
+   pipe = tr_ctx->pipe;
+
+   /* remove old replaced shader */
+   if (tr_shdr->replaced) {
+      /* if this shader is bound, bind tr_shdr->state again before the replacement is deleted */
+      if (tr_ctx->curr.fs == tr_shdr || tr_ctx->curr.vs == tr_shdr)
+         trace_shader_bind_locked(pipe, tr_shdr, tr_shdr->state);
+      FREE(tr_shdr->replaced_tokens);
+      trace_shader_delete_locked(pipe, tr_shdr, tr_shdr->replaced);
+      tr_shdr->replaced = NULL;
+      tr_shdr->replaced_tokens = NULL;
+   }
+
+   /* empty input means restore the old shader, which was done above */
+   if (rep->tokens_len == 0)
+      goto out;
+
+   tr_shdr->replaced_tokens = tgsi_dup_tokens((struct tgsi_token *)rep->tokens);
+   if (!tr_shdr->replaced_tokens)
+      goto err;
+
+   state = trace_shader_create_locked(pipe, tr_shdr, tr_shdr->replaced_tokens);
+   if (!state)
+      goto err;
+
+   /* bind the new shader if this shader is currently bound */
+   if (tr_ctx->curr.fs == tr_shdr || tr_ctx->curr.vs == tr_shdr)
+      trace_shader_bind_locked(pipe, tr_shdr, state);
+
+   /* save state */
+   tr_shdr->replaced = state;
+
+out:
+   trace_dump_call_unlock();
+   pipe_mutex_unlock(tr_ctx->list_mutex);
+   pipe_mutex_unlock(tr_scr->list_mutex);
+   return 0;
+
+err:
+   FREE(tr_shdr->replaced_tokens);
+   tr_shdr->replaced = NULL;
+   tr_shdr->replaced_tokens = NULL;
+
+   trace_dump_call_unlock();
+   pipe_mutex_unlock(tr_ctx->list_mutex);
+   pipe_mutex_unlock(tr_scr->list_mutex);
+   return -EINVAL;
+}
+
+/* Dispatch one request by opcode, free the message, and send an error reply
+ * when the handler returned non-zero.  Always returns TRUE. */
+static boolean
+trace_rbug_header(struct trace_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   int ret = 0; /* 0 or a negative errno value from the handler */
+   switch(header->opcode) {
+      case RBUG_OP_PING:
+         rbug_send_ping_reply(tr_rbug->con, serial, NULL);
+         break;
+      case RBUG_OP_TEXTURE_LIST:
+         ret = trace_rbug_texture_list(tr_rbug, header, serial);
+         break;
+      case RBUG_OP_TEXTURE_INFO:
+         ret = trace_rbug_texture_info(tr_rbug, header, serial);
+         break;
+      case RBUG_OP_TEXTURE_READ:
+         ret = trace_rbug_texture_read(tr_rbug, header, serial);
+         break;
+      case RBUG_OP_CONTEXT_LIST:
+         ret = trace_rbug_context_list(tr_rbug, header, serial);
+         break;
+      case RBUG_OP_CONTEXT_INFO:
+         ret = trace_rbug_context_info(tr_rbug, header, serial);
+         break;
+      case RBUG_OP_CONTEXT_DRAW_BLOCK:
+         ret = trace_rbug_context_draw_block(tr_rbug, header, serial);
+         break;
+      case RBUG_OP_CONTEXT_DRAW_STEP:
+         ret = trace_rbug_context_draw_step(tr_rbug, header, serial);
+         break;
+      case RBUG_OP_CONTEXT_DRAW_UNBLOCK:
+         ret = trace_rbug_context_draw_unblock(tr_rbug, header, serial);
+         break;
+      case RBUG_OP_CONTEXT_FLUSH:
+         ret = trace_rbug_context_flush(tr_rbug, header, serial);
+         break;
+      case RBUG_OP_SHADER_LIST:
+         ret = trace_rbug_shader_list(tr_rbug, header, serial);
+         break;
+      case RBUG_OP_SHADER_INFO:
+         ret = trace_rbug_shader_info(tr_rbug, header, serial);
+         break;
+      case RBUG_OP_SHADER_DISABLE:
+         ret = trace_rbug_shader_disable(tr_rbug, header); /* this handler takes no serial */
+         break;
+      case RBUG_OP_SHADER_REPLACE:
+         ret = trace_rbug_shader_replace(tr_rbug, header); /* this handler takes no serial */
+         break;
+      default:
+         debug_printf("%s - unsupported opcode %u\n", __FUNCTION__, header->opcode);
+         ret = -ENOSYS;
+         break;
+   }
+   rbug_free_header(header); /* header must not be used past this point */
+   if (ret)
+      rbug_send_error_reply(tr_rbug->con, serial, ret, NULL);
+
+   return TRUE;
+}
+
+/* Serve one debugger connection: fetch and dispatch messages until the
+ * server is stopped, no message is returned, or the dispatcher returns
+ * FALSE; then disconnect and clear tr_rbug->con. */
+static void
+trace_rbug_con(struct trace_rbug *tr_rbug)
+{
+   debug_printf("%s - connection received\n", __FUNCTION__);
+
+   while(tr_rbug->running) {
+      uint32_t serial;
+      struct rbug_header *msg = rbug_get_message(tr_rbug->con, &serial);
+
+      /* a NULL message ends the session */
+      if (!msg || !trace_rbug_header(tr_rbug, msg, serial))
+         break;
+   }
+
+   debug_printf("%s - connection closed\n", __FUNCTION__);
+
+   rbug_disconnect(tr_rbug->con);
+   tr_rbug->con = NULL;
+}
+
+/* Server thread: listen on the first port in 13370..13379 that can be bound,
+ * then serve one debugger connection at a time until tr_rbug->running is
+ * cleared.  Always returns NULL. */
+PIPE_THREAD_ROUTINE(trace_rbug_thread, void_tr_rbug)
+{
+   struct trace_rbug *tr_rbug = void_tr_rbug;
+   uint16_t port = 13370;
+   int s = -1;
+   int c;
+
+   u_socket_init();
+
+   for (;port <= 13379 && s < 0; port++)
+      s = u_socket_listen_on_port(port);
+
+   if (s < 0) {
+      debug_printf("trace_rbug - failed to listen\n");
+      u_socket_stop(); /* pair with u_socket_init() on this exit path too */
+      return NULL;
+   }
+
+   u_socket_block(s, false);
+
+   /* the loop above stepped one past the port that was bound */
+   debug_printf("trace_rbug - remote debugging listening on port %u\n", --port);
+
+   while(tr_rbug->running) {
+      sleep(1);
+      c = u_socket_accept(s);
+      if (c < 0)
+         continue;
+      u_socket_block(c, true);
+      tr_rbug->con = rbug_from_socket(c);
+      if (tr_rbug->con) /* do not enter the message loop without a connection object */
+         trace_rbug_con(tr_rbug);
+      u_socket_close(c);
+   }
+
+   u_socket_close(s);
+   u_socket_stop();
+   return NULL;
+}
+
+/**********************************************************
+ * Public entry points
+ */
+
+/* Allocate debugger state for tr_scr and start its server thread; NULL if the allocation fails. */
+struct trace_rbug *
+trace_rbug_start(struct trace_screen *tr_scr)
+{
+   struct trace_rbug *rbug = CALLOC_STRUCT(trace_rbug);
+
+   if (rbug) {
+      rbug->tr_scr = tr_scr;
+      rbug->running = TRUE;
+      rbug->thread = pipe_thread_create(trace_rbug_thread, rbug);
+   }
+   return rbug;
+}
+
+/* Clear the running flag, wait for the server thread, then free tr_rbug.
+ * A NULL tr_rbug is ignored. */
+void
+trace_rbug_stop(struct trace_rbug *tr_rbug)
+{
+   if (tr_rbug) {
+      /* the server loops test this flag on each iteration */
+      tr_rbug->running = false;
+      pipe_thread_wait(tr_rbug->thread);
+
+      FREE(tr_rbug);
+   }
+}
+
+/* Send tr_ctx's draw_blocked value to the debugger; does nothing unless one is connected. */
+void
+trace_rbug_notify_draw_blocked(struct trace_context *tr_ctx)
+{
+   struct trace_rbug *rbug = trace_screen(tr_ctx->base.screen)->rbug;
+
+   if (!rbug || !rbug->con)
+      return;
+   rbug_send_context_draw_blocked(rbug->con, VOID2U64(tr_ctx), tr_ctx->draw_blocked, NULL);
+}
diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c
index bc14248eeb..920f418ebf 100644
--- a/src/gallium/drivers/trace/tr_screen.c
+++ b/src/gallium/drivers/trace/tr_screen.c
@@ -826,24 +826,26 @@ trace_screen_destroy(struct pipe_screen *_screen)
    trace_dump_call_end();
    trace_dump_trace_end();
 
+   if (tr_scr->rbug)
+      trace_rbug_stop(tr_scr->rbug);
+
    screen->destroy(screen);
 
    FREE(tr_scr);
 }
 
-
 boolean
 trace_enabled(void)
 {
    return trace;
 }
 
-
 struct pipe_screen *
 trace_screen_create(struct pipe_screen *screen)
 {
    struct trace_screen *tr_scr;
    struct pipe_winsys *winsys;
+   boolean rbug = FALSE;
 
    if(!screen)
       goto error1;
@@ -855,6 +857,11 @@ trace_screen_create(struct pipe_screen *screen)
       trace = TRUE;
    }
 
+   if (debug_get_bool_option("GALLIUM_RBUG", FALSE)) {
+      trace = TRUE;
+      rbug = TRUE;
+   }
+
    if (!trace)
       goto error1;
 
@@ -915,6 +922,9 @@ trace_screen_create(struct pipe_screen *screen)
    trace_dump_ret(ptr, screen);
    trace_dump_call_end();
 
+   if (rbug)
+      tr_scr->rbug = trace_rbug_start(tr_scr);
+
    return &tr_scr->base;
 
 #if 0
diff --git a/src/gallium/drivers/trace/tr_screen.h b/src/gallium/drivers/trace/tr_screen.h
index 7fae182985..dba8cd7c65 100644
--- a/src/gallium/drivers/trace/tr_screen.h
+++ b/src/gallium/drivers/trace/tr_screen.h
@@ -57,6 +57,9 @@ struct trace_screen
 
    struct pipe_screen *screen;
 
+   /* remote debugger */
+   struct trace_rbug *rbug;
+
    pipe_mutex list_mutex;
    int num_buffers;
    int num_contexts;
@@ -72,20 +75,33 @@ struct trace_screen
 
 
 /*
+ * tr_rbug.c
+ */
+
+
+struct trace_rbug;
+
+struct trace_rbug *
+trace_rbug_start(struct trace_screen *tr_scr);
+
+void
+trace_rbug_stop(struct trace_rbug *tr_rbug);
+
+
+/*
  * tr_screen.c
  */
 
+
 boolean
 trace_enabled(void);
 
 struct trace_screen *
 trace_screen(struct pipe_screen *screen);
 
-
 struct pipe_screen *
 trace_screen_create(struct pipe_screen *screen);
 
-
 void
 trace_screen_user_buffer_update(struct pipe_screen *screen,
                                 struct pipe_buffer *buffer);
@@ -106,6 +122,7 @@ trace_screen_user_buffer_update(struct pipe_screen *screen,
       pipe_mutex_unlock(tr_scr->list_mutex);             \
    } while (0)
 
+
 #ifdef __cplusplus
 }
 #endif
author	Dave Airlie <airlied@redhat.com>	2009-06-07 16:51:32 +1000
committer	Dave Airlie <airlied@redhat.com>	2009-06-07 16:51:32 +1000
commit	545e574cd9a2a659cd9a93879dff8884bd247558 (patch)
tree	f56d65eaa851edfb1248a6fc8ac0bae4cc98eff5 /src/gallium/drivers
parent	e2aedfa62079ff1a333e1f4e56faea303cc36edb (diff)
parent	f1edfa09ea50e8833ddbf241da4d36fd38685e9d (diff)