15 files changed, 1306 insertions, 494 deletions
diff --git a/src/gallium/drivers/nv50/nv50_clear.c b/src/gallium/drivers/nv50/nv50_clear.c
index 33427a15a5..e0b2d2880b 100644
--- a/src/gallium/drivers/nv50/nv50_clear.c
+++ b/src/gallium/drivers/nv50/nv50_clear.c
@@ -31,7 +31,7 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers,
 	   const float *rgba, double depth, unsigned stencil)
 {
 	struct nv50_context *nv50 = nv50_context(pipe);
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->base.channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct pipe_framebuffer_state *fb = &nv50->framebuffer;
 	unsigned mode = 0, i;
diff --git a/src/gallium/drivers/nv50/nv50_context.c b/src/gallium/drivers/nv50/nv50_context.c
index a511f655c1..e02afc4be9 100644
--- a/src/gallium/drivers/nv50/nv50_context.c
+++ b/src/gallium/drivers/nv50/nv50_context.c
@@ -33,7 +33,7 @@ nv50_flush(struct pipe_context *pipe, unsigned flags,
 {
 	struct nv50_context *nv50 = (struct nv50_context *)pipe;
 	
-	FIRE_RING(nv50->screen->nvws->channel);
+	FIRE_RING(nv50->screen->base.channel);
 }
 
 static void
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
index 7b67a75439..9b8cc4d37d 100644
--- a/src/gallium/drivers/nv50/nv50_context.h
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -63,6 +63,11 @@ struct nv50_rasterizer_stateobj {
 	struct nouveau_stateobj *so;
 };
 
+struct nv50_sampler_stateobj {
+	bool normalized;	/* NOTE(review): presumably "normalized coords" flag - confirm */
+	unsigned tsc[8];	/* 8 words; presumably the hw TSC entry (cf. tsc_upload) - confirm */
+};
+
 struct nv50_miptree_level {
 	int *image_offset;
 	unsigned pitch;
@@ -70,7 +75,8 @@ struct nv50_miptree_level {
 
 struct nv50_miptree {
 	struct pipe_texture base;
-	struct pipe_buffer *buffer;
+
+	struct nouveau_bo *bo;
 
 	struct nv50_miptree_level level[PIPE_MAX_TEXTURE_LEVELS];
 	int image_nr;
@@ -93,13 +99,6 @@ nv50_surface(struct pipe_surface *pt)
 	return (struct nv50_surface *)pt;
 }
 
-static INLINE struct pipe_buffer *
-nv50_surface_buffer(struct pipe_surface *surface)
-{
-	struct nv50_miptree *mt = (struct nv50_miptree *)surface->texture;
-	return mt->buffer;
-}
-
 struct nv50_state {
 	unsigned dirty;
 
@@ -115,6 +114,7 @@ struct nv50_state {
 	unsigned viewport_bypass;
 	struct nouveau_stateobj *tsc_upload;
 	struct nouveau_stateobj *tic_upload;
+	unsigned miptree_nr;
 	struct nouveau_stateobj *vertprog;
 	struct nouveau_stateobj *fragprog;
 	struct nouveau_stateobj *vtxfmt;
@@ -147,7 +147,7 @@ struct nv50_context {
 	unsigned vtxbuf_nr;
 	struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS];
 	unsigned vtxelt_nr;
-	unsigned *sampler[PIPE_MAX_SAMPLERS];
+	struct nv50_sampler_stateobj *sampler[PIPE_MAX_SAMPLERS];
 	unsigned sampler_nr;
 	struct nv50_miptree *miptree[PIPE_MAX_SAMPLERS];
 	unsigned miptree_nr;
diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c
index f79a7ca86c..22465e0227 100644
--- a/src/gallium/drivers/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nv50/nv50_miptree.c
@@ -29,26 +29,36 @@
 static struct pipe_texture *
 nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 {
+	struct nouveau_device *dev = nouveau_screen(pscreen)->device;
 	struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
 	struct pipe_texture *pt = &mt->base;
-	unsigned usage, width = tmp->width[0], height = tmp->height[0];
+	unsigned width = tmp->width[0], height = tmp->height[0];
 	unsigned depth = tmp->depth[0];
-	int i, l;
+	uint32_t tile_mode, tile_flags, tile_h;
+	int ret, i, l;
 
 	mt->base = *tmp;
 	pipe_reference_init(&mt->base.reference, 1);
 	mt->base.screen = pscreen;
 
-	usage = PIPE_BUFFER_USAGE_PIXEL;
 	switch (pt->format) {
+	case PIPE_FORMAT_Z24X8_UNORM:
 	case PIPE_FORMAT_Z24S8_UNORM:
 	case PIPE_FORMAT_Z16_UNORM:
-		usage |= NOUVEAU_BUFFER_USAGE_ZETA;
+		tile_flags = 0x2800;
 		break;
 	default:
+		tile_flags = 0x7000;
 		break;
 	}
 
+	if      (pt->height[0] > 32) tile_mode = 4;
+	else if (pt->height[0] > 16) tile_mode = 3;
+	else if (pt->height[0] >  8) tile_mode = 2;
+	else if (pt->height[0] >  4) tile_mode = 1;
+	else                         tile_mode = 0;
+	tile_h = 1 << (tile_mode + 2);
+
 	switch (pt->target) {
 	case PIPE_TEXTURE_3D:
 		mt->image_nr = pt->depth[0];
@@ -85,7 +95,7 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 
 			size  = align(pt->width[l], 8) * pt->block.size;
 			size  = align(size, 64);
-			size *= align(pt->height[l], 8) * pt->block.size;
+			size *= align(pt->height[l], tile_h) * pt->block.size;
 
 			lvl->image_offset[i] = mt->total_size;
 
@@ -93,12 +103,13 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 		}
 	}
 
-	mt->buffer = pscreen->buffer_create(pscreen, 256, usage, mt->total_size);
-	if (!mt->buffer) {
+	ret = nouveau_bo_new_tile(dev, NOUVEAU_BO_VRAM, 256, mt->total_size,
+				  tile_mode, tile_flags, &mt->bo);
+	if (ret) {
 		FREE(mt);
 		return NULL;
 	}
-
+			     
 	return &mt->base;
 }
 
@@ -106,6 +117,7 @@ static struct pipe_texture *
 nv50_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
 		     const unsigned *stride, struct pipe_buffer *pb)
 {
+	struct nouveau_bo *bo = nouveau_bo(pb);
 	struct nv50_miptree *mt;
 
 	/* Only supports 2D, non-mipmapped textures for the moment */
@@ -124,7 +136,7 @@ nv50_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
 	mt->level[0].pitch = *stride;
 	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned));
 
-	pipe_buffer_reference(&mt->buffer, pb);
+	nouveau_bo_ref(bo, &mt->bo);
 	return &mt->base;
 }
 
@@ -133,7 +145,7 @@ nv50_miptree_destroy(struct pipe_texture *pt)
 {
 	struct nv50_miptree *mt = nv50_miptree(pt);
 
-        pipe_buffer_reference(&mt->buffer, NULL);
+	nouveau_bo_ref(NULL, &mt->bo);
         FREE(mt);
 }
 
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 2d15868ae8..4ec9c03305 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -85,6 +85,9 @@ struct nv50_reg {
 
 	int hw;
 	int neg;
+
+	int rhw; /* result hw for FP outputs, or interpolant index */
+	int acc; /* instruction where this reg is last read (first insn == 1) */
 };
 
 struct nv50_pc {
@@ -108,12 +111,23 @@ struct nv50_pc {
 
 	struct nv50_reg *temp_temp[16];
 	unsigned temp_temp_nr;
+
+	unsigned interp_mode[32];
+	/* perspective interpolation registers */
+	struct nv50_reg *iv_p;
+	struct nv50_reg *iv_c;
+
+	/* current instruction and total number of insns */
+	unsigned insn_cur;
+	unsigned insn_nr;
+
+	boolean allow32;
 };
 
 static void
 alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
 {
-	int i;
+	int i = 0;
 
 	if (reg->type == P_RESULT) {
 		if (pc->p->cfg.high_result < (reg->hw + 1))
@@ -131,7 +145,22 @@ alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
 		return;
 	}
 
-	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
+	if (reg->rhw != -1) {
+		/* try to allocate temporary with index rhw first */
+		if (!(pc->r_temp[reg->rhw])) {
+			pc->r_temp[reg->rhw] = reg;
+			reg->hw = reg->rhw;
+			if (pc->p->cfg.high_temp < (reg->rhw + 1))
+				pc->p->cfg.high_temp = reg->rhw + 1;
+			return;
+		}
+		/* make sure we don't get things like $r0 needs to go
+		 * in $r1 and $r1 in $r0
+		 */
+		i = pc->result_nr * 4;
+	}
+
+	for (; i < NV50_SU_MAX_TEMP; i++) {
 		if (!(pc->r_temp[i])) {
 			pc->r_temp[i] = reg;
 			reg->hw = i;
@@ -159,6 +188,7 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
 			r->type = P_TEMP;
 			r->index = -1;
 			r->hw = i;
+			r->rhw = -1;
 			pc->r_temp[i] = r;
 			return r;
 		}
@@ -168,6 +198,38 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
 	return NULL;
 }
 
+/* Assign the hw slot of the discarded temporary register src (anonymous,
+ * index == -1, hw already assigned) to the tgsi register dst and free src.
+ */
+static void
+assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	assert(src->index == -1 && src->hw != -1);
+
+	if (dst->hw != -1)
+		pc->r_temp[dst->hw] = NULL;	/* dst gives up its old slot */
+	pc->r_temp[src->hw] = dst;	/* src's slot now belongs to dst */
+	dst->hw = src->hw;
+
+	FREE(src);	/* src is invalid after this call */
+}
+
+/* Release the hw slot held by the P_TEMP register r (no-op if it has none).
+ * Anonymous temporaries (index == -1) are freed as well. */
+static void
+release_hw(struct nv50_pc *pc, struct nv50_reg *r)
+{
+	assert(r->type == P_TEMP);
+	if (r->hw == -1)
+		return;
+
+	assert(pc->r_temp[r->hw] == r);	/* r must own the slot it names */
+	pc->r_temp[r->hw] = NULL;
+	r->acc = 0;	/* NOTE(review): r->hw is not reset here - confirm callers */
+	if (r->index == -1)
+		FREE(r);
+}
+
 static void
 free_temp(struct nv50_pc *pc, struct nv50_reg *r)
 {
@@ -250,7 +312,13 @@ alloc_immd(struct nv50_pc *pc, float f)
 	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
 	unsigned hw;
 
-	hw = ctor_immd(pc, f, 0, 0, 0) * 4;
+	for (hw = 0; hw < pc->immd_nr * 4; hw++)
+		if (pc->immd_buf[hw] == f)
+			break;
+
+	if (hw == pc->immd_nr * 4)
+		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
+
 	r->type = P_IMMD;
 	r->hw = hw;
 	r->index = -1;
@@ -341,7 +409,8 @@ set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
 static INLINE void
 set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
 {
-	unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
+	float f = pc->immd_buf[imm->hw];
+	unsigned val = fui(imm->neg ? -f : f);
 
 	set_long(pc, e);
 	/*XXX: can't be predicated - bits overlap.. catch cases where both
@@ -354,20 +423,35 @@ set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
 	e->inst[1] |= (val >> 6) << 2;
 }
 
+
+#define INTERP_LINEAR		0	/* no flag bits set */
+#define INTERP_FLAT			1	/* emit_interp sets inst[0] bit 8, skips the flags below */
+#define INTERP_PERSPECTIVE	2	/* emit_interp sets inst[0] bit 25 and puts iv->hw at bit 9 */
+#define INTERP_CENTROID		4	/* emit_interp sets inst[0] bit 24 */
+
+/* interpolant index has been stored in dst->rhw */
 static void
-emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,
-	    struct nv50_reg *src, struct nv50_reg *iv)
+emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
+		unsigned mode)
 {
+	assert(dst->rhw != -1);
 	struct nv50_program_exec *e = exec(pc);
 
 	e->inst[0] |= 0x80000000;
 	set_dst(pc, dst, e);
-	alloc_reg(pc, src);
-	e->inst[0] |= (src->hw << 16);
-	if (iv) {
-		e->inst[0] |= (1 << 25);
-		alloc_reg(pc, iv);
-		e->inst[0] |= (iv->hw << 9);
+	e->inst[0] |= (dst->rhw << 16);
+
+	if (mode & INTERP_FLAT) {
+		e->inst[0] |= (1 << 8);
+	} else {
+		if (mode & INTERP_PERSPECTIVE) {
+			e->inst[0] |= (1 << 25);
+			alloc_reg(pc, iv);
+			e->inst[0] |= (iv->hw << 9);
+		}
+
+		if (mode & INTERP_CENTROID)
+			e->inst[0] |= (1 << 24);
 	}
 
 	emit(pc, e);
@@ -378,22 +462,12 @@ set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
 	 struct nv50_program_exec *e)
 {
 	set_long(pc, e);
-#if 1
-	e->inst[1] |= (1 << 22);
-#else
-	if (src->type == P_IMMD) {
-		e->inst[1] |= (NV50_CB_PMISC << 22);
-	} else {
-		if (pc->p->type == PIPE_SHADER_VERTEX)
-			e->inst[1] |= (NV50_CB_PVP << 22);
-		else
-			e->inst[1] |= (NV50_CB_PFP << 22);
-	}
-#endif
 
 	e->param.index = src->hw;
 	e->param.shift = s;
 	e->param.mask = m << (s % 32);
+
+	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
 }
 
 static void
@@ -405,12 +479,11 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 
 	set_dst(pc, dst, e);
 
-	if (0 && dst->type != P_RESULT && src->type == P_IMMD) {
+	if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
 		set_immd(pc, src, e);
 		/*XXX: 32-bit, but steals part of "half" reg space - need to
 		 *     catch and handle this case if/when we do half-regs
 		 */
-		e->inst[0] |= 0x00008000;
 	} else
 	if (src->type == P_IMMD || src->type == P_CONST) {
 		set_long(pc, e);
@@ -426,18 +499,25 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 		e->inst[0] |= (src->hw << 9);
 	}
 
-	/* We really should support "half" instructions here at some point,
-	 * but I don't feel confident enough about them yet.
-	 */
-	set_long(pc, e);
 	if (is_long(e) && !is_immd(e)) {
 		e->inst[1] |= 0x04000000; /* 32-bit */
-		e->inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
-	}
+		e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
+		if (!(e->inst[1] & 0x20000000))
+			e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
+	} else
+		e->inst[0] |= 0x00008000;
 
 	emit(pc, e);
 }
 
+static INLINE void
+emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
+{	/* dst = f, using a short-lived immediate register */
+	struct nv50_reg *imm = alloc_immd(pc, f);
+	emit_mov(pc, dst, imm);
+	FREE(imm);	/* frees only the nv50_reg wrapper allocated above */
+}
+
 static boolean
 check_swap_src_0_1(struct nv50_pc *pc,
 		   struct nv50_reg **s0, struct nv50_reg **s1)
@@ -541,12 +621,26 @@ emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	struct nv50_program_exec *e = exec(pc);
 
 	e->inst[0] |= 0xc0000000;
-	set_long(pc, e);
+
+	if (!pc->allow32)
+		set_long(pc, e);
 
 	check_swap_src_0_1(pc, &src0, &src1);
 	set_dst(pc, dst, e);
 	set_src_0(pc, src0, e);
-	set_src_1(pc, src1, e);
+	if (src1->type == P_IMMD && !is_long(e)) {
+		if (src0->neg)
+			e->inst[0] |= 0x00008000;
+		set_immd(pc, src1, e);
+	} else {
+		set_src_1(pc, src1, e);
+		if (src0->neg ^ src1->neg) {
+			if (is_long(e))
+				e->inst[1] |= 0x08000000;
+			else
+				e->inst[0] |= 0x00008000;
+		}
+	}
 
 	emit(pc, e);
 }
@@ -560,11 +654,20 @@ emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
 	e->inst[0] |= 0xb0000000;
 
 	check_swap_src_0_1(pc, &src0, &src1);
+
+	if (!pc->allow32 || src0->neg || src1->neg) {
+		set_long(pc, e);
+		e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
+	}
+
 	set_dst(pc, dst, e);
 	set_src_0(pc, src0, e);
-	if (is_long(e))
+	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
 		set_src_2(pc, src1, e);
 	else
+	if (src1->type == P_IMMD)
+		set_immd(pc, src1, e);
+	else
 		set_src_1(pc, src1, e);
 
 	emit(pc, e);
@@ -588,25 +691,13 @@ emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
 	emit(pc, e);
 }
 
-static void
+static INLINE void
 emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	 struct nv50_reg *src1)
 {
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] |= 0xb0000000;
-
-	set_long(pc, e);
-	if (check_swap_src_0_1(pc, &src0, &src1))
-		e->inst[1] |= 0x04000000;
-	else
-		e->inst[1] |= 0x08000000;
-
-	set_dst(pc, dst, e);
-	set_src_0(pc, src0, e);
-	set_src_2(pc, src1, e);
-
-	emit(pc, e);
+	src1->neg ^= 1;
+	emit_add(pc, dst, src0, src1);
+	src1->neg ^= 1;
 }
 
 static void
@@ -623,26 +714,21 @@ emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	set_src_1(pc, src1, e);
 	set_src_2(pc, src2, e);
 
+	if (src0->neg ^ src1->neg)
+		e->inst[1] |= 0x04000000;
+	if (src2->neg)
+		e->inst[1] |= 0x08000000;
+
 	emit(pc, e);
 }
 
-static void
+static INLINE void
 emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	 struct nv50_reg *src1, struct nv50_reg *src2)
 {
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] |= 0xe0000000;
-	set_long(pc, e);
-	e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */
-
-	check_swap_src_0_1(pc, &src0, &src1);
-	set_dst(pc, dst, e);
-	set_src_0(pc, src0, e);
-	set_src_1(pc, src1, e);
-	set_src_2(pc, src2, e);
-
-	emit(pc, e);
+	src2->neg ^= 1;
+	emit_mad(pc, dst, src0, src1, src2);
+	src2->neg ^= 1;
 }
 
 static void
@@ -693,6 +779,48 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 	emit(pc, e);
 }
 
+#define CVTOP_RN	0x01	/* CVTOP_*: emit_cvt() "cop" argument, OR'ed into inst[1] at bit 16 */
+#define CVTOP_FLOOR	0x03
+#define CVTOP_CEIL	0x05
+#define CVTOP_TRUNC	0x07
+#define CVTOP_SAT	0x08
+#define CVTOP_ABS	0x10
+
+#define CVT_F32_F32 0xc4	/* CVT_*: emit_cvt() "fmt" argument, OR'ed into inst[1] at bit 24 */
+#define CVT_F32_S32 0x44
+#define CVT_F32_U32 0x64
+#define CVT_S32_F32 0x8c
+#define CVT_S32_S32 0x0c
+#define CVT_F32_F32_ROP 0xcc
+
+static void
+emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
+	 int wp, unsigned cop, unsigned fmt)
+{
+	struct nv50_program_exec *e;
+	/* Emit a long-form cvt of src; dst may be NULL, wp < 0 means no predicate write. */
+	e = exec(pc);
+	set_long(pc, e);
+
+	e->inst[0] |= 0xa0000000;
+	e->inst[1] |= 0x00004000;
+	e->inst[1] |= (cop << 16);
+	e->inst[1] |= (fmt << 24);
+	set_src_0(pc, src, e);
+
+	if (wp >= 0)
+		set_pred_wr(pc, 1, wp, e);
+
+	if (dst)
+		set_dst(pc, dst, e);
+	else {	/* no dst: NOTE(review) presumably a "discard result" encoding - confirm */
+		e->inst[0] |= 0x000001fc;
+		e->inst[1] |= 0x00000008;
+	}
+
+	emit(pc, e);
+}
+
 static void
 emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
 	 struct nv50_reg *src0, struct nv50_reg *src1)
@@ -736,22 +864,10 @@ emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
 		free_temp(pc, dst);
 }
 
-static void
+static INLINE void
 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0xa0000000; /* cvt */
-	set_long(pc, e);
-	e->inst[1] |= (6 << 29); /* cvt */
-	e->inst[1] |= 0x08000000; /* integer mode */
-	e->inst[1] |= 0x04000000; /* 32 bit */
-	e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */
-	e->inst[1] |= (1 << 14); /* src .f32 */
-	set_dst(pc, dst, e);
-	set_src_0(pc, src, e);
-
-	emit(pc, e);
+	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
 }
 
 static void
@@ -768,21 +884,10 @@ emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
 	free_temp(pc, temp);
 }
 
-static void
+static INLINE void
 emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0xa0000000; /* cvt */
-	set_long(pc, e);
-	e->inst[1] |= (6 << 29); /* cvt */
-	e->inst[1] |= 0x04000000; /* 32 bit */
-	e->inst[1] |= (1 << 14); /* src .f32 */
-	e->inst[1] |= ((1 << 6) << 14); /* .abs */
-	set_dst(pc, dst, e);
-	set_src_0(pc, src, e);
-
-	emit(pc, e);
+	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
 }
 
 static void
@@ -794,18 +899,12 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
 	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
 	struct nv50_reg *tmp[4];
+	boolean allow32 = pc->allow32;
 
-	if (mask & (1 << 0))
-		emit_mov(pc, dst[0], one);
-
-	if (mask & (1 << 3))
-		emit_mov(pc, dst[3], one);
+	pc->allow32 = FALSE;
 
 	if (mask & (3 << 1)) {
-		if (mask & (1 << 1))
-			tmp[0] = dst[1];
-		else
-			tmp[0] = temp_temp(pc);
+		tmp[0] = alloc_temp(pc, NULL);
 		emit_minmax(pc, 4, tmp[0], src[0], zero);
 	}
 
@@ -823,6 +922,26 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 		emit_mov(pc, dst[2], zero);
 		set_pred(pc, 3, 0, pc->p->exec_tail);
 	}
+
+	if (mask & (1 << 1))
+		assimilate_temp(pc, dst[1], tmp[0]);
+	else
+	if (mask & (1 << 2))
+		free_temp(pc, tmp[0]);
+
+	pc->allow32 = allow32;
+
+	/* do this last, in case src[i,j] == dst[0,3] */
+	if (mask & (1 << 0))
+		emit_mov(pc, dst[0], one);
+
+	if (mask & (1 << 3))
+		emit_mov(pc, dst[3], one);
+
+	FREE(pos128);
+	FREE(neg128);
+	FREE(zero);
+	FREE(one);
 }
 
 static void
@@ -853,6 +972,8 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
 	e->inst[1] = 0xc4014788;
 	set_src_0(pc, src, e);
 	set_pred_wr(pc, 1, r_pred, e);
+	if (src->neg)
+		e->inst[1] |= 0x20000000;
 	emit(pc, e);
 
 	/* This is probably KILP */
@@ -863,6 +984,180 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
 	emit(pc, e);
 }
 
+/* Emit a texture fetch from sampler `unit`; the channels selected by mask
+ * are copied to dst[]. With proj set, coordinates are scaled by 1/src[3]. */
+static void
+emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
+	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
+{
+	struct nv50_reg *temp, *t[4];
+	struct nv50_program_exec *e;
+	unsigned c, mode, dim = 0; /* dim: number of coordinates consumed */
+
+	switch (type) {
+	case TGSI_TEXTURE_1D:
+		dim = 1;
+		break;
+	case TGSI_TEXTURE_UNKNOWN:
+	case TGSI_TEXTURE_2D:
+	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
+	case TGSI_TEXTURE_RECT:
+		dim = 2;
+		break;
+	case TGSI_TEXTURE_3D:
+	case TGSI_TEXTURE_CUBE:
+	case TGSI_TEXTURE_SHADOW2D:
+	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
+		dim = 3;
+		break;
+	default:
+		assert(0); /* with NDEBUG, dim stays 0 instead of being indeterminate */
+		break;
+	}
+
+	alloc_temp4(pc, t, 0); /* t[]: coordinates in, fetched values out */
+	if (proj) {
+		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
+			mode = pc->interp_mode[src[0]->index];
+
+			t[3]->rhw = src[3]->rhw;
+			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
+			emit_flop(pc, 0, t[3], t[3]); /* flop 0: the op used for TGSI RCP */
+
+			for (c = 0; c < dim; c++) {
+				t[c]->rhw = src[c]->rhw;
+				emit_interp(pc, t[c], t[3],
+					    (mode | INTERP_PERSPECTIVE));
+			}
+		} else {
+			emit_flop(pc, 0, t[3], src[3]);
+			for (c = 0; c < dim; c++)
+				emit_mul(pc, t[c], src[c], t[3]);
+
+			/* XXX: for some reason the blob sometimes uses MAD:
+			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
+			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
+			 */
+		}
+	} else {
+		if (type == TGSI_TEXTURE_CUBE) {
+			temp = temp_temp(pc);
+			emit_minmax(pc, 4, temp, src[0], src[1]);
+			emit_minmax(pc, 4, temp, temp, src[2]);
+			emit_flop(pc, 0, temp, temp);
+			for (c = 0; c < 3; c++)
+				emit_mul(pc, t[c], src[c], temp);
+		} else {
+			for (c = 0; c < dim; c++)
+				emit_mov(pc, t[c], src[c]);
+		}
+	}
+
+	e = exec(pc);
+	set_long(pc, e);
+	e->inst[0] |= 0xf0000000;
+	e->inst[1] |= 0x00000004;
+	set_dst(pc, t[0], e);
+	e->inst[0] |= (unit << 9);
+
+	if (dim == 2)
+		e->inst[0] |= 0x00400000;
+	else
+	if (dim == 3)
+		e->inst[0] |= 0x00800000;
+
+	e->inst[0] |= (mask & 0x3) << 25; /* write mask, low two channels */
+	e->inst[1] |= (mask & 0xc) << 12; /* write mask, high two channels */
+
+	emit(pc, e);
+
+#if 1
+	if (mask & 1) emit_mov(pc, dst[0], t[0]);
+	if (mask & 2) emit_mov(pc, dst[1], t[1]);
+	if (mask & 4) emit_mov(pc, dst[2], t[2]);
+	if (mask & 8) emit_mov(pc, dst[3], t[3]);
+
+	free_temp4(pc, t);
+#else
+	/* XXX: if e.g. MUL is used directly after TEX, it would still use
+	 * the texture coordinates, not the fetched values: latency ? */
+
+	for (c = 0; c < 4; c++) {
+		if (mask & (1 << c))
+			assimilate_temp(pc, dst[c], t[c]);
+		else
+			free_temp(pc, t[c]);
+	}
+#endif
+}
+
+static void
+convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
+{
+	unsigned q = 0, m = ~0;
+	/* Widen short insn e to long form: m is AND'ed onto inst[0], q OR'ed into inst[1]. */
+	assert(!is_long(e));
+
+	switch (e->inst[0] >> 28) {
+	case 0x1:
+		/* MOV: same inst[1] bits emit_mov sets for long movs; clear inst[0] bit 15 */
+		q = 0x0403c000;
+		m = 0xffff7fff;
+		break;
+	case 0x8:
+		/* INTERP: inst[0] bit 25 (set for INTERP_PERSPECTIVE) moves to inst[1] */
+		m = ~0x02000000;
+		if (e->inst[0] & 0x02000000)
+			q = 0x00020000;
+		break;
+	case 0x9:
+		/* RCP: nothing to relocate */
+		break;
+	case 0xB:
+		/* ADD: inst[0] bits 16..22 move to inst[1] bits 14..20 (presumably src1 -> src2 slot) */
+		m = ~(127 << 16);
+		q = ((e->inst[0] & (~m)) >> 2);
+		break;
+	case 0xC:
+		/* MUL: inst[0] bit 15 moves to inst[1] bit 27 (the negate bits emit_mul uses) */
+		m = ~0x00008000;
+		q = ((e->inst[0] & (~m)) << 12);
+		break;
+	case 0xE:
+		/* MAD (if src2 == dst): inst[0] bits 2..8 are copied to inst[1] bits 14..20 */
+		q = ((e->inst[0] & 0x1fc) << 12);
+		break;
+	default:
+		assert(0);
+		break;
+	}
+
+	set_long(pc, e);
+	pc->p->exec_size++;	/* NOTE(review): presumably accounts for the added second word - confirm */
+
+	e->inst[0] &= m;
+	e->inst[1] |= q;
+}
+
+static boolean
+negate_supported(const struct tgsi_full_instruction *insn, int i)
+{	/* TRUE if source operand i of insn may carry its negation in nv50_reg::neg */
+	switch (insn->Instruction.Opcode) {
+	case TGSI_OPCODE_DP3:
+	case TGSI_OPCODE_DP4:
+	case TGSI_OPCODE_MUL:
+	case TGSI_OPCODE_KIL:
+	case TGSI_OPCODE_ADD:
+	case TGSI_OPCODE_SUB:
+	case TGSI_OPCODE_MAD:
+		return TRUE;	/* any source operand */
+	case TGSI_OPCODE_POW:
+		return (i == 1) ? TRUE : FALSE;	/* only the operand with index 1 */
+	default:
+		return FALSE;	/* tgsi_src() then emits an explicit neg into a temp */
+	}
+}
+
 static struct nv50_reg *
 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
 {
@@ -881,11 +1176,14 @@ tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
 }
 
 static struct nv50_reg *
-tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
+tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
+	 boolean neg)
 {
 	struct nv50_reg *r = NULL;
 	struct nv50_reg *temp;
-	unsigned c;
+	unsigned sgn, c;
+
+	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
 
 	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
 	switch (c) {
@@ -915,16 +1213,17 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
 		break;
 	case TGSI_EXTSWIZZLE_ZERO:
 		r = alloc_immd(pc, 0.0);
-		break;
+		return r;
 	case TGSI_EXTSWIZZLE_ONE:
-		r = alloc_immd(pc, 1.0);
-		break;
+		if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
+			return alloc_immd(pc, -1.0);
+		return alloc_immd(pc, 1.0);
 	default:
 		assert(0);
 		break;
 	}
 
-	switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
+	switch (sgn) {
 	case TGSI_UTIL_SIGN_KEEP:
 		break;
 	case TGSI_UTIL_SIGN_CLEAR:
@@ -933,14 +1232,21 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
 		r = temp;
 		break;
 	case TGSI_UTIL_SIGN_TOGGLE:
-		temp = temp_temp(pc);
-		emit_neg(pc, temp, r);
-		r = temp;
+		if (neg)
+			r->neg = 1;
+		else {
+			temp = temp_temp(pc);
+			emit_neg(pc, temp, r);
+			r = temp;
+		}
 		break;
 	case TGSI_UTIL_SIGN_SET:
 		temp = temp_temp(pc);
 		emit_abs(pc, temp, r);
-		emit_neg(pc, temp, r);
+		if (neg)
+			temp->neg = 1;
+		else
+			emit_neg(pc, temp, temp);
 		r = temp;
 		break;
 	default:
@@ -951,12 +1257,40 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
 	return r;
 }
 
+/* Returns TRUE if the instruction writes its results straight into dst, so
+ * that an earlier channel's write could clobber a source read later on. */
+static boolean
+direct2dest_op(const struct tgsi_full_instruction *insn)
+{
+	if (insn->Instruction.Saturate)
+		return FALSE;	/* saturated results go through a temp first */
+	switch (insn->Instruction.Opcode) {
+	case TGSI_OPCODE_COS:
+	case TGSI_OPCODE_DP3:
+	case TGSI_OPCODE_DP4:
+	case TGSI_OPCODE_DPH:
+	case TGSI_OPCODE_KIL:
+	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_POW:
+	case TGSI_OPCODE_RCP:
+	case TGSI_OPCODE_RSQ:
+	case TGSI_OPCODE_SCS:
+	case TGSI_OPCODE_SIN:
+	case TGSI_OPCODE_TEX:
+	case TGSI_OPCODE_TXP:
+		return FALSE;	/* NOTE(review): presumably all sources are consumed before dst is written - confirm per opcode */
+	default:
+		return TRUE;
+	}
+}
+
 static boolean
 nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 {
 	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
 	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
 	unsigned mask, sat, unit;
+	boolean assimilate = FALSE;
 	int i, c;
 
 	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
@@ -967,6 +1301,10 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
 		else
 			dst[c] = NULL;
+		rdst[c] = NULL;
+		src[0][c] = NULL;
+		src[1][c] = NULL;
+		src[2][c] = NULL;
 	}
 
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
@@ -976,7 +1314,8 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 			unit = fs->SrcRegister.Index;
 
 		for (c = 0; c < 4; c++)
-			src[i][c] = tgsi_src(pc, c, fs);
+			src[i][c] = tgsi_src(pc, c, fs,
+					     negate_supported(inst, i));
 	}
 
 	if (sat) {
@@ -984,6 +1323,25 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 			rdst[c] = dst[c];
 			dst[c] = temp_temp(pc);
 		}
+	} else
+	if (direct2dest_op(inst)) {
+		for (c = 0; c < 4; c++) {
+			if (!dst[c] || dst[c]->type != P_TEMP)
+				continue;
+
+			for (i = c + 1; i < 4; i++) {
+				if (dst[c] == src[0][i] ||
+				    dst[c] == src[1][i] ||
+				    dst[c] == src[2][i])
+					break;
+			}
+			if (i == 4)
+				continue;
+
+			assimilate = TRUE;
+			rdst[c] = dst[c];
+			dst[c] = alloc_temp(pc, NULL);
+		}
 	}
 
 	switch (inst->Instruction.Opcode) {
@@ -1002,7 +1360,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_COS:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_precossin(pc, temp, src[0][0]);
 		emit_flop(pc, 5, temp, temp);
 		for (c = 0; c < 4; c++) {
@@ -1012,7 +1370,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_DP3:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
 		emit_mad(pc, temp, src[0][2], src[1][2], temp);
@@ -1021,10 +1379,9 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 				continue;
 			emit_mov(pc, dst[c], temp);
 		}
-		free_temp(pc, temp);
 		break;
 	case TGSI_OPCODE_DP4:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
 		emit_mad(pc, temp, src[0][2], src[1][2], temp);
@@ -1034,10 +1391,9 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 				continue;
 			emit_mov(pc, dst[c], temp);
 		}
-		free_temp(pc, temp);
 		break;
 	case TGSI_OPCODE_DPH:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
 		emit_mad(pc, temp, src[0][2], src[1][2], temp);
@@ -1047,7 +1403,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 				continue;
 			emit_mov(pc, dst[c], temp);
 		}
-		free_temp(pc, temp);
 		break;
 	case TGSI_OPCODE_DST:
 	{
@@ -1064,7 +1419,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 	}
 		break;
 	case TGSI_OPCODE_EX2:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_preex2(pc, temp, src[0][0]);
 		emit_flop(pc, 6, temp, temp);
 		for (c = 0; c < 4; c++) {
@@ -1072,7 +1427,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 				continue;
 			emit_mov(pc, dst[c], temp);
 		}
-		free_temp(pc, temp);
 		break;
 	case TGSI_OPCODE_FLR:
 		for (c = 0; c < 4; c++) {
@@ -1082,26 +1436,26 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_FRC:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
 			emit_flr(pc, temp, src[0][c]);
 			emit_sub(pc, dst[c], src[0][c], temp);
 		}
-		free_temp(pc, temp);
 		break;
 	case TGSI_OPCODE_KIL:
 		emit_kil(pc, src[0][0]);
 		emit_kil(pc, src[0][1]);
 		emit_kil(pc, src[0][2]);
 		emit_kil(pc, src[0][3]);
+		pc->p->cfg.fp.regs[2] |= 0x00100000;
 		break;
 	case TGSI_OPCODE_LIT:
 		emit_lit(pc, &dst[0], mask, &src[0][0]);
 		break;
 	case TGSI_OPCODE_LG2:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_flop(pc, 3, temp, src[0][0]);
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
@@ -1110,15 +1464,12 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_LRP:
+		temp = temp_temp(pc);
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			/*XXX: we can do better than this */
-			temp = alloc_temp(pc, NULL);
-			emit_neg(pc, temp, src[0][c]);
-			emit_mad(pc, temp, temp, src[2][c], src[2][c]);
-			emit_mad(pc, dst[c], src[0][c], src[1][c], temp);
-			free_temp(pc, temp);
+			emit_sub(pc, temp, src[1][c], src[2][c]);
+			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
 		}
 		break;
 	case TGSI_OPCODE_MAD:
@@ -1157,36 +1508,39 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_POW:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_pow(pc, temp, src[0][0], src[1][0]);
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
 			emit_mov(pc, dst[c], temp);
 		}
-		free_temp(pc, temp);
 		break;
 	case TGSI_OPCODE_RCP:
-		for (c = 0; c < 4; c++) {
+		for (c = 3; c >= 0; c--) {
 			if (!(mask & (1 << c)))
 				continue;
 			emit_flop(pc, 0, dst[c], src[0][0]);
 		}
 		break;
 	case TGSI_OPCODE_RSQ:
-		for (c = 0; c < 4; c++) {
+		for (c = 3; c >= 0; c--) {
 			if (!(mask & (1 << c)))
 				continue;
 			emit_flop(pc, 2, dst[c], src[0][0]);
 		}
 		break;
 	case TGSI_OPCODE_SCS:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_precossin(pc, temp, src[0][0]);
 		if (mask & (1 << 0))
 			emit_flop(pc, 5, dst[0], temp);
 		if (mask & (1 << 1))
 			emit_flop(pc, 4, dst[1], temp);
+		if (mask & (1 << 2))
+			emit_mov_immdval(pc, dst[2], 0.0);
+		if (mask & (1 << 3))
+			emit_mov_immdval(pc, dst[3], 1.0);
 		break;
 	case TGSI_OPCODE_SGE:
 		for (c = 0; c < 4; c++) {
@@ -1196,7 +1550,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_SIN:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		emit_precossin(pc, temp, src[0][0]);
 		emit_flop(pc, 4, temp, temp);
 		for (c = 0; c < 4; c++) {
@@ -1220,33 +1574,15 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_TEX:
+		emit_tex(pc, dst, mask, src[0], unit,
+			 inst->InstructionExtTexture.Texture, FALSE);
+		break;
 	case TGSI_OPCODE_TXP:
-	{
-		struct nv50_reg *t[4];
-		struct nv50_program_exec *e;
-
-		alloc_temp4(pc, t, 0);
-		emit_mov(pc, t[0], src[0][0]);
-		emit_mov(pc, t[1], src[0][1]);
-
-		e = exec(pc);
-		e->inst[0] = 0xf6400000;
-		e->inst[0] |= (unit << 9);
-		set_long(pc, e);
-		e->inst[1] |= 0x0000c004;
-		set_dst(pc, t[0], e);
-		emit(pc, e);
-
-		if (mask & (1 << 0)) emit_mov(pc, dst[0], t[0]);
-		if (mask & (1 << 1)) emit_mov(pc, dst[1], t[1]);
-		if (mask & (1 << 2)) emit_mov(pc, dst[2], t[2]);
-		if (mask & (1 << 3)) emit_mov(pc, dst[3], t[3]);
-
-		free_temp4(pc, t);
-	}
+		emit_tex(pc, dst, mask, src[0], unit,
+			 inst->InstructionExtTexture.Texture, TRUE);
 		break;
 	case TGSI_OPCODE_XPD:
-		temp = alloc_temp(pc, NULL);
+		temp = temp_temp(pc);
 		if (mask & (1 << 0)) {
 			emit_mul(pc, temp, src[0][2], src[1][1]);
 			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
@@ -1259,7 +1595,8 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 			emit_mul(pc, temp, src[0][1], src[1][0]);
 			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
 		}
-		free_temp(pc, temp);
+		if (mask & (1 << 3))
+			emit_mov_immdval(pc, dst[3], 1.0);
 		break;
 	case TGSI_OPCODE_END:
 		break;
@@ -1270,21 +1607,26 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 
 	if (sat) {
 		for (c = 0; c < 4; c++) {
-			struct nv50_program_exec *e;
-
 			if (!(mask & (1 << c)))
 				continue;
-			e = exec(pc);
+			emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT,
+				 CVT_F32_F32);
+		}
+	} else if (assimilate) {
+		for (c = 0; c < 4; c++)
+			if (rdst[c])
+				assimilate_temp(pc, rdst[c], dst[c]);
+	}
 
-			e->inst[0] = 0xa0000000; /* cvt */
-			set_long(pc, e);
-			e->inst[1] |= (6 << 29); /* cvt */
-			e->inst[1] |= 0x04000000; /* 32 bit */
-			e->inst[1] |= (1 << 14); /* src .f32 */
-			e->inst[1] |= ((1 << 5) << 14); /* .sat */
-			set_dst(pc, rdst[c], e);
-			set_src_0(pc, dst[c], e);
-			emit(pc, e);
+	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+		for (c = 0; c < 4; c++) {
+			if (!src[i][c])
+				continue;
+			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
+				FREE(src[i][c]);
+			else
+			if (src[i][c]->acc == pc->insn_cur)
+				release_hw(pc, src[i][c]);
 		}
 	}
 
@@ -1292,12 +1634,169 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 	return TRUE;
 }
 
+/* Narrow or widen *mask to the source components an opcode really reads
+ * (bit 0 = x .. bit 3 = w); opcodes not listed here leave *mask untouched.
+ * tx_prep uses this so that only interpolants which are needed get loaded. */
+static void
+insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)
+{
+	const unsigned opcode = insn->Instruction.Opcode;
+	unsigned used;
+
+	switch (opcode) {
+	case TGSI_OPCODE_DP3:
+		used = 0x7;
+		break;
+	case TGSI_OPCODE_DP4:
+	case TGSI_OPCODE_DPH:
+		used = 0xF;
+		break;
+	case TGSI_OPCODE_LIT:
+		used = 0xB;
+		break;
+	case TGSI_OPCODE_RCP:
+	case TGSI_OPCODE_RSQ:
+		used = 0x1;
+		break;
+	case TGSI_OPCODE_TEX:
+	case TGSI_OPCODE_TXP:
+		assert(insn->Instruction.Extended);
+		/* number of coordinates depends on the texture target */
+		switch (insn->InstructionExtTexture.Texture) {
+		case TGSI_TEXTURE_1D: used = 0x1; break;
+		case TGSI_TEXTURE_2D: used = 0x3; break;
+		default:              used = 0x7; break;
+		}
+		/* TXP additionally reads the w component */
+		if (opcode == TGSI_OPCODE_TXP)
+			used |= 0x8;
+		break;
+	default:
+		return;
+	}
+	*mask = used;
+}
+
+/* Note, for every TEMP and INPUT component this instruction touches, the
+ * current instruction number (pc->insn_nr) as its most recent access.
+ * r_usage[0] is indexed by temporary component, r_usage[1] by input
+ * component (register index * 4 + component); both are created lazily.
+ */
+static void
+prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
+		  unsigned *r_usage[2])
+{
+	const struct tgsi_full_instruction *insn = &tok->FullInstruction;
+	const struct tgsi_dst_register *dst =
+		&insn->FullDstRegisters[0].DstRegister;
+	unsigned mask = dst->WriteMask;
+	unsigned s, c;
+
+	if (!r_usage[0])
+		r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
+	if (!r_usage[1])
+		r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
+
+	/* writes to a temporary are recorded as accesses too */
+	if (dst->File == TGSI_FILE_TEMPORARY) {
+		for (c = 0; c < 4; c++) {
+			if (mask & (1 << c))
+				r_usage[0][dst->Index * 4 + c] = pc->insn_nr;
+		}
+	}
+
+	/* reads: only temporaries and inputs are tracked */
+	for (s = 0; s < insn->Instruction.NumSrcRegs; s++) {
+		const struct tgsi_full_src_register *src =
+			&insn->FullSrcRegisters[s];
+		unsigned *acc_p, swz;
+
+		if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
+			acc_p = r_usage[0];
+		else if (src->SrcRegister.File == TGSI_FILE_INPUT)
+			acc_p = r_usage[1];
+		else
+			continue;
+
+		/* the components read can differ from those written */
+		insn_adjust_mask(insn, &mask);
+
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+
+			swz = tgsi_util_get_full_src_register_extswizzle(src,
+									 c);
+
+			/* only X/Y/Z/W select a register component */
+			if (swz == TGSI_EXTSWIZZLE_X ||
+			    swz == TGSI_EXTSWIZZLE_Y ||
+			    swz == TGSI_EXTSWIZZLE_Z ||
+			    swz == TGSI_EXTSWIZZLE_W) {
+				acc_p[src->SrcRegister.Index * 4 + swz] =
+					pc->insn_nr;
+			}
+		}
+	}
+}
+
+/* Emit interpolation for the components of FP input i whose usage in acc[]
+ * differs from what pc->attr already records; returns their component mask.
+ * Each load takes one 8-bit slot of cfg.fp.map, which must not overflow. */
+static unsigned
+load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
+	       int *aid, int *p_oid)
+{
+	struct nv50_reg *iv;
+	int oid, c, n;
+	unsigned mask = 0;
+
+	iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
+	for (c = 0, n = i * 4; c < 4; c++, n++) {
+		oid = (*p_oid)++; /* advances even for skipped components */
+		pc->attr[n].type = P_TEMP;
+		pc->attr[n].index = i;
+		if (pc->attr[n].acc == acc[n])
+			continue;
+		mask |= (1 << c);
+
+		pc->attr[n].acc = acc[n];
+		pc->attr[n].rhw = pc->attr[n].hw = -1;
+		alloc_reg(pc, &pc->attr[n]);
+		pc->attr[n].rhw = (*aid)++;
+		emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
+		assert((*mid) / 4 < (int)(sizeof(pc->p->cfg.fp.map) /
+					  sizeof(pc->p->cfg.fp.map[0])));
+		pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
+		(*mid)++;
+		pc->p->cfg.fp.regs[1] += 0x00010001;
+	}
+	return mask;
+}
+
 static boolean
 nv50_program_tx_prep(struct nv50_pc *pc)
 {
 	struct tgsi_parse_context p;
 	boolean ret = FALSE;
 	unsigned i, c;
+	unsigned fcol, bcol, fcrd, depr;
+
+	/* count (centroid) perspective interpolations */
+	unsigned centroid_loads = 0;
+	unsigned perspect_loads = 0;
+
+	/* track register access for temps and attrs */
+	unsigned *r_usage[2];
+	r_usage[0] = NULL;
+	r_usage[1] = NULL;
+
+	depr = fcol = bcol = fcrd = 0xffff;
+
+	if (pc->p->type == PIPE_SHADER_FRAGMENT) {
+		pc->p->cfg.fp.regs[0] = 0x01000404;
+		pc->p->cfg.fp.regs[1] = 0x00000400;
+	}
 
 	tgsi_parse_init(&p, pc->p->pipe.tokens);
 	while (!tgsi_parse_end_of_tokens(&p)) {
@@ -1310,18 +1809,19 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 			const struct tgsi_full_immediate *imm =
 				&p.FullToken.FullImmediate;
 
-			ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
-				      imm->u.ImmediateFloat32[1].Float,
-				      imm->u.ImmediateFloat32[2].Float,
-				      imm->u.ImmediateFloat32[3].Float);
+			ctor_immd(pc, imm->u[0].Float,
+				      imm->u[1].Float,
+				      imm->u[2].Float,
+				      imm->u[3].Float);
 		}
 			break;
 		case TGSI_TOKEN_TYPE_DECLARATION:
 		{
 			const struct tgsi_full_declaration *d;
-			unsigned last;
+			unsigned last, first, mode;
 
 			d = &p.FullToken.FullDeclaration;
+			first = d->DeclarationRange.First;
 			last = d->DeclarationRange.Last;
 
 			switch (d->Declaration.File) {
@@ -1332,10 +1832,69 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 			case TGSI_FILE_OUTPUT:
 				if (pc->result_nr < (last + 1))
 					pc->result_nr = last + 1;
+
+				if (!d->Declaration.Semantic)
+					break;
+
+				switch (d->Semantic.SemanticName) {
+				case TGSI_SEMANTIC_POSITION:
+					depr = first;
+					pc->p->cfg.fp.regs[2] |= 0x00000100;
+					pc->p->cfg.fp.regs[3] |= 0x00000011;
+					break;
+				default:
+					break;
+				}
+
 				break;
 			case TGSI_FILE_INPUT:
+			{
 				if (pc->attr_nr < (last + 1))
 					pc->attr_nr = last + 1;
+
+				if (pc->p->type != PIPE_SHADER_FRAGMENT)
+					break;
+
+				switch (d->Declaration.Interpolate) {
+				case TGSI_INTERPOLATE_CONSTANT:
+					mode = INTERP_FLAT;
+					break;
+				case TGSI_INTERPOLATE_PERSPECTIVE:
+					mode = INTERP_PERSPECTIVE;
+					break;
+				default:
+					mode = INTERP_LINEAR;
+					break;
+				}
+
+				if (d->Declaration.Semantic) {
+					switch (d->Semantic.SemanticName) {
+					case TGSI_SEMANTIC_POSITION:
+						fcrd = first;
+						break;
+					case TGSI_SEMANTIC_COLOR:
+						fcol = first;
+						mode = INTERP_PERSPECTIVE;
+						break;
+					case TGSI_SEMANTIC_BCOLOR:
+						bcol = first;
+						mode = INTERP_PERSPECTIVE;
+						break;
+					}
+				}
+
+				if (d->Declaration.Centroid) {
+					mode |= INTERP_CENTROID;
+					if (mode & INTERP_PERSPECTIVE)
+						centroid_loads++;
+				} else
+				if (mode & INTERP_PERSPECTIVE)
+					perspect_loads++;
+
+				assert(last < 32);
+				for (i = first; i <= last; i++)
+					pc->interp_mode[i] = mode;
+			}
 				break;
 			case TGSI_FILE_CONSTANT:
 				if (pc->param_nr < (last + 1))
@@ -1351,6 +1910,8 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 		}
 			break;
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
+			pc->insn_nr++;
+			prep_inspect_insn(pc, tok, r_usage);
 			break;
 		default:
 			break;
@@ -1366,56 +1927,95 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 			for (c = 0; c < 4; c++) {
 				pc->temp[i*4+c].type = P_TEMP;
 				pc->temp[i*4+c].hw = -1;
+				pc->temp[i*4+c].rhw = -1;
 				pc->temp[i*4+c].index = i;
+				pc->temp[i*4+c].acc = r_usage[0][i*4+c];
 			}
 		}
 	}
 
 	if (pc->attr_nr) {
-		struct nv50_reg *iv = NULL;
-		int aid = 0;
+		int oid = 4, mid = 4, aid = 0;
+		/* oid = VP output id
+		 * aid = FP attribute/interpolant id
+		 * mid = VP output mapping field ID
+		 */
 
 		pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
 		if (!pc->attr)
 			goto out_err;
 
 		if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-			iv = alloc_temp(pc, NULL);
-			emit_interp(pc, iv, iv, NULL);
-			emit_flop(pc, 0, iv, iv);
-			aid++;
-		}
+			/* position should be loaded first */
+			if (fcrd != 0xffff) {
+				unsigned mask;
+				mid = 0;
+				mask = load_fp_attrib(pc, fcrd, r_usage[1],
+						      &mid, &aid, &oid);
+				oid = 0;
+				pc->p->cfg.fp.regs[1] |= (mask << 24);
+				pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
+			}
+			pc->p->cfg.fp.map[0] += 0x03020100;
 
-		for (i = 0; i < pc->attr_nr; i++) {
-			struct nv50_reg *a = &pc->attr[i*4];
+			/* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
 
-			for (c = 0; c < 4; c++) {
-				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-					struct nv50_reg *at =
-						alloc_temp(pc, NULL);
-					pc->attr[i*4+c].type = at->type;
-					pc->attr[i*4+c].hw = at->hw;
-					pc->attr[i*4+c].index = at->index;
+			if (perspect_loads) {
+				pc->iv_p = alloc_temp(pc, NULL);
+
+				if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
+					pc->p->cfg.fp.regs[1] |= 0x08000000;
+					pc->iv_p->rhw = aid++;
+					emit_interp(pc, pc->iv_p, NULL,
+						    INTERP_LINEAR);
+					emit_flop(pc, 0, pc->iv_p, pc->iv_p);
 				} else {
-					pc->p->cfg.vp.attr[aid/32] |=
-						(1 << (aid % 32));
-					pc->attr[i*4+c].type = P_ATTR;
-					pc->attr[i*4+c].hw = aid++;
-					pc->attr[i*4+c].index = i;
+					pc->iv_p->rhw = aid - 1;
+					emit_flop(pc, 0, pc->iv_p,
+						  &pc->attr[fcrd * 4 + 3]);
 				}
 			}
 
-			if (pc->p->type != PIPE_SHADER_FRAGMENT)
-				continue;
+			if (centroid_loads) {
+				pc->iv_c = alloc_temp(pc, NULL);
+				pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
+				emit_interp(pc, pc->iv_c, NULL,
+					    INTERP_CENTROID);
+				emit_flop(pc, 0, pc->iv_c, pc->iv_c);
+				pc->p->cfg.fp.regs[1] |= 0x08000000;
+			}
 
-			emit_interp(pc, &a[0], &a[0], iv);
-			emit_interp(pc, &a[1], &a[1], iv);
-			emit_interp(pc, &a[2], &a[2], iv);
-			emit_interp(pc, &a[3], &a[3], iv);
-		}
+			for (c = 0; c < 4; c++) {
+				/* I don't know what these values do, but
+				 * let's set them like the blob does:
+				 */
+				if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
+					pc->p->cfg.fp.regs[0] += 0x00010000;
+				if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
+					pc->p->cfg.fp.regs[0] += 0x00010000;
+			}
 
-		if (iv)
-			free_temp(pc, iv);
+			for (i = 0; i < pc->attr_nr; i++)
+				load_fp_attrib(pc, i, r_usage[1],
+					       &mid, &aid, &oid);
+
+			if (pc->iv_p)
+				free_temp(pc, pc->iv_p);
+			if (pc->iv_c)
+				free_temp(pc, pc->iv_c);
+
+			pc->p->cfg.fp.high_map = (mid / 4);
+			pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
+		} else {
+			/* vertex program */
+			for (i = 0; i < pc->attr_nr * 4; i++) {
+				pc->p->cfg.vp.attr[aid / 32] |=
+					(1 << (aid % 32));
+				pc->attr[i].type = P_ATTR;
+				pc->attr[i].hw = aid++;
+				pc->attr[i].index = i / 4;
+			}
+		}
 	}
 
 	if (pc->result_nr) {
@@ -1430,12 +2030,20 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
 					pc->result[i*4+c].type = P_TEMP;
 					pc->result[i*4+c].hw = -1;
+					pc->result[i*4+c].rhw = (i == depr) ?
+						-1 : rid++;
 				} else {
 					pc->result[i*4+c].type = P_RESULT;
 					pc->result[i*4+c].hw = rid++;
 				}
 				pc->result[i*4+c].index = i;
 			}
+
+			if (pc->p->type == PIPE_SHADER_FRAGMENT &&
+			    depr != 0xffff) {
+				pc->result[depr * 4 + 2].rhw =
+					(pc->result_nr - 1) * 4;
+			}
 		}
 	}
 
@@ -1456,7 +2064,7 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 	}
 
 	if (pc->immd_nr) {
-		int rid = pc->param_nr * 4;
+		int rid = 0;
 
 		pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
 		if (!pc->immd)
@@ -1473,15 +2081,38 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 
 	ret = TRUE;
 out_err:
+	if (r_usage[0])
+		FREE(r_usage[0]);
+	if (r_usage[1])
+		FREE(r_usage[1]);
+
 	tgsi_parse_free(&p);
 	return ret;
 }
 
+/* Dispose of a translation context: release each register array that was
+ * allocated for it, then the context structure itself. */
+static void
+free_nv50_pc(struct nv50_pc *pc)
+{
+	void *owned[] = {
+		pc->immd, pc->param, pc->result, pc->attr, pc->temp
+	};
+	unsigned n;
+
+	for (n = 0; n < sizeof(owned) / sizeof(owned[0]); n++)
+		if (owned[n])
+			FREE(owned[n]);
+
+	FREE(pc);
+}
+
 static boolean
 nv50_program_tx(struct nv50_program *p)
 {
 	struct tgsi_parse_context parse;
 	struct nv50_pc *pc;
+	unsigned k;
 	boolean ret;
 
 	pc = CALLOC_STRUCT(nv50_pc);
@@ -1498,10 +2129,16 @@ nv50_program_tx(struct nv50_program *p)
 	while (!tgsi_parse_end_of_tokens(&parse)) {
 		const union tgsi_full_token *tok = &parse.FullToken;
 
+		/* don't allow half insn/immd on first and last instruction */
+		pc->allow32 = TRUE;
+		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
+			pc->allow32 = FALSE;
+
 		tgsi_parse_token(&parse);
 
 		switch (tok->Token.Type) {
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
+			++pc->insn_cur;
 			ret = nv50_program_tx_insn(pc, tok);
 			if (ret == FALSE)
 				goto out_err;
@@ -1515,8 +2152,40 @@ nv50_program_tx(struct nv50_program *p)
 		struct nv50_reg out;
 
 		out.type = P_TEMP;
-		for (out.hw = 0; out.hw < pc->result_nr * 4; out.hw++)
-			emit_mov(pc, &out, &pc->result[out.hw]);
+		for (k = 0; k < pc->result_nr * 4; k++) {
+			if (pc->result[k].rhw == -1)
+				continue;
+			if (pc->result[k].hw != pc->result[k].rhw) {
+				out.hw = pc->result[k].rhw;
+				emit_mov(pc, &out, &pc->result[k]);
+			}
+			if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
+				pc->p->cfg.high_result = pc->result[k].rhw + 1;
+		}
+	}
+
+	/* look for single half instructions and make them long */
+	struct nv50_program_exec *e, *e_prev;
+
+	for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
+		if (!is_long(e))
+			k++;
+
+		if (!e->next || is_long(e->next)) {
+			if (k & 1)
+				convert_to_long(pc, e);
+			k = 0;
+		}
+
+		if (e->next)
+			e_prev = e;
+	}
+
+	if (!is_long(pc->p->exec_tail)) {
+		/* this may occur if moving FP results */
+		assert(e_prev && !is_long(e_prev));
+		convert_to_long(pc, e_prev);
+		convert_to_long(pc, pc->p->exec_tail);
 	}
 
 	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
@@ -1530,6 +2199,7 @@ out_err:
 	tgsi_parse_free(&parse);
 
 out_cleanup:
+	free_nv50_pc(pc);
 	return ret;
 }
 
@@ -1543,16 +2213,16 @@ nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
 
 static void
 nv50_program_upload_data(struct nv50_context *nv50, float *map,
-			 unsigned start, unsigned count)
+			unsigned start, unsigned count, unsigned cbuf)
 {
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->base.channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 
 	while (count) {
 		unsigned nr = count > 2047 ? 2047 : count;
 
 		BEGIN_RING(chan, tesla, 0x00000f00, 1);
-		OUT_RING  (chan, (NV50_CB_PMISC << 0) | (start << 8));
+		OUT_RING  (chan, (cbuf << 0) | (start << 8));
 		BEGIN_RING(chan, tesla, 0x40000f04, nr);
 		OUT_RINGp (chan, map, nr);
 
@@ -1565,70 +2235,93 @@ nv50_program_upload_data(struct nv50_context *nv50, float *map,
 static void
 nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
 {
-	struct nouveau_winsys *nvws = nv50->screen->nvws;
-	struct pipe_winsys *ws = nv50->pipe.winsys;
-	unsigned nr = p->param_nr + p->immd_nr;
+	struct pipe_screen *pscreen = nv50->pipe.screen;
 
-	if (!p->data && nr) {
-		struct nouveau_resource *heap = nv50->screen->vp_data_heap;
+	if (!p->data[0] && p->immd_nr) { /* data[0]: immediates */
+		struct nouveau_resource *heap = nv50->screen->immd_heap[0];
 
-		if (nvws->res_alloc(heap, nr, p, &p->data)) {
-			while (heap->next && heap->size < nr) {
+		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
+			while (heap->next && heap->size < p->immd_nr) {
 				struct nv50_program *evict = heap->next->priv;
-				nvws->res_free(&evict->data);
+				nouveau_resource_free(&evict->data[0]);
 			}
 
-			if (nvws->res_alloc(heap, nr, p, &p->data))
+			if (nouveau_resource_alloc(heap, p->immd_nr, p,
+						   &p->data[0]))
 				assert(0);
 		}
+
+		/* upload once per allocation; redone only after data[0] is freed */
+		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
+					 p->immd_nr, NV50_CB_PMISC);
 	}
 
-	if (p->param_nr) {
-		float *map = ws->buffer_map(ws, nv50->constbuf[p->type],
-					    PIPE_BUFFER_USAGE_CPU_READ);
-		nv50_program_upload_data(nv50, map, p->data->start,
-					 p->param_nr);
-		ws->buffer_unmap(ws, nv50->constbuf[p->type]);
+	if (!p->data[1] && p->param_nr) { /* data[1]: parameters */
+		struct nouveau_resource *heap =
+			nv50->screen->parm_heap[p->type];
+
+		if (nouveau_resource_alloc(heap, p->param_nr, p, &p->data[1])) {
+			while (heap->next && heap->size < p->param_nr) {
+				struct nv50_program *evict = heap->next->priv;
+				nouveau_resource_free(&evict->data[1]);
+			}
+			/* retry now that other programs' space was released */
+			if (nouveau_resource_alloc(heap, p->param_nr, p,
+						   &p->data[1]))
+				assert(0);
+		}
 	}
 
-	if (p->immd_nr) {
-		nv50_program_upload_data(nv50, p->immd,
-					 p->data->start + p->param_nr,
-					 p->immd_nr);
+	if (p->param_nr) { /* parameters are uploaded on every call */
+		unsigned cbuf = NV50_CB_PVP; /* unless a fragment program */
+		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
+					     PIPE_BUFFER_USAGE_CPU_READ);
+		if (p->type == PIPE_SHADER_FRAGMENT)
+			cbuf = NV50_CB_PFP;
+		nv50_program_upload_data(nv50, map, p->data[1]->start,
+					 p->param_nr, cbuf);
+		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
 	}
 }
 
 static void
 nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
 {
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->base.channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
-	struct pipe_screen *screen = nv50->pipe.screen;
 	struct nv50_program_exec *e;
 	struct nouveau_stateobj *so;
 	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
 	unsigned start, count, *up, *ptr;
 	boolean upload = FALSE;
 
-	if (!p->buffer) {
-		p->buffer = screen->buffer_create(screen, 0x100, 0, p->exec_size * 4);
+	if (!p->bo) {
+		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
+			       p->exec_size * 4, &p->bo);
 		upload = TRUE;
 	}
 
-	if (p->data && p->data->start != p->data_start) {
+	if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
+		(p->data[1] && p->data[1]->start != p->data_start[1])) {
 		for (e = p->exec_head; e; e = e->next) {
-			unsigned ei, ci;
+			unsigned ei, ci, bs;
 
 			if (e->param.index < 0)
 				continue;
+			bs = (e->inst[1] >> 22) & 0x07;
+			assert(bs < 2);
 			ei = e->param.shift >> 5;
-			ci = e->param.index + p->data->start;
+			ci = e->param.index + p->data[bs]->start;
 
 			e->inst[ei] &= ~e->param.mask;
 			e->inst[ei] |= (ci << e->param.shift);
 		}
 
-		p->data_start = p->data->start;
+		if (p->data[0])
+			p->data_start[0] = p->data[0]->start;
+		if (p->data[1])
+			p->data_start[1] = p->data[1]->start;
+
 		upload = TRUE;
 	}
 
@@ -1637,13 +2330,11 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
 
 #ifdef NV50_PROGRAM_DUMP
 	NOUVEAU_ERR("-------\n");
-	up = ptr = MALLOC(p->exec_size * 4);
 	for (e = p->exec_head; e; e = e->next) {
 		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
 		if (is_long(e))
 			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
 	}
-
 #endif
 
 	up = ptr = MALLOC(p->exec_size * 4);
@@ -1655,20 +2346,20 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
 
 	so = so_new(4,2);
 	so_method(so, nv50->screen->tesla, 0x1280, 3);
-	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
-	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0);
+	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
 	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
 
 	start = 0; count = p->exec_size;
 	while (count) {
-		struct nouveau_winsys *nvws = nv50->screen->nvws;
+		struct nouveau_channel *chan = nv50->screen->base.channel;
 		unsigned nr;
 
-		so_emit(nvws, so);
+		so_emit(chan, so);
 
 		nr = MIN2(count, 2047);
-		nr = MIN2(nvws->channel->pushbuf->remaining, nr);
-		if (nvws->channel->pushbuf->remaining < (nr + 3)) {
+		nr = MIN2(chan->pushbuf->remaining, nr);
+		if (chan->pushbuf->remaining < (nr + 3)) {
 			FIRE_RING(chan);
 			continue;
 		}
@@ -1704,10 +2395,10 @@ nv50_vertprog_validate(struct nv50_context *nv50)
 
 	so = so_new(13, 2);
 	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
-	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
-		  NOUVEAU_BO_HIGH, 0, 0);
-	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
-		  NOUVEAU_BO_LOW, 0, 0);
+	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		      NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		      NOUVEAU_BO_LOW, 0, 0);
 	so_method(so, tesla, 0x1650, 2);
 	so_data  (so, p->cfg.vp.attr[0]);
 	so_data  (so, p->cfg.vp.attr[1]);
@@ -1728,6 +2419,7 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nv50_program *p = nv50->fragprog;
 	struct nouveau_stateobj *so;
+	unsigned i;
 
 	if (!p->translated) {
 		nv50_program_validate(nv50, p);
@@ -1740,22 +2432,27 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 
 	so = so_new(64, 2);
 	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
-	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
-		  NOUVEAU_BO_HIGH, 0, 0);
-	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
-		  NOUVEAU_BO_LOW, 0, 0);
+	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		      NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		      NOUVEAU_BO_LOW, 0, 0);
 	so_method(so, tesla, 0x1904, 4);
-	so_data  (so, 0x00040404); /* p: 0x01000404 */
+	so_data  (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
 	so_data  (so, 0x00000004);
 	so_data  (so, 0x00000000);
 	so_data  (so, 0x00000000);
-	so_method(so, tesla, 0x16bc, 3); /*XXX: fixme */
-	so_data  (so, 0x03020100);
-	so_data  (so, 0x07060504);
-	so_data  (so, 0x0b0a0908);
+	so_method(so, tesla, 0x16bc, p->cfg.fp.high_map);
+	for (i = 0; i < p->cfg.fp.high_map; i++)
+		so_data(so, p->cfg.fp.map[i]);
 	so_method(so, tesla, 0x1988, 2);
-	so_data  (so, 0x08080408); //0x08040404); /* p: 0x0f000401 */
+	so_data  (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
 	so_data  (so, p->cfg.high_temp);
+	so_method(so, tesla, 0x1298, 1);
+	so_data  (so, p->cfg.high_result);
+	so_method(so, tesla, 0x19a8, 1);
+	so_data  (so, p->cfg.fp.regs[2]);
+	so_method(so, tesla, 0x196c, 1);
+	so_data  (so, p->cfg.fp.regs[3]);
 	so_method(so, tesla, 0x1414, 1);
 	so_data  (so, 0); /* program start offset */
 	so_ref(so, &nv50->state.fragprog);
@@ -1765,8 +2462,6 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 void
 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
 {
-	struct pipe_screen *pscreen = nv50->pipe.screen;
-
 	while (p->exec_head) {
 		struct nv50_program_exec *e = p->exec_head;
 
@@ -1776,10 +2471,10 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
 	p->exec_tail = NULL;
 	p->exec_size = 0;
 
-	if (p->buffer)
-		pipe_buffer_reference(&p->buffer, NULL);
+	nouveau_bo_ref(NULL, &p->bo);
 
-	nv50->screen->nvws->res_free(&p->data);
+	nouveau_resource_free(&p->data[0]);
+	nouveau_resource_free(&p->data[1]);
 
 	p->translated = 0;
 }
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 78deed6a38..096e0476aa 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -24,10 +24,10 @@ struct nv50_program {
 	struct nv50_program_exec *exec_head;
 	struct nv50_program_exec *exec_tail;
 	unsigned exec_size;
-	struct nouveau_resource *data;
-	unsigned data_start;
+	struct nouveau_resource *data[2];
+	unsigned data_start[2];
 
-	struct pipe_buffer *buffer;
+	struct nouveau_bo *bo;
 
 	float *immd;
 	unsigned immd_nr;
@@ -39,6 +39,11 @@ struct nv50_program {
 		struct {
 			unsigned attr[2];
 		} vp;
+		struct {
+			unsigned regs[4];
+			unsigned map[5];
+			unsigned high_map;
+		} fp;
 	} cfg;
 };
 
diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c
index 35cebdbdc3..940e04365f 100644
--- a/src/gallium/drivers/nv50/nv50_query.c
+++ b/src/gallium/drivers/nv50/nv50_query.c
@@ -26,7 +26,7 @@
 #include "nv50_context.h"
 
 struct nv50_query {
-	struct pipe_buffer *buffer;
+	struct nouveau_bo *bo;
 	unsigned type;
 	boolean ready;
 	uint64_t result;
@@ -41,14 +41,16 @@ nv50_query(struct pipe_query *pipe)
 static struct pipe_query *
 nv50_query_create(struct pipe_context *pipe, unsigned type)
 {
-	struct pipe_screen *screen = pipe->screen;
+	struct nouveau_device *dev = nouveau_screen(pipe->screen)->device;
 	struct nv50_query *q = CALLOC_STRUCT(nv50_query);
+	int ret;
 
 	assert (q->type == PIPE_QUERY_OCCLUSION_COUNTER);
 	q->type = type;
 
-	q->buffer = screen->buffer_create(screen, 256, 0, 16);
-	if (!q->buffer) {
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM | NOUVEAU_BO_MAP, 256,
+			     16, &q->bo);
+	if (ret) {
 		FREE(q);
 		return NULL;
 	}
@@ -62,7 +64,7 @@ nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
 	struct nv50_query *q = nv50_query(pq);
 
 	if (q) {
-		pipe_buffer_reference(&q->buffer, NULL);
+		nouveau_bo_ref(NULL, &q->bo);
 		FREE(q);
 	}
 }
@@ -71,7 +73,7 @@ static void
 nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 {
 	struct nv50_context *nv50 = nv50_context(pipe);
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->base.channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nv50_query *q = nv50_query(pq);
 
@@ -87,15 +89,14 @@ static void
 nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
 {
 	struct nv50_context *nv50 = nv50_context(pipe);
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->base.channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nv50_query *q = nv50_query(pq);
-	struct nouveau_bo *bo = nv50->screen->nvws->get_bo(q->buffer);
 
 	WAIT_RING (chan, 5);
 	BEGIN_RING(chan, tesla, 0x1b00, 4);
-	OUT_RELOCh(chan, bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RELOCh(chan, q->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RELOCl(chan, q->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 	OUT_RING  (chan, 0x00000000);
 	OUT_RING  (chan, 0x0100f002);
 	FIRE_RING (chan);
@@ -105,7 +106,6 @@ static boolean
 nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
 		  boolean wait, uint64_t *result)
 {
-	struct pipe_winsys *ws = pipe->winsys;
 	struct nv50_query *q = nv50_query(pq);
 
 	/*XXX: Want to be able to return FALSE here instead of blocking
@@ -113,11 +113,10 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
 	 */
 
 	if (!q->ready) {
-		uint32_t *map = ws->buffer_map(ws, q->buffer,
-					       PIPE_BUFFER_USAGE_CPU_READ);
-		q->result = map[1];
+		nouveau_bo_map(q->bo, NOUVEAU_BO_RD);
+		q->result = ((uint32_t *)q->bo->map)[1];
 		q->ready = TRUE;
-		ws->buffer_unmap(ws, q->buffer);
+		nouveau_bo_unmap(q->bo);
 	}
 
 	*result = q->result;
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index 2980564594..ce8f906b15 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -22,8 +22,6 @@
 
 #include "pipe/p_screen.h"
 
-#include "util/u_simple_screen.h"
-
 #include "nv50_context.h"
 #include "nv50_screen.h"
 
@@ -39,7 +37,15 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
 		switch (format) {
 		case PIPE_FORMAT_A8R8G8B8_UNORM:
 		case PIPE_FORMAT_R5G6B5_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+	} else
+	if (tex_usage & PIPE_TEXTURE_USAGE_DEPTH_STENCIL) {
+		switch (format) {
 		case PIPE_FORMAT_Z24S8_UNORM:
+		case PIPE_FORMAT_Z24X8_UNORM:
 		case PIPE_FORMAT_Z16_UNORM:
 			return TRUE;
 		default:
@@ -68,23 +74,6 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
 	return FALSE;
 }
 
-static const char *
-nv50_screen_get_name(struct pipe_screen *pscreen)
-{
-	struct nv50_screen *screen = nv50_screen(pscreen);
-	struct nouveau_device *dev = screen->nvws->channel->device;
-	static char buffer[128];
-
-	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
-	return buffer;
-}
-
-static const char *
-nv50_screen_get_vendor(struct pipe_screen *pscreen)
-{
-	return "nouveau";
-}
-
 static int
 nv50_screen_get_param(struct pipe_screen *pscreen, int param)
 {
@@ -120,6 +109,10 @@ nv50_screen_get_param(struct pipe_screen *pscreen, int param)
 		return 1;
 	case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
 		return 0;
+	case PIPE_CAP_TGSI_CONT_SUPPORTED:
+		return 0;
+	case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+		return 1;
 	case NOUVEAU_CAP_HW_VTXBUF:
 		return 1;
 	case NOUVEAU_CAP_HW_IDXBUF:
@@ -153,37 +146,64 @@ nv50_screen_get_paramf(struct pipe_screen *pscreen, int param)
 static void
 nv50_screen_destroy(struct pipe_screen *pscreen)
 {
-	FREE(pscreen);
+	struct nv50_screen *screen = nv50_screen(pscreen);
+	/* Also called from nv50_screen_create() on its failure paths. */
+	nouveau_notifier_free(&screen->sync); /* sync notifier */
+	nouveau_grobj_free(&screen->tesla); /* 3D object */
+	nouveau_grobj_free(&screen->eng2d); /* 2D object */
+	nouveau_grobj_free(&screen->m2mf); /* DMA engine object */
+	nouveau_screen_fini(&screen->base); /* pairs with nouveau_screen_init() */
+	FREE(screen);
 }
 
 struct pipe_screen *
-nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
+nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 {
 	struct nv50_screen *screen = CALLOC_STRUCT(nv50_screen);
+	struct nouveau_channel *chan;
+	struct pipe_screen *pscreen;
 	struct nouveau_stateobj *so;
-	unsigned tesla_class = 0, ret;
-	unsigned chipset = nvws->channel->device->chipset;
-	int i;
+	unsigned chipset = dev->chipset;
+	unsigned tesla_class = 0;
+	int ret, i;
 
 	if (!screen)
 		return NULL;
-	screen->nvws = nvws;
+	pscreen = &screen->base.base;
+
+	ret = nouveau_screen_init(&screen->base, dev);
+	if (ret) {
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+	chan = screen->base.channel;
+
+	pscreen->winsys = ws;
+	pscreen->destroy = nv50_screen_destroy;
+	pscreen->get_param = nv50_screen_get_param;
+	pscreen->get_paramf = nv50_screen_get_paramf;
+	pscreen->is_format_supported = nv50_screen_is_format_supported;
+
+	nv50_screen_init_miptree_functions(pscreen);
+	nv50_transfer_init_screen_functions(pscreen);
 
 	/* DMA engine object */
-	ret = nvws->grobj_alloc(nvws, 0x5039, &screen->m2mf);
+	ret = nouveau_grobj_alloc(chan, 0xbeef5039, 0x5039, &screen->m2mf);
 	if (ret) {
 		NOUVEAU_ERR("Error creating M2MF object: %d\n", ret);
-		nv50_screen_destroy(&screen->pipe);
+		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
+	BIND_RING(chan, screen->m2mf, 1);
 
 	/* 2D object */
-	ret = nvws->grobj_alloc(nvws, NV50_2D, &screen->eng2d);
+	ret = nouveau_grobj_alloc(chan, 0xbeef502d, 0x502d, &screen->eng2d);
 	if (ret) {
 		NOUVEAU_ERR("Error creating 2D object: %d\n", ret);
-		nv50_screen_destroy(&screen->pipe);
+		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
+	BIND_RING(chan, screen->eng2d, 2);
 
 	/* 3D object */
 	switch (chipset & 0xf0) {
@@ -199,70 +219,55 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
 		break;
 	default:
 		NOUVEAU_ERR("Not a known NV50 chipset: NV%02x\n", chipset);
-		nv50_screen_destroy(&screen->pipe);
+		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
 
 	if (tesla_class == 0) {
 		NOUVEAU_ERR("Unknown G8x chipset: NV%02x\n", chipset);
-		nv50_screen_destroy(&screen->pipe);
+		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
 
-	ret = nvws->grobj_alloc(nvws, tesla_class, &screen->tesla);
+	ret = nouveau_grobj_alloc(chan, 0xbeef5097, tesla_class, &screen->tesla);
 	if (ret) {
 		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
-		nv50_screen_destroy(&screen->pipe);
+		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
+	BIND_RING(chan, screen->tesla, 3);
 
 	/* Sync notifier */
-	ret = nvws->notifier_alloc(nvws, 1, &screen->sync);
+	ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync);
 	if (ret) {
 		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
-		nv50_screen_destroy(&screen->pipe);
+		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
 
-        /* Setup the pipe */
-	screen->pipe.winsys = ws;
-
-	screen->pipe.destroy = nv50_screen_destroy;
-
-	screen->pipe.get_name = nv50_screen_get_name;
-	screen->pipe.get_vendor = nv50_screen_get_vendor;
-	screen->pipe.get_param = nv50_screen_get_param;
-	screen->pipe.get_paramf = nv50_screen_get_paramf;
-
-	screen->pipe.is_format_supported = nv50_screen_is_format_supported;
-
-	nv50_screen_init_miptree_functions(&screen->pipe);
-	nv50_transfer_init_screen_functions(&screen->pipe);
-	u_simple_screen_init(&screen->pipe);
-
 	/* Static M2MF init */
 	so = so_new(32, 0);
 	so_method(so, screen->m2mf, 0x0180, 3);
 	so_data  (so, screen->sync->handle);
-	so_data  (so, screen->nvws->channel->vram->handle);
-	so_data  (so, screen->nvws->channel->vram->handle);
-	so_emit(nvws, so);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->vram->handle);
+	so_emit(chan, so);
 	so_ref (NULL, &so);
 
 	/* Static 2D init */
 	so = so_new(64, 0);
 	so_method(so, screen->eng2d, NV50_2D_DMA_NOTIFY, 4);
 	so_data  (so, screen->sync->handle);
-	so_data  (so, screen->nvws->channel->vram->handle);
-	so_data  (so, screen->nvws->channel->vram->handle);
-	so_data  (so, screen->nvws->channel->vram->handle);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->vram->handle);
 	so_method(so, screen->eng2d, NV50_2D_OPERATION, 1);
 	so_data  (so, NV50_2D_OPERATION_SRCCOPY);
 	so_method(so, screen->eng2d, 0x0290, 1);
 	so_data  (so, 0);
 	so_method(so, screen->eng2d, 0x0888, 1);
 	so_data  (so, 1);
-	so_emit(nvws, so);
+	so_emit(chan, so);
 	so_ref(NULL, &so);
 
 	/* Static tesla init */
@@ -275,11 +280,11 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
 	so_method(so, screen->tesla, NV50TCL_DMA_UNK0(0),
 				     NV50TCL_DMA_UNK0__SIZE);
 	for (i = 0; i < NV50TCL_DMA_UNK0__SIZE; i++)
-		so_data(so, nvws->channel->vram->handle);
+		so_data(so, chan->vram->handle);
 	so_method(so, screen->tesla, NV50TCL_DMA_UNK1(0),
 				     NV50TCL_DMA_UNK1__SIZE);
 	for (i = 0; i < NV50TCL_DMA_UNK1__SIZE; i++)
-		so_data(so, nvws->channel->vram->handle);
+		so_data(so, chan->vram->handle);
 	so_method(so, screen->tesla, 0x121c, 1);
 	so_data  (so, 1);
 
@@ -290,27 +295,81 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
 	so_method(so, screen->tesla, 0x16b8, 1);
 	so_data  (so, 8);
 
-	/* Shared constant buffer */
-	screen->constbuf = screen->pipe.buffer_create(&screen->pipe, 0, 0, 128 * 4 * 4);
-	if (nvws->res_init(&screen->vp_data_heap, 0, 128)) {
-		NOUVEAU_ERR("Error initialising constant buffer\n");
-		nv50_screen_destroy(&screen->pipe);
+	/* constant buffers for immediates and VP/FP parameters */
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 128*4*4,
+			     &screen->constbuf_misc[0]);
+	if (ret) {
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	for (i = 0; i < 2; i++) {
+		ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 128*4*4,
+				     &screen->constbuf_parm[i]);
+		if (ret) {
+			nv50_screen_destroy(pscreen);
+			return NULL;
+		}
+	}
+
+	if (nouveau_resource_init(&screen->immd_heap[0], 0, 128) ||
+		nouveau_resource_init(&screen->parm_heap[0], 0, 128) ||
+		nouveau_resource_init(&screen->parm_heap[1], 0, 128))
+	{
+		NOUVEAU_ERR("Error initialising constant buffers.\n");
+		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
 
+	/*
+	// map constant buffers:
+	//  B = buffer ID (maybe more than 1 byte)
+	//  N = CB index used in shader instruction
+	//  P = program type (0 = VP, 2 = GP, 3 = FP)
+	so_method(so, screen->tesla, 0x1694, 1);
+	so_data  (so, 0x000BBNP1);
+	*/
+
+	so_method(so, screen->tesla, 0x1280, 3);
+	so_reloc (so, screen->constbuf_misc[0], 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, screen->constbuf_misc[0], 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+	so_data  (so, (NV50_CB_PMISC << 16) | 0x00000800);
+	so_method(so, screen->tesla, 0x1694, 1);
+	so_data  (so, 0x00000001 | (NV50_CB_PMISC << 12));
+	so_method(so, screen->tesla, 0x1694, 1);
+	so_data  (so, 0x00000031 | (NV50_CB_PMISC << 12));
+
 	so_method(so, screen->tesla, 0x1280, 3);
-	so_reloc (so, screen->constbuf, 0, NOUVEAU_BO_VRAM |
+	so_reloc (so, screen->constbuf_parm[0], 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
-	so_reloc (so, screen->constbuf, 0, NOUVEAU_BO_VRAM |
+	so_reloc (so, screen->constbuf_parm[0], 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
-	so_data  (so, (NV50_CB_PMISC << 16) | 0x00001000);
+	so_data  (so, (NV50_CB_PVP << 16) | 0x00000800);
+	so_method(so, screen->tesla, 0x1694, 1);
+	so_data  (so, 0x00000101 | (NV50_CB_PVP << 12));
+
+	so_method(so, screen->tesla, 0x1280, 3);
+	so_reloc (so, screen->constbuf_parm[1], 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, screen->constbuf_parm[1], 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+	so_data  (so, (NV50_CB_PFP << 16) | 0x00000800);
+	so_method(so, screen->tesla, 0x1694, 1);
+	so_data  (so, 0x00000131 | (NV50_CB_PFP << 12));
 
 	/* Texture sampler/image unit setup - we abuse the constant buffer
 	 * upload mechanism for the moment to upload data to the tex config
 	 * blocks.  At some point we *may* want to go the NVIDIA way of doing
 	 * things?
 	 */
-	screen->tic = screen->pipe.buffer_create(&screen->pipe, 0, 0, 32 * 8 * 4);
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 32*8*4, &screen->tic);
+	if (ret) {
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+
 	so_method(so, screen->tesla, 0x1280, 3);
 	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
@@ -324,7 +383,12 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
 		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
 	so_data  (so, 0x00000800);
 
-	screen->tsc = screen->pipe.buffer_create(&screen->pipe, 0, 0, 32 * 8 * 4);
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 32*8*4, &screen->tsc);
+	if (ret) {
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+
 	so_method(so, screen->tesla, 0x1280, 3);
 	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
@@ -352,14 +416,12 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
 
 	so_method(so, screen->tesla, 0x1234, 1);
 	so_data  (so, 1);
-	so_method(so, screen->tesla, 0x1458, 1);
-	so_data  (so, 1);
 
-	so_emit(nvws, so);
+	so_emit(chan, so);
 	so_ref (so, &screen->static_init);
 	so_ref (NULL, &so);
-	nvws->push_flush(nvws, 0, NULL);
+	nouveau_pushbuf_flush(chan, 0);
 
-	return &screen->pipe;
+	return pscreen;
 }
 
diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h
index db567aaac8..61e24a5b57 100644
--- a/src/gallium/drivers/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nv50/nv50_screen.h
@@ -1,10 +1,10 @@
 #ifndef __NV50_SCREEN_H__
 #define __NV50_SCREEN_H__
 
-#include "pipe/p_screen.h"
+#include "nouveau/nouveau_screen.h"
 
 struct nv50_screen {
-	struct pipe_screen pipe;
+	struct nouveau_screen base;
 
 	struct nouveau_winsys *nvws;
 
@@ -15,11 +15,14 @@ struct nv50_screen {
 	struct nouveau_grobj *m2mf;
 	struct nouveau_notifier *sync;
 
-	struct pipe_buffer *constbuf;
-	struct nouveau_resource *vp_data_heap;
+	struct nouveau_bo *constbuf_misc[1];
+	struct nouveau_bo *constbuf_parm[2];
 
-	struct pipe_buffer *tic;
-	struct pipe_buffer *tsc;
+	struct nouveau_resource *immd_heap[1];
+	struct nouveau_resource *parm_heap[2];
+
+	struct nouveau_bo *tic;
+	struct nouveau_bo *tsc;
 
 	struct nouveau_stateobj *static_init;
 };
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index ba852194cd..116866a8e7 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -136,9 +136,16 @@ static void *
 nv50_sampler_state_create(struct pipe_context *pipe,
 			  const struct pipe_sampler_state *cso)
 {
-	unsigned *tsc = CALLOC(8, sizeof(unsigned));
+	struct nv50_sampler_stateobj *sso = CALLOC(1, sizeof(*sso));
+	unsigned *tsc;
+	float limit;
+
+	/* CALLOC may fail; bail out before touching the state object. */
+	if (!sso)
+		return NULL;
+	tsc = sso->tsc;
 
-	tsc[0] = (0x00024000 |
+	tsc[0] = (0x00026000 |
 		  (wrap_mode(cso->wrap_s) << 0) |
 		  (wrap_mode(cso->wrap_t) << 3) |
 		  (wrap_mode(cso->wrap_r) << 6));
@@ -202,7 +209,16 @@ nv50_sampler_state_create(struct pipe_context *pipe,
 		tsc[0] |= (nvgl_comparison_op(cso->compare_func) & 0x7);
 	}
 
-	return (void *)tsc;
+	/* LOD bias: clamped to [-16, 15], scaled by 256, kept to 13 bits. */
+	limit = CLAMP(cso->lod_bias, -16.0, 15.0);
+	tsc[1] |= ((int)(limit * 256.0) & 0x1fff) << 11;
+
+	/* Integer max/min LOD, each clamped to [0, 15]. */
+	tsc[2] |= ((int)CLAMP(cso->max_lod, 0.0, 15.0) << 20) |
+		  ((int)CLAMP(cso->min_lod, 0.0, 15.0) << 8);
+
+	sso->normalized = cso->normalized_coords;
+	return (void *)sso;
 }
 
 static void
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index c13d3de1cb..d313e9de4f 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -32,6 +32,9 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 	unsigned i, w, h, gw = 0;
 
 	for (i = 0; i < fb->nr_cbufs; i++) {
+		struct pipe_texture *pt = fb->cbufs[i]->texture;
+		struct nouveau_bo *bo = nv50_miptree(pt)->bo;
+
 		if (!gw) {
 			w = fb->cbufs[i]->width;
 			h = fb->cbufs[i]->height;
@@ -46,12 +49,10 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 		so_data  (so, fb->cbufs[i]->height);
 
 		so_method(so, tesla, NV50TCL_RT_ADDRESS_HIGH(i), 5);
-		so_reloc (so, nv50_surface_buffer(fb->cbufs[i]), fb->cbufs[i]->offset,
-			  NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH |
-			  NOUVEAU_BO_RDWR, 0, 0);
-		so_reloc (so, nv50_surface_buffer(fb->cbufs[i]), fb->cbufs[i]->offset,
-			  NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW |
-			  NOUVEAU_BO_RDWR, 0, 0);
+		so_reloc (so, bo, fb->cbufs[i]->offset, NOUVEAU_BO_VRAM |
+			      NOUVEAU_BO_HIGH | NOUVEAU_BO_RDWR, 0, 0);
+		so_reloc (so, bo, fb->cbufs[i]->offset, NOUVEAU_BO_VRAM |
+			      NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0);
 		switch (fb->cbufs[i]->format) {
 		case PIPE_FORMAT_A8R8G8B8_UNORM:
 			so_data(so, 0xcf);
@@ -65,7 +66,7 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 			so_data(so, 0xe6);
 			break;
 		}
-		so_data(so, 0x00000000);
+		so_data(so, bo->tile_mode << 4);
 		so_data(so, 0x00000000);
 
 		so_method(so, tesla, 0x1224, 1);
@@ -73,6 +74,9 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 	}
 
 	if (fb->zsbuf) {
+		struct pipe_texture *pt = fb->zsbuf->texture;
+		struct nouveau_bo *bo = nv50_miptree(pt)->bo;
+
 		if (!gw) {
 			w = fb->zsbuf->width;
 			h = fb->zsbuf->height;
@@ -83,14 +87,13 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 		}
 
 		so_method(so, tesla, NV50TCL_ZETA_ADDRESS_HIGH, 5);
-		so_reloc (so, nv50_surface_buffer(fb->zsbuf), fb->zsbuf->offset,
-			  NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH |
-			  NOUVEAU_BO_RDWR, 0, 0);
-		so_reloc (so, nv50_surface_buffer(fb->zsbuf), fb->zsbuf->offset,
-			  NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW |
-			  NOUVEAU_BO_RDWR, 0, 0);
+		so_reloc (so, bo, fb->zsbuf->offset, NOUVEAU_BO_VRAM |
+			      NOUVEAU_BO_HIGH | NOUVEAU_BO_RDWR, 0, 0);
+		so_reloc (so, bo, fb->zsbuf->offset, NOUVEAU_BO_VRAM |
+			      NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0);
 		switch (fb->zsbuf->format) {
 		case PIPE_FORMAT_Z24S8_UNORM:
+		case PIPE_FORMAT_Z24X8_UNORM:
 			so_data(so, 0x16);
 			break;
 		case PIPE_FORMAT_Z16_UNORM:
@@ -102,7 +105,7 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 			so_data(so, 0x16);
 			break;
 		}
-		so_data(so, 0x00000000);
+		so_data(so, bo->tile_mode << 4);
 		so_data(so, 0x00000000);
 
 		so_method(so, tesla, 0x1538, 1);
@@ -131,7 +134,7 @@ static void
 nv50_state_emit(struct nv50_context *nv50)
 {
 	struct nv50_screen *screen = nv50->screen;
-	struct nouveau_winsys *nvws = screen->nvws;
+	struct nouveau_channel *chan = screen->base.channel;
 
 	if (nv50->pctx_id != screen->cur_pctx) {
 		nv50->state.dirty |= 0xffffffff;
@@ -139,40 +142,40 @@ nv50_state_emit(struct nv50_context *nv50)
 	}
 
 	if (nv50->state.dirty & NV50_NEW_FRAMEBUFFER)
-		so_emit(nvws, nv50->state.fb);
+		so_emit(chan, nv50->state.fb);
 	if (nv50->state.dirty & NV50_NEW_BLEND)
-		so_emit(nvws, nv50->state.blend);
+		so_emit(chan, nv50->state.blend);
 	if (nv50->state.dirty & NV50_NEW_ZSA)
-		so_emit(nvws, nv50->state.zsa);
+		so_emit(chan, nv50->state.zsa);
 	if (nv50->state.dirty & NV50_NEW_VERTPROG)
-		so_emit(nvws, nv50->state.vertprog);
+		so_emit(chan, nv50->state.vertprog);
 	if (nv50->state.dirty & NV50_NEW_FRAGPROG)
-		so_emit(nvws, nv50->state.fragprog);
+		so_emit(chan, nv50->state.fragprog);
 	if (nv50->state.dirty & NV50_NEW_RASTERIZER)
-		so_emit(nvws, nv50->state.rast);
+		so_emit(chan, nv50->state.rast);
 	if (nv50->state.dirty & NV50_NEW_BLEND_COLOUR)
-		so_emit(nvws, nv50->state.blend_colour);
+		so_emit(chan, nv50->state.blend_colour);
 	if (nv50->state.dirty & NV50_NEW_STIPPLE)
-		so_emit(nvws, nv50->state.stipple);
+		so_emit(chan, nv50->state.stipple);
 	if (nv50->state.dirty & NV50_NEW_SCISSOR)
-		so_emit(nvws, nv50->state.scissor);
+		so_emit(chan, nv50->state.scissor);
 	if (nv50->state.dirty & NV50_NEW_VIEWPORT)
-		so_emit(nvws, nv50->state.viewport);
+		so_emit(chan, nv50->state.viewport);
 	if (nv50->state.dirty & NV50_NEW_SAMPLER)
-		so_emit(nvws, nv50->state.tsc_upload);
+		so_emit(chan, nv50->state.tsc_upload);
 	if (nv50->state.dirty & NV50_NEW_TEXTURE)
-		so_emit(nvws, nv50->state.tic_upload);
+		so_emit(chan, nv50->state.tic_upload);
 	if (nv50->state.dirty & NV50_NEW_ARRAYS) {
-		so_emit(nvws, nv50->state.vtxfmt);
-		so_emit(nvws, nv50->state.vtxbuf);
+		so_emit(chan, nv50->state.vtxfmt);
+		so_emit(chan, nv50->state.vtxbuf);
 	}
 	nv50->state.dirty = 0;
 
-	so_emit_reloc_markers(nvws, nv50->state.fb);
-	so_emit_reloc_markers(nvws, nv50->state.vertprog);
-	so_emit_reloc_markers(nvws, nv50->state.fragprog);
-	so_emit_reloc_markers(nvws, nv50->state.vtxbuf);
-	so_emit_reloc_markers(nvws, nv50->screen->static_init);
+	so_emit_reloc_markers(chan, nv50->state.fb);
+	so_emit_reloc_markers(chan, nv50->state.vertprog);
+	so_emit_reloc_markers(chan, nv50->state.fragprog);
+	so_emit_reloc_markers(chan, nv50->state.vtxbuf);
+	so_emit_reloc_markers(chan, nv50->screen->static_init);
 }
 
 boolean
@@ -293,12 +296,12 @@ viewport_uptodate:
 		so_data  (so, NV50_CB_TSC);
 		so_method(so, tesla, 0x40000f04, nv50->sampler_nr * 8);
 		for (i = 0; i < nv50->sampler_nr; i++)
-			so_datap (so, nv50->sampler[i], 8);
+			so_datap (so, nv50->sampler[i]->tsc, 8);
 		so_ref(so, &nv50->state.tsc_upload);
 		so_ref(NULL, &so);
 	}
 
-	if (nv50->dirty & NV50_NEW_TEXTURE)
+	if (nv50->dirty & (NV50_NEW_TEXTURE | NV50_NEW_SAMPLER))
 		nv50_tex_validate(nv50);
 
 	if (nv50->dirty & NV50_NEW_ARRAYS)
diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c
index 0cc5168144..3da9d6e728 100644
--- a/src/gallium/drivers/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nv50/nv50_surface.c
@@ -35,7 +35,6 @@ nv50_format(enum pipe_format format)
 {
 	switch (format) {
 	case PIPE_FORMAT_A8R8G8B8_UNORM:
-	case PIPE_FORMAT_Z24S8_UNORM:
 		return NV50_2D_DST_FORMAT_32BPP;
 	case PIPE_FORMAT_X8R8G8B8_UNORM:
 		return NV50_2D_DST_FORMAT_24BPP;
@@ -52,21 +51,17 @@ static int
 nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
 {
 	struct nv50_miptree *mt = nv50_miptree(ps->texture);
-	struct nouveau_channel *chan = screen->nvws->channel;
+	struct nouveau_channel *chan = screen->eng2d->channel;
 	struct nouveau_grobj *eng2d = screen->eng2d;
-	struct nouveau_bo *bo;
+	struct nouveau_bo *bo = nv50_miptree(ps->texture)->bo;
  	int format, mthd = dst ? NV50_2D_DST_FORMAT : NV50_2D_SRC_FORMAT;
  	int flags = NOUVEAU_BO_VRAM | (dst ? NOUVEAU_BO_WR : NOUVEAU_BO_RD);
- 
-	bo = screen->nvws->get_bo(nv50_miptree(ps->texture)->buffer);
-	if (!bo)
-		return 1;
 
  	format = nv50_format(ps->format);
  	if (format < 0)
  		return 1;
   
- 	if (!bo->tiled) {
+ 	if (!bo->tile_flags) {
  		BEGIN_RING(chan, eng2d, mthd, 2);
  		OUT_RING  (chan, format);
  		OUT_RING  (chan, 1);
@@ -80,7 +75,7 @@ nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
  		BEGIN_RING(chan, eng2d, mthd, 5);
  		OUT_RING  (chan, format);
  		OUT_RING  (chan, 0);
- 		OUT_RING  (chan, 0);
+ 		OUT_RING  (chan, bo->tile_mode << 4);
  		OUT_RING  (chan, 1);
  		OUT_RING  (chan, 0);
  		BEGIN_RING(chan, eng2d, mthd + 0x18, 4);
@@ -108,7 +103,7 @@ nv50_surface_do_copy(struct nv50_screen *screen, struct pipe_surface *dst,
 		     int dx, int dy, struct pipe_surface *src, int sx, int sy,
 		     int w, int h)
 {
-	struct nouveau_channel *chan = screen->nvws->channel;
+	struct nouveau_channel *chan = screen->eng2d->channel;
 	struct nouveau_grobj *eng2d = screen->eng2d;
 	int ret;
 
@@ -165,7 +160,7 @@ nv50_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest,
 {
 	struct nv50_context *nv50 = (struct nv50_context *)pipe;
 	struct nv50_screen *screen = nv50->screen;
-	struct nouveau_channel *chan = screen->nvws->channel;
+	struct nouveau_channel *chan = screen->eng2d->channel;
 	struct nouveau_grobj *eng2d = screen->eng2d;
 	int format, ret;
 
diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c
index 223c8a3a45..ff40c2ad81 100644
--- a/src/gallium/drivers/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nv50/nv50_tex.c
@@ -26,7 +26,8 @@
 #include "nouveau/nouveau_stateobj.h"
 
 static int
-nv50_tex_construct(struct nouveau_stateobj *so, struct nv50_miptree *mt)
+nv50_tex_construct(struct nv50_context *nv50, struct nouveau_stateobj *so,
+		   struct nv50_miptree *mt, int unit)
 {
 	switch (mt->base.format) {
 	case PIPE_FORMAT_A8R8G8B8_UNORM:
@@ -117,15 +118,18 @@ nv50_tex_construct(struct nouveau_stateobj *so, struct nv50_miptree *mt)
 		return 1;
 	}
 
-	so_reloc(so, mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW |
+	so_reloc(so, mt->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW |
 		     NOUVEAU_BO_RD, 0, 0);
-	so_data (so, 0xd0005000);
+	if (nv50->sampler[unit]->normalized)
+		so_data (so, 0xd0005000 | mt->bo->tile_mode << 22);
+	else
+		so_data (so, 0x5001d000 | mt->bo->tile_mode << 22);
 	so_data (so, 0x00300000);
 	so_data (so, mt->base.width[0]);
-	so_data (so, (mt->base.depth[0] << 16) | mt->base.height[0]);
+	so_data (so, (mt->base.last_level << 28) |
+		     (mt->base.depth[0] << 16) | mt->base.height[0]);
 	so_data (so, 0x03000000);
-	so_reloc(so, mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH |
-		     NOUVEAU_BO_RD, 0, 0);
+	so_data (so, mt->base.last_level << 4);
 
 	return 0;
 }
@@ -135,23 +139,35 @@ nv50_tex_validate(struct nv50_context *nv50)
 {
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nouveau_stateobj *so;
-	int unit;
+	int unit, push;
+
+	push  = nv50->miptree_nr * 9 + 2;
+	push += MAX2(nv50->miptree_nr, nv50->state.miptree_nr) * 2;
 
-	so = so_new(nv50->miptree_nr * 8 + 3, nv50->miptree_nr * 2);
+	so = so_new(push, nv50->miptree_nr * 2);
 	so_method(so, tesla, 0x0f00, 1);
 	so_data  (so, NV50_CB_TIC);
-	so_method(so, tesla, 0x40000f04, nv50->miptree_nr * 8);
 	for (unit = 0; unit < nv50->miptree_nr; unit++) {
 		struct nv50_miptree *mt = nv50->miptree[unit];
 
-		if (nv50_tex_construct(so, mt)) {
+		so_method(so, tesla, 0x40000f04, 8);
+		if (nv50_tex_construct(nv50, so, mt, unit)) {
 			NOUVEAU_ERR("failed tex validate\n");
 			so_ref(NULL, &so);
 			return;
 		}
+
+		so_method(so, tesla, 0x1458, 1);
+		so_data  (so, (unit << 9) | (unit << 1) | 1);
+	}
+
+	for (; unit < nv50->state.miptree_nr; unit++) {
+		so_method(so, tesla, 0x1458, 1);
+		so_data  (so, (unit << 1) | 0);
 	}
 
 	so_ref(so, &nv50->state.tic_upload);
 	so_ref(NULL, &so);
+	nv50->state.miptree_nr = nv50->miptree_nr;
 }
 
diff --git a/src/gallium/drivers/nv50/nv50_transfer.c b/src/gallium/drivers/nv50/nv50_transfer.c
index 747195b4f6..d0b7f0bef4 100644
--- a/src/gallium/drivers/nv50/nv50_transfer.c
+++ b/src/gallium/drivers/nv50/nv50_transfer.c
@@ -6,8 +6,8 @@
 
 struct nv50_transfer {
 	struct pipe_transfer base;
-	struct pipe_buffer *buffer;
-	struct nv50_miptree_level *level;
+	struct nouveau_bo *bo;
+	unsigned level_offset;
 	int level_pitch;
 	int level_width;
 	int level_height;
@@ -16,51 +16,48 @@ struct nv50_transfer {
 };
 
 static void
-nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, struct pipe_buffer *src,
-			int src_pitch, int sx, int sy, int sw, int sh,
-			struct pipe_buffer *dst, int dst_pitch, int dx, int dy,
+nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, struct nouveau_bo *src_bo,
+			unsigned src_offset, int src_pitch, int sx, int sy,
+			int sw, int sh, struct nouveau_bo *dst_bo,
+			unsigned dst_offset, int dst_pitch, int dx, int dy,
 			int dw, int dh, int cpp, int width, int height,
 			unsigned src_reloc, unsigned dst_reloc)
 {
 	struct nv50_screen *screen = nv50_screen(pscreen);
-	struct nouveau_winsys *nvws = screen->nvws;
-	struct nouveau_channel *chan = nvws->channel;
+	struct nouveau_channel *chan = screen->m2mf->channel;
 	struct nouveau_grobj *m2mf = screen->m2mf;
-	struct nouveau_bo *src_bo = nvws->get_bo(src);
-	struct nouveau_bo *dst_bo = nvws->get_bo(dst);
-	unsigned src_offset = 0, dst_offset = 0;
 
 	src_reloc |= NOUVEAU_BO_RD;
 	dst_reloc |= NOUVEAU_BO_WR;
 
 	WAIT_RING (chan, 14);
 
-	if (!src_bo->tiled) {
+	if (!src_bo->tile_flags) {
 		BEGIN_RING(chan, m2mf, 0x0200, 1);
 		OUT_RING  (chan, 1);
 		BEGIN_RING(chan, m2mf, 0x0314, 1);
 		OUT_RING  (chan, src_pitch);
-		src_offset = (sy * src_pitch) + (sx * cpp);
+		src_offset += (sy * src_pitch) + (sx * cpp);
 	} else {
 		BEGIN_RING(chan, m2mf, 0x0200, 6);
 		OUT_RING  (chan, 0);
-		OUT_RING  (chan, 0);
+		OUT_RING  (chan, src_bo->tile_mode << 4);
 		OUT_RING  (chan, sw * cpp);
 		OUT_RING  (chan, sh);
 		OUT_RING  (chan, 1);
 		OUT_RING  (chan, 0);
 	}
 
-	if (!dst_bo->tiled) {
+	if (!dst_bo->tile_flags) {
 		BEGIN_RING(chan, m2mf, 0x021c, 1);
 		OUT_RING  (chan, 1);
 		BEGIN_RING(chan, m2mf, 0x0318, 1);
 		OUT_RING  (chan, dst_pitch);
-		dst_offset = (dy * dst_pitch) + (dx * cpp);
+		dst_offset += (dy * dst_pitch) + (dx * cpp);
 	} else {
 		BEGIN_RING(chan, m2mf, 0x021c, 6);
 		OUT_RING  (chan, 0);
-		OUT_RING  (chan, 0);
+		OUT_RING  (chan, dst_bo->tile_mode << 4);
 		OUT_RING  (chan, dw * cpp);
 		OUT_RING  (chan, dh);
 		OUT_RING  (chan, 1);
@@ -77,13 +74,13 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, struct pipe_buffer *src,
 		BEGIN_RING(chan, m2mf, 0x030c, 2);
 		OUT_RELOCl(chan, src_bo, src_offset, src_reloc);
 		OUT_RELOCl(chan, dst_bo, dst_offset, dst_reloc);
-		if (src_bo->tiled) {
+		if (src_bo->tile_flags) {
 			BEGIN_RING(chan, m2mf, 0x0218, 1);
-			OUT_RING  (chan, (dy << 16) | sx);
+			OUT_RING  (chan, (sy << 16) | sx); /* source x/y */
 		} else {
 			src_offset += (line_count * src_pitch);
 		}
-		if (dst_bo->tiled) {
+		if (dst_bo->tile_flags) {
 			BEGIN_RING(chan, m2mf, 0x0234, 1);
-			OUT_RING  (chan, (sy << 16) | dx);
+			OUT_RING  (chan, (dy << 16) | dx); /* destination x/y */
 		} else {
@@ -108,10 +105,12 @@ nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		  enum pipe_transfer_usage usage,
 		  unsigned x, unsigned y, unsigned w, unsigned h)
 {
+	struct nouveau_device *dev = nouveau_screen(pscreen)->device;
 	struct nv50_miptree *mt = nv50_miptree(pt);
 	struct nv50_miptree_level *lvl = &mt->level[level];
 	struct nv50_transfer *tx;
 	unsigned image = 0;
+	int ret;
 
 	if (pt->target == PIPE_TEXTURE_CUBE)
 		image = face;
@@ -133,20 +132,24 @@ nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 	tx->base.stride = (w * pt->block.size);
 	tx->base.usage = usage;
 
-	tx->level = lvl;
 	tx->level_pitch = lvl->pitch;
 	tx->level_width = mt->base.width[level];
 	tx->level_height = mt->base.height[level];
+	tx->level_offset = lvl->image_offset[image];
 	tx->level_x = x;
 	tx->level_y = y;
-	tx->buffer =
-		pipe_buffer_create(pscreen, 0, NOUVEAU_BUFFER_USAGE_TRANSFER,
-				   w * tx->base.block.size * h);
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0,
+			     w * pt->block.size * h, &tx->bo);
+	if (ret) {
+		FREE(tx);
+		return NULL;
+	}
 
 	if (usage != PIPE_TRANSFER_WRITE) {
-		nv50_transfer_rect_m2mf(pscreen, mt->buffer, tx->level_pitch,
-					x, y, tx->level_width, tx->level_height,
-					tx->buffer, tx->base.stride, 0, 0,
+		nv50_transfer_rect_m2mf(pscreen, mt->bo, tx->level_offset,
+					tx->level_pitch, x, y, tx->level_width,
+					tx->level_height, tx->bo, 0,
+					tx->base.stride, 0, 0,
 					tx->base.width, tx->base.height,
 					tx->base.block.size, w, h,
 					NOUVEAU_BO_VRAM | NOUVEAU_BO_GART,
@@ -164,17 +167,18 @@ nv50_transfer_del(struct pipe_transfer *ptx)
 
 	if (ptx->usage != PIPE_TRANSFER_READ) {
 		struct pipe_screen *pscreen = ptx->texture->screen;
-		nv50_transfer_rect_m2mf(pscreen, tx->buffer, tx->base.stride,
+		nv50_transfer_rect_m2mf(pscreen, tx->bo, 0, tx->base.stride,
 					0, 0, tx->base.width, tx->base.height,
-					mt->buffer, tx->level_pitch,
-					tx->level_x, tx->level_y,
-					tx->level_width, tx->level_height,
-					tx->base.block.size, tx->base.width,
-					tx->base.height, NOUVEAU_BO_GART,
-					NOUVEAU_BO_VRAM | NOUVEAU_BO_GART);
+					mt->bo, tx->level_offset,
+					tx->level_pitch, tx->level_x,
+					tx->level_y, tx->level_width,
+					tx->level_height, tx->base.block.size,
+					tx->base.width, tx->base.height,
+					NOUVEAU_BO_GART, NOUVEAU_BO_VRAM |
+					NOUVEAU_BO_GART);
 	}
 
-	pipe_buffer_reference(&tx->buffer, NULL);
+	nouveau_bo_ref(NULL, &tx->bo);
 	pipe_texture_reference(&ptx->texture, NULL);
 	FREE(ptx);
 }
@@ -184,13 +188,17 @@ nv50_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 {
 	struct nv50_transfer *tx = (struct nv50_transfer *)ptx;
 	unsigned flags = 0;
+	int ret;
 
 	if (ptx->usage & PIPE_TRANSFER_WRITE)
-		flags |= PIPE_BUFFER_USAGE_CPU_WRITE;
+		flags |= NOUVEAU_BO_WR;
 	if (ptx->usage & PIPE_TRANSFER_READ)
-		flags |= PIPE_BUFFER_USAGE_CPU_READ;
+		flags |= NOUVEAU_BO_RD;
 
-	return pipe_buffer_map(pscreen, tx->buffer, flags);
+	ret = nouveau_bo_map(tx->bo, flags);
+	if (ret)
+		return NULL;
+	return tx->bo->map;
 }
 
 static void
@@ -198,7 +206,7 @@ nv50_transfer_unmap(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 {
 	struct nv50_transfer *tx = (struct nv50_transfer *)ptx;
 
-	pipe_buffer_unmap(pscreen, tx->buffer);
+	nouveau_bo_unmap(tx->bo);
 }
 
 void
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index 0749c90691..f81929f238 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -22,6 +22,7 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
+#include "pipe/p_inlines.h"
 
 #include "nv50_context.h"
 
@@ -53,7 +54,7 @@ nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
 		 unsigned count)
 {
 	struct nv50_context *nv50 = nv50_context(pipe);
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->tesla->channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 
 	nv50_state_validate(nv50);
@@ -83,7 +84,7 @@ static INLINE void
 nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map,
 			      unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->tesla->channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 
 	map += start;
@@ -112,7 +113,7 @@ static INLINE void
 nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map,
 			      unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->tesla->channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 
 	map += start;
@@ -141,7 +142,7 @@ static INLINE void
 nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint8_t *map,
 			      unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->tesla->channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 
 	map += start;
@@ -163,10 +164,12 @@ nv50_draw_elements(struct pipe_context *pipe,
 		   unsigned mode, unsigned start, unsigned count)
 {
 	struct nv50_context *nv50 = nv50_context(pipe);
-	struct nouveau_channel *chan = nv50->screen->nvws->channel;
+	struct nouveau_channel *chan = nv50->screen->tesla->channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
-	struct pipe_winsys *ws = pipe->winsys;
-	void *map = ws->buffer_map(ws, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ);
+	struct pipe_screen *pscreen = pipe->screen;
+	void *map;
+	
+	map = pipe_buffer_map(pscreen, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ);
 
 	nv50_state_validate(nv50);
 
@@ -193,6 +196,7 @@ nv50_draw_elements(struct pipe_context *pipe,
 	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
 	OUT_RING  (chan, 0);
 
+	pipe_buffer_unmap(pscreen, indexBuffer);
 	pipe->flush(pipe, 0, NULL);
 	return TRUE;
 }
@@ -212,6 +216,7 @@ nv50_vbo_validate(struct nv50_context *nv50)
 		struct pipe_vertex_element *ve = &nv50->vtxelt[i];
 		struct pipe_vertex_buffer *vb =
 			&nv50->vtxbuf[ve->vertex_buffer_index];
+		struct nouveau_bo *bo = nouveau_bo(vb->buffer);
 
 		switch (ve->src_format) {
 		case PIPE_FORMAT_R32G32B32A32_FLOAT:
@@ -240,10 +245,10 @@ nv50_vbo_validate(struct nv50_context *nv50)
 
 		so_method(vtxbuf, tesla, 0x900 + (i * 16), 3);
 		so_data  (vtxbuf, 0x20000000 | vb->stride);
-		so_reloc (vtxbuf, vb->buffer, vb->buffer_offset +
+		so_reloc (vtxbuf, bo, vb->buffer_offset +
 			  ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
 			  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
-		so_reloc (vtxbuf, vb->buffer, vb->buffer_offset +
+		so_reloc (vtxbuf, bo, vb->buffer_offset +
 			  ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
 			  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
 	}