1 files changed, 511 insertions, 249 deletions
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index f0fe7e6168..679c28ce4b 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -98,9 +98,17 @@ struct nv50_reg {
 #define NV50_MOD_ABS 2
 #define NV50_MOD_SAT 4
 
-/* arbitrary limits */
-#define MAX_IF_DEPTH 4
-#define MAX_LOOP_DEPTH 4
+/* STACK: Conditionals and loops have to use the (per warp) stack.
+ * Stack entries consist of an entry type (divergent path, join at),
+ * a mask indicating the active threads of the warp, and an address.
+ * MPs can store 12 stack entries internally, if we need more (and
+ * we probably do), we have to create a stack buffer in VRAM.
+ */
+/* impose low limits for now */
+#define NV50_MAX_COND_NESTING 4
+#define NV50_MAX_LOOP_NESTING 3
+
+#define JOIN_ON(e) e; pc->p->exec_tail->inst[1] |= 2
 
 struct nv50_pc {
 	struct nv50_program *p;
@@ -119,10 +127,11 @@ struct nv50_pc {
 	struct nv50_reg *param;
 	int param_nr;
 	struct nv50_reg *immd;
-	float *immd_buf;
+	uint32_t *immd_buf;
 	int immd_nr;
 	struct nv50_reg **addr;
 	int addr_nr;
+	uint8_t addr_alloc; /* set bit indicates used for TGSI_FILE_ADDRESS */
 
 	struct nv50_reg *temp_temp[16];
 	unsigned temp_temp_nr;
@@ -131,17 +140,19 @@ struct nv50_pc {
 	struct nv50_reg *r_brdc;
 	struct nv50_reg *r_dst[4];
 
+	struct nv50_reg reg_instances[16];
+	unsigned reg_instance_nr;
+
 	unsigned interp_mode[32];
 	/* perspective interpolation registers */
 	struct nv50_reg *iv_p;
 	struct nv50_reg *iv_c;
 
-	struct nv50_program_exec *if_cond;
-	struct nv50_program_exec *if_insn[MAX_IF_DEPTH];
-	struct nv50_program_exec *br_join[MAX_IF_DEPTH];
-	struct nv50_program_exec *br_loop[MAX_LOOP_DEPTH]; /* for BRK branch */
+	struct nv50_program_exec *if_insn[NV50_MAX_COND_NESTING];
+	struct nv50_program_exec *if_join[NV50_MAX_COND_NESTING];
+	struct nv50_program_exec *loop_brka[NV50_MAX_LOOP_NESTING];
 	int if_lvl, loop_lvl;
-	unsigned loop_pos[MAX_LOOP_DEPTH];
+	unsigned loop_pos[NV50_MAX_LOOP_NESTING];
 
 	/* current instruction and total number of insns */
 	unsigned insn_cur;
@@ -150,6 +161,20 @@ struct nv50_pc {
 	boolean allow32;
 };
 
+static INLINE struct nv50_reg *
+reg_instance(struct nv50_pc *pc, struct nv50_reg *reg)
+{
+	struct nv50_reg *ri;
+
+	assert(pc->reg_instance_nr < 16);
+	ri = &pc->reg_instances[pc->reg_instance_nr++];
+	if (reg) {
+		*ri = *reg;
+		reg->mod = 0;
+	}
+	return ri;
+}
+
 static INLINE void
 ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
 {
@@ -176,8 +201,7 @@ terminate_mbb(struct nv50_pc *pc)
 
 	/* remove records of temporary address register values */
 	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
-		if (pc->r_addr[i].index < 0)
-			pc->r_addr[i].rhw = -1;
+		pc->r_addr[i].rhw = -1;
 }
 
 static void
@@ -342,25 +366,34 @@ static void
 kill_temp_temp(struct nv50_pc *pc)
 {
 	int i;
-	
+
 	for (i = 0; i < pc->temp_temp_nr; i++)
 		free_temp(pc, pc->temp_temp[i]);
 	pc->temp_temp_nr = 0;
 }
 
 static int
-ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
+ctor_immd_4u32(struct nv50_pc *pc,
+	       uint32_t x, uint32_t y, uint32_t z, uint32_t w)
 {
-	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
-			       (pc->immd_nr + 1) * 4 * sizeof(float));
+	unsigned size = pc->immd_nr * 4 * sizeof(uint32_t);
+
+	pc->immd_buf = REALLOC(pc->immd_buf, size, size + 4 * sizeof(uint32_t));
+
 	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
 	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
 	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
 	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
-	
+
 	return pc->immd_nr++;
 }
 
+static INLINE int
+ctor_immd_4f32(struct nv50_pc *pc, float x, float y, float z, float w)
+{
+	return ctor_immd_4u32(pc, fui(x), fui(y), fui(z), fui(w));
+}
+
 static struct nv50_reg *
 alloc_immd(struct nv50_pc *pc, float f)
 {
@@ -368,11 +401,11 @@ alloc_immd(struct nv50_pc *pc, float f)
 	unsigned hw;
 
 	for (hw = 0; hw < pc->immd_nr * 4; hw++)
-		if (pc->immd_buf[hw] == f)
+		if (pc->immd_buf[hw] == fui(f))
 			break;
 
 	if (hw == pc->immd_nr * 4)
-		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
+		hw = ctor_immd_4f32(pc, f, -f, 0.5 * f, 0) * 4;
 
 	ctor_reg(r, P_IMMD, -1, hw);
 	return r;
@@ -464,22 +497,24 @@ set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
 static INLINE void
 set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
 {
-	unsigned val;
-	float f = pc->immd_buf[imm->hw];
+	union {
+		float f;
+		uint32_t ui;
+	} u;
+	u.ui = pc->immd_buf[imm->hw];
 
-	if (imm->mod & NV50_MOD_ABS)
-		f = fabsf(f);
-	val = fui((imm->mod & NV50_MOD_NEG) ? -f : f);
+	u.f = (imm->mod & NV50_MOD_ABS) ? fabsf(u.f) : u.f;
+	u.f = (imm->mod & NV50_MOD_NEG) ? -u.f : u.f;
 
 	set_long(pc, e);
-	/*XXX: can't be predicated - bits overlap.. catch cases where both
-	 *     are required and avoid them. */
+	/* XXX: can't be predicated - bits overlap; cases where both
+	 * are required should be avoided by using pc->allow32 */
 	set_pred(pc, 0, 0, e);
 	set_pred_wr(pc, 0, 0, e);
 
 	e->inst[1] |= 0x00000002 | 0x00000001;
-	e->inst[0] |= (val & 0x3f) << 16;
-	e->inst[1] |= (val >> 6) << 2;
+	e->inst[0] |= (u.ui & 0x3f) << 16;
+	e->inst[1] |= (u.ui >> 6) << 2;
 }
 
 static INLINE void
@@ -511,21 +546,24 @@ emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst,
 static struct nv50_reg *
 alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref)
 {
-	int i;
 	struct nv50_reg *a_tgsi = NULL, *a = NULL;
+	int i;
+	uint8_t avail = ~pc->addr_alloc;
 
 	if (!ref) {
-		/* allocate for TGSI address reg */
-		for (i = 0; i < NV50_SU_MAX_ADDR; ++i) {
-			if (pc->r_addr[i].index >= 0)
-				continue;
-			if (pc->r_addr[i].rhw >= 0 &&
-			    pc->r_addr[i].acc == pc->insn_cur)
-				continue;
+		/* allocate for TGSI_FILE_ADDRESS */
+		while (avail) {
+			i = ffs(avail) - 1;
 
-			pc->r_addr[i].rhw = -1;
-			pc->r_addr[i].index = i;
-			return &pc->r_addr[i];
+			if (pc->r_addr[i].rhw < 0 ||
+			    pc->r_addr[i].acc != pc->insn_cur) {
+				pc->addr_alloc |= (1 << i);
+
+				pc->r_addr[i].rhw = -1;
+				pc->r_addr[i].index = i;
+				return &pc->r_addr[i];
+			}
+			avail &= ~(1 << i);
 		}
 		assert(0);
 		return NULL;
@@ -533,15 +571,16 @@ alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref)
 
 	/* Allocate and set an address reg so we can access 'ref'.
 	 *
-	 * If and r_addr has index < 0, it is not reserved for TGSI,
-	 * and index will be the negative of the TGSI addr index the
-	 * value in rhw is relative to, or -256 if rhw is an offset
-	 * from 0. If rhw < 0, the reg has not been initialized.
+	 * If and r_addr->index will be -1 or the hw index the value
+	 * value in rhw is relative to. If rhw < 0, the reg has not
+	 * been initialized or is in use for TGSI_FILE_ADDRESS.
 	 */
-	for (i = NV50_SU_MAX_ADDR - 1; i >= 0; --i) {
-		if (pc->r_addr[i].index >= 0) /* occupied for TGSI */
-			continue;
-		if (pc->r_addr[i].rhw < 0) { /* unused */
+	while (avail) { /* only consider regs that are not TGSI */
+		i = ffs(avail) - 1;
+		avail &= ~(1 << i);
+
+		if ((!a || a->rhw >= 0) && pc->r_addr[i].rhw < 0) {
+			/* prefer an usused reg with low hw index */
 			a = &pc->r_addr[i];
 			continue;
 		}
@@ -551,8 +590,8 @@ alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref)
 		if (ref->hw - pc->r_addr[i].rhw >= 128)
 			continue;
 
-		if ((ref->acc >= 0 && pc->r_addr[i].index == -256) ||
-		    (ref->acc < 0 && -pc->r_addr[i].index == ref->index)) {
+		if ((ref->acc >= 0 && pc->r_addr[i].index < 0) ||
+		    (ref->acc < 0 && pc->r_addr[i].index == ref->index)) {
 			pc->r_addr[i].acc = pc->insn_cur;
 			return &pc->r_addr[i];
 		}
@@ -566,7 +605,7 @@ alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref)
 
 	a->rhw = ref->hw & ~0x7f;
 	a->acc = pc->insn_cur;
-	a->index = a_tgsi ? -ref->index : -256;
+	a->index = a_tgsi ? ref->index : -1;
 	return a;
 }
 
@@ -644,7 +683,7 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 	if (src->type == P_IMMD || src->type == P_CONST) {
 		set_long(pc, e);
 		set_data(pc, src, 0x7f, 9, e);
-		e->inst[1] |= 0x20000000; /* src0 const? */
+		e->inst[1] |= 0x20000000; /* mov from c[] */
 	} else {
 		if (src->type == P_ATTR) {
 			set_long(pc, e);
@@ -659,9 +698,9 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 
 	if (is_long(e) && !is_immd(e)) {
 		e->inst[1] |= 0x04000000; /* 32-bit */
-		e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
+		e->inst[1] |= 0x0000c000; /* 32-bit c[] load / lane mask 0:1 */
 		if (!(e->inst[1] & 0x20000000))
-			e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
+			e->inst[1] |= 0x00030000; /* lane mask 2:3 */
 	} else
 		e->inst[0] |= 0x00008000;
 
@@ -676,6 +715,17 @@ emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
 	FREE(imm);
 }
 
+static void
+emit_nop(struct nv50_pc *pc)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0xf0000000;
+	set_long(pc, e);
+	e->inst[1] = 0xe0000000;
+	emit(pc, e);
+}
+
 static boolean
 check_swap_src_0_1(struct nv50_pc *pc,
 		   struct nv50_reg **s0, struct nv50_reg **s1)
@@ -795,6 +845,33 @@ set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 }
 
 static void
+emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	assert(dst->type == P_TEMP);
+	e->inst[1] = 0x20000000 | (pred << 12);
+	set_long(pc, e);
+	set_dst(pc, dst, e);
+
+	emit(pc, e);
+}
+
+static void
+emit_mov_to_pred(struct nv50_pc *pc, int pred, struct nv50_reg *src)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0x000001fc;
+	e->inst[1] = 0xa0000008;
+	set_long(pc, e);
+	set_pred_wr(pc, 1, pred, e);
+	set_src_0_restricted(pc, src, e);
+
+	emit(pc, e);
+}
+
+static void
 emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	 struct nv50_reg *src1)
 {
@@ -898,7 +975,6 @@ static INLINE void
 emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	 struct nv50_reg *src1)
 {
-	assert(src0 != src1);
 	src1->mod ^= NV50_MOD_NEG;
 	emit_add(pc, dst, src0, src1);
 	src1->mod ^= NV50_MOD_NEG;
@@ -967,7 +1043,6 @@ static INLINE void
 emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	 struct nv50_reg *src1, struct nv50_reg *src2)
 {
-	assert(src2 != src0 && src2 != src1);
 	src2->mod ^= NV50_MOD_NEG;
 	emit_mad(pc, dst, src0, src1, src2);
 	src2->mod ^= NV50_MOD_NEG;
@@ -1120,7 +1195,6 @@ emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
 	set_src_1(pc, src1, e);
 
 	emit(pc, e);
-	pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */
 
 	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
 	if (rdst)
@@ -1242,24 +1316,128 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
 {
 	struct nv50_program_exec *e;
 	const int r_pred = 1;
-	unsigned cvn = CVT_F32_F32;
-
-	if (src->mod & NV50_MOD_NEG)
-		cvn |= CVT_NEG;
-	/* write predicate reg */
-	emit_cvt(pc, NULL, src, r_pred, CVTOP_RN, cvn);
 
-	/* conditional discard */
 	e = exec(pc);
-	e->inst[0] = 0x00000002;
+	e->inst[0] = 0x00000002; /* discard */
+	set_long(pc, e); /* sets cond code to ALWAYS */
+
+	if (src) {
+		unsigned cvn = CVT_F32_F32;
+
+		set_pred(pc, 0x1 /* cc = LT */, r_pred, e);
+
+		if (src->mod & NV50_MOD_NEG)
+			cvn |= CVT_NEG;
+		/* write predicate reg */
+		emit_cvt(pc, NULL, src, r_pred, CVTOP_RN, cvn);
+	}
+
+	emit(pc, e);
+}
+
+static struct nv50_program_exec *
+emit_breakaddr(struct nv50_pc *pc)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0x40000002;
+	set_long(pc, e);
+
+	emit(pc, e);
+	return e;
+}
+
+static void
+emit_break(struct nv50_pc *pc, int pred, unsigned cc)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0x50000002;
+	set_long(pc, e);
+	if (pred >= 0)
+		set_pred(pc, cc, pred, e);
+
+	emit(pc, e);
+}
+
+static struct nv50_program_exec *
+emit_joinat(struct nv50_pc *pc)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0xa0000002;
+	set_long(pc, e);
+
+	emit(pc, e);
+	return e;
+}
+
+static struct nv50_program_exec *
+emit_branch(struct nv50_pc *pc, int pred, unsigned cc)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0x10000002;
 	set_long(pc, e);
-	set_pred(pc, 0x1 /* LT */, r_pred, e);
+	if (pred >= 0)
+		set_pred(pc, cc, pred, e);
 	emit(pc, e);
+	return pc->p->exec_tail;
+}
+
+static void
+emit_ret(struct nv50_pc *pc, int pred, unsigned cc)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0x30000002;
+	set_long(pc, e);
+	if (pred >= 0)
+		set_pred(pc, cc, pred, e);
+
+	emit(pc, e);
+}
+
+#define QOP_ADD 0
+#define QOP_SUBR 1
+#define QOP_SUB 2
+#define QOP_MOV_SRC1 3
+
+/* For a quad of threads / top left, top right, bottom left, bottom right
+ * pixels, do a different operation, and take src0 from a specific thread.
+ */
+static void
+emit_quadop(struct nv50_pc *pc, struct nv50_reg *dst, int wp, int lane_src0,
+	    struct nv50_reg *src0, struct nv50_reg *src1, ubyte qop)
+{
+       struct nv50_program_exec *e = exec(pc);
+
+       e->inst[0] = 0xc0000000;
+       e->inst[1] = 0x80000000;
+       set_long(pc, e);
+       e->inst[0] |= lane_src0 << 16;
+       set_src_0(pc, src0, e);
+       set_src_2(pc, src1, e);
+
+       if (wp >= 0)
+	       set_pred_wr(pc, 1, wp, e);
+
+       if (dst)
+	       set_dst(pc, dst, e);
+       else {
+	       e->inst[0] |= 0x000001fc;
+	       e->inst[1] |= 0x00000008;
+       }
+
+       e->inst[0] |= (qop & 3) << 20;
+       e->inst[1] |= (qop >> 2) << 22;
+
+       emit(pc, e);
 }
 
 static void
 load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
-		     struct nv50_reg **src, boolean proj)
+		     struct nv50_reg **src, unsigned arg, boolean proj)
 {
 	int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod };
 
@@ -1276,6 +1454,10 @@ load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
 
 	if (proj && 0 /* looks more correct without this */)
 		emit_mul(pc, t[2], t[2], src[3]);
+	else
+	if (arg == 4) /* there is no textureProj(samplerCubeShadow) */
+		emit_mov(pc, t[3], src[3]);
+
 	emit_flop(pc, 0, t[2], t[2]);
 
 	emit_mul(pc, t[0], src[0], t[2]);
@@ -1284,89 +1466,214 @@ load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
 }
 
 static void
-emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
-	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
+load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
+		     struct nv50_reg **src, unsigned dim, unsigned arg)
 {
-	struct nv50_reg *t[4];
-	struct nv50_program_exec *e;
+	unsigned c, mode;
+
+	if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
+		mode = pc->interp_mode[src[0]->index] | INTERP_PERSPECTIVE;
 
-	unsigned c, mode, dim;
+		t[3]->rhw = src[3]->rhw;
+		emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
+		emit_flop(pc, 0, t[3], t[3]);
 
+		for (c = 0; c < dim; ++c) {
+			t[c]->rhw = src[c]->rhw;
+			emit_interp(pc, t[c], t[3], mode);
+		}
+		if (arg != dim) { /* depth reference value */
+			t[dim]->rhw = src[2]->rhw;
+			emit_interp(pc, t[dim], t[3], mode);
+		}
+	} else {
+		/* XXX: for some reason the blob sometimes uses MAD
+		 * (mad f32 $rX $rY $rZ neg $r63)
+		 */
+		emit_flop(pc, 0, t[3], src[3]);
+		for (c = 0; c < dim; ++c)
+			emit_mul(pc, t[c], src[c], t[3]);
+		if (arg != dim) /* depth reference value */
+			emit_mul(pc, t[dim], src[2], t[3]);
+	}
+}
+
+static INLINE void
+get_tex_dim(unsigned type, unsigned *dim, unsigned *arg)
+{
 	switch (type) {
 	case TGSI_TEXTURE_1D:
-		dim = 1;
+		*arg = *dim = 1;
+		break;
+	case TGSI_TEXTURE_SHADOW1D:
+		*dim = 1;
+		*arg = 2;
 		break;
 	case TGSI_TEXTURE_UNKNOWN:
 	case TGSI_TEXTURE_2D:
-	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
 	case TGSI_TEXTURE_RECT:
-		dim = 2;
+		*arg = *dim = 2;
+		break;
+	case TGSI_TEXTURE_SHADOW2D:
+	case TGSI_TEXTURE_SHADOWRECT:
+		*dim = 2;
+		*arg = 3;
 		break;
 	case TGSI_TEXTURE_3D:
 	case TGSI_TEXTURE_CUBE:
-	case TGSI_TEXTURE_SHADOW2D:
-	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
-		dim = 3;
+		*dim = *arg = 3;
 		break;
 	default:
 		assert(0);
 		break;
 	}
+}
 
-	/* some cards need t[0]'s hw index to be a multiple of 4 */
-	alloc_temp4(pc, t, 0);
+/* We shouldn't execute TEXLOD if any of the pixels in a quad have
+ * different LOD values, so branch off groups of equal LOD.
+ */
+static void
+emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod,
+		     struct nv50_reg *src, struct nv50_program_exec *tex)
+{
+	struct nv50_program_exec *join_at;
+	unsigned i, target = pc->p->exec_size + 7 * 2;
 
-	if (type == TGSI_TEXTURE_CUBE) {
-		load_cube_tex_coords(pc, t, src, proj);
-	} else
-	if (proj) {
-		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
-			mode = pc->interp_mode[src[0]->index];
-
-			t[3]->rhw = src[3]->rhw;
-			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
-			emit_flop(pc, 0, t[3], t[3]);
-
-			for (c = 0; c < dim; c++) {
-				t[c]->rhw = src[c]->rhw;
-				emit_interp(pc, t[c], t[3],
-					    (mode | INTERP_PERSPECTIVE));
-			}
-		} else {
-			emit_flop(pc, 0, t[3], src[3]);
-			for (c = 0; c < dim; c++)
-				emit_mul(pc, t[c], src[c], t[3]);
+	/* Subtract lod of each pixel from lod of top left pixel, jump
+	 * texlod insn if result is 0, then repeat for 2 other pixels.
+	 */
+	join_at = emit_joinat(pc);
+	emit_quadop(pc, NULL, 0, 0, tlod, tlod, 0x55);
+	emit_branch(pc, 0, 2)->param.index = target;
 
-			/* XXX: for some reason the blob sometimes uses MAD:
-			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
-			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
-			 */
+	for (i = 1; i < 4; ++i) {
+		emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55);
+		emit_branch(pc, 0, 2)->param.index = target;
+	}
+
+	emit_mov(pc, tlod, src); /* target */
+	emit(pc, tex); /* texlod */
+
+	join_at->param.index = target + 2 * 2;
+	JOIN_ON(emit_nop(pc)); /* join _after_ tex */
+}
+
+static void
+emit_texbias_sequence(struct nv50_pc *pc, struct nv50_reg *t[4], unsigned arg,
+		      struct nv50_program_exec *tex)
+{
+	struct nv50_program_exec *e;
+	struct nv50_reg imm_1248, *t123[4][4], *r_bits = alloc_temp(pc, NULL);
+	int r_pred = 0;
+	unsigned n, c, i, cc[4] = { 0x0a, 0x13, 0x11, 0x10 };
+
+	pc->allow32 = FALSE;
+	ctor_reg(&imm_1248, P_IMMD, -1, ctor_immd_4u32(pc, 1, 2, 4, 8) * 4);
+
+	/* Subtract bias value of thread i from bias values of each thread,
+	 * store result in r_pred, and set bit i in r_bits if result was 0.
+	 */
+	assert(arg < 4);
+	for (i = 0; i < 4; ++i, ++imm_1248.hw) {
+		emit_quadop(pc, NULL, r_pred, i, t[arg], t[arg], 0x55);
+		emit_mov(pc, r_bits, &imm_1248);
+		set_pred(pc, 2, r_pred, pc->p->exec_tail);
+	}
+	emit_mov_to_pred(pc, r_pred, r_bits);
+
+	/* The lanes of a quad are now grouped by the bit in r_pred they have
+	 * set. Put the input values for TEX into a new register set for each
+	 * group and execute TEX only for a specific group.
+	 * We cannot use the same register set for each group because we need
+	 * the derivatives, which are implicitly calculated, to be correct.
+	 */
+	for (i = 1; i < 4; ++i) {
+		alloc_temp4(pc, t123[i], 0);
+
+		for (c = 0; c <= arg; ++c)
+			emit_mov(pc, t123[i][c], t[c]);
+
+		*(e = exec(pc)) = *(tex);
+		e->inst[0] &= ~0x01fc;
+		set_dst(pc, t123[i][0], e);
+		set_pred(pc, cc[i], r_pred, e);
+		emit(pc, e);
+	}
+	/* finally TEX on the original regs (where we kept the input) */
+	set_pred(pc, cc[0], r_pred, tex);
+	emit(pc, tex);
+
+	/* put the 3 * n other results into regs for lane 0 */
+	n = popcnt4(((e->inst[0] >> 25) & 0x3) | ((e->inst[1] >> 12) & 0xc));
+	for (i = 1; i < 4; ++i) {
+		for (c = 0; c < n; ++c) {
+			emit_mov(pc, t[c], t123[i][c]);
+			set_pred(pc, cc[i], r_pred, pc->p->exec_tail);
 		}
-	} else {
-		for (c = 0; c < dim; c++)
-			emit_mov(pc, t[c], src[c]);
+		free_temp4(pc, t123[i]);
 	}
 
+	emit_nop(pc);
+	free_temp(pc, r_bits);
+}
+
+static void
+emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
+	 struct nv50_reg **src, unsigned unit, unsigned type,
+	 boolean proj, int bias_lod)
+{
+	struct nv50_reg *t[4];
+	struct nv50_program_exec *e;
+	unsigned c, dim, arg;
+
+	/* t[i] must be within a single 128 bit super-reg */
+	alloc_temp4(pc, t, 0);
+
 	e = exec(pc);
+	e->inst[0] = 0xf0000000;
 	set_long(pc, e);
-	e->inst[0] |= 0xf0000000;
-	e->inst[1] |= 0x00000004;
 	set_dst(pc, t[0], e);
-	e->inst[0] |= (unit << 9);
 
-	if (dim == 2)
-		e->inst[0] |= 0x00400000;
-	else
-	if (dim == 3) {
-		e->inst[0] |= 0x00800000;
-		if (type == TGSI_TEXTURE_CUBE)
-			e->inst[0] |= 0x08000000;
+	/* TIC and TSC binding indices (TSC is ignored as TSC_LINKED = TRUE): */
+	e->inst[0] |= (unit << 9) /* | (unit << 17) */;
+
+	/* live flag (don't set if TEX results affect input to another TEX): */
+	/* e->inst[0] |= 0x00000004; */
+
+	get_tex_dim(type, &dim, &arg);
+
+	if (type == TGSI_TEXTURE_CUBE) {
+		e->inst[0] |= 0x08000000;
+		load_cube_tex_coords(pc, t, src, arg, proj);
+	} else
+	if (proj)
+		load_proj_tex_coords(pc, t, src, dim, arg);
+	else {
+		for (c = 0; c < dim; c++)
+			emit_mov(pc, t[c], src[c]);
+		if (arg != dim) /* depth reference value (always src.z here) */
+			emit_mov(pc, t[dim], src[2]);
 	}
 
 	e->inst[0] |= (mask & 0x3) << 25;
 	e->inst[1] |= (mask & 0xc) << 12;
 
-	emit(pc, e);
+	if (!bias_lod) {
+		e->inst[0] |= (arg - 1) << 22;
+		emit(pc, e);
+	} else
+	if (bias_lod < 0) {
+		e->inst[0] |= arg << 22;
+		e->inst[1] |= 0x20000000; /* texbias */
+		emit_mov(pc, t[arg], src[3]);
+		emit_texbias_sequence(pc, t, arg, e);
+	} else {
+		e->inst[0] |= arg << 22;
+		e->inst[1] |= 0x40000000; /* texlod */
+		emit_mov(pc, t[arg], src[3]);
+		emit_texlod_sequence(pc, t[arg], src[3], e);
+	}
+
 #if 1
 	c = 0;
 	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
@@ -1389,46 +1696,14 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 }
 
 static void
-emit_branch(struct nv50_pc *pc, int pred, unsigned cc,
-	    struct nv50_program_exec **join)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	if (join) {
-		set_long(pc, e);
-		e->inst[0] |= 0xa0000002;
-		emit(pc, e);
-		*join = e;
-		e = exec(pc);
-	}
-
-	set_long(pc, e);
-	e->inst[0] |= 0x10000002;
-	if (pred >= 0)
-		set_pred(pc, cc, pred, e);
-	emit(pc, e);
-}
-
-static void
-emit_nop(struct nv50_pc *pc)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0xf0000000;
-	set_long(pc, e);
-	e->inst[1] = 0xe0000000;
-	emit(pc, e);
-}
-
-static void
 emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
 	struct nv50_program_exec *e = exec(pc);
 
 	assert(src->type == P_TEMP);
 
-	e->inst[0] = 0xc0140000;
-	e->inst[1] = 0x89800000;
+	e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0240000 : 0xc0140000;
+	e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x86400000 : 0x89800000;
 	set_long(pc, e);
 	set_dst(pc, dst, e);
 	set_src_0(pc, src, e);
@@ -1440,25 +1715,16 @@ emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 static void
 emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-	struct nv50_reg *r = src;
 	struct nv50_program_exec *e = exec(pc);
 
 	assert(src->type == P_TEMP);
 
-	if (!(src->mod & NV50_MOD_NEG)) { /* ! double negation */
-		r = alloc_temp(pc, NULL);
-		emit_neg(pc, r, src);
-	}
-
-	e->inst[0] = 0xc0150000;
-	e->inst[1] = 0x8a400000;
+	e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0250000 : 0xc0150000;
+	e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x85800000 : 0x8a400000;
 	set_long(pc, e);
 	set_dst(pc, dst, e);
-	set_src_0(pc, r, e);
-	set_src_2(pc, r, e);
-
-	if (r != src)
-		free_temp(pc, r);
+	set_src_0(pc, src, e);
+	set_src_2(pc, src, e);
 
 	emit(pc, e);
 }
@@ -1515,9 +1781,8 @@ convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
 static boolean
 negate_supported(const struct tgsi_full_instruction *insn, int i)
 {
-	int s;
-
 	switch (insn->Instruction.Opcode) {
+	case TGSI_OPCODE_DDX:
 	case TGSI_OPCODE_DDY:
 	case TGSI_OPCODE_DP3:
 	case TGSI_OPCODE_DP4:
@@ -1526,29 +1791,14 @@ negate_supported(const struct tgsi_full_instruction *insn, int i)
 	case TGSI_OPCODE_ADD:
 	case TGSI_OPCODE_SUB:
 	case TGSI_OPCODE_MAD:
-		break;
+		return TRUE;
 	case TGSI_OPCODE_POW:
 		if (i == 1)
-			break;
+			return TRUE;
 		return FALSE;
 	default:
 		return FALSE;
 	}
-
-	/* Watch out for possible multiple uses of an nv50_reg, we
-	 * can't use nv50_reg::neg in these cases.
-	 */
-	for (s = 0; s < insn->Instruction.NumSrcRegs; ++s) {
-		if (s == i)
-			continue;
-		if ((insn->Src[s].Register.Index ==
-		     insn->Src[i].Register.Index) &&
-		    (insn->Src[s].Register.File ==
-		     insn->Src[i].Register.File))
-			return FALSE;
-	}
-
-	return TRUE;
 }
 
 /* Return a read mask for source registers deduced from opcode & write mask. */
@@ -1576,9 +1826,13 @@ nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
 	case TGSI_OPCODE_RSQ:
 	case TGSI_OPCODE_SCS:
 		return 0x1;
+	case TGSI_OPCODE_IF:
+		return 0x1;
 	case TGSI_OPCODE_LIT:
 		return 0xb;
 	case TGSI_OPCODE_TEX:
+	case TGSI_OPCODE_TXB:
+	case TGSI_OPCODE_TXL:
 	case TGSI_OPCODE_TXP:
 	{
 		const struct tgsi_instruction_texture *tex;
@@ -1587,13 +1841,17 @@ nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
 		tex = &insn->Texture;
 
 		mask = 0x7;
-		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
-			mask |= 0x8;
+		if (insn->Instruction.Opcode != TGSI_OPCODE_TEX &&
+		    insn->Instruction.Opcode != TGSI_OPCODE_TXD)
+			mask |= 0x8; /* bias, lod or proj */
 
 		switch (tex->Texture) {
 		case TGSI_TEXTURE_1D:
 			mask &= 0x9;
 			break;
+		case TGSI_TEXTURE_SHADOW1D:
+			mask &= 0x5;
+			break;
 		case TGSI_TEXTURE_2D:
 			mask &= 0xb;
 			break;
@@ -1676,7 +1934,7 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 			/* Indicate indirection by setting r->acc < 0 and
 			 * use the index field to select the address reg.
 			 */
-			r = MALLOC_STRUCT(nv50_reg);
+			r = reg_instance(pc, NULL);
 			swz = tgsi_util_get_src_register_swizzle(
 						 &src->Indirect, 0);
 			ctor_reg(r, P_CONST,
@@ -1730,6 +1988,8 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 		break;
 	}
 
+	if (r && r->acc >= 0 && r != temp)
+		return reg_instance(pc, r);
 	return r;
 }
 
@@ -1785,6 +2045,8 @@ nv50_tgsi_dst_revdep(unsigned op, int s, int c)
 	case TGSI_OPCODE_LIT:
 	case TGSI_OPCODE_SCS:
 	case TGSI_OPCODE_TEX:
+	case TGSI_OPCODE_TXB:
+	case TGSI_OPCODE_TXL:
 	case TGSI_OPCODE_TXP:
 		/* these take care of dangerous swizzles themselves */
 		return 0x0;
@@ -1826,28 +2088,43 @@ nv50_kill_branch(struct nv50_pc *pc)
 	 */
 	if (has_pred(pc->if_insn[lvl], 0xf))
 		return FALSE;
-	assert(pc->if_insn[lvl] && pc->br_join[lvl]);
+	assert(pc->if_insn[lvl] && pc->if_join[lvl]);
 
-	/* We'll use the exec allocated for JOIN_AT (as we can't easily
-	 * update prev's next); if exec_tail is BRK, update the pointer.
+	/* We'll use the exec allocated for JOIN_AT (we can't easily
+	 * access nv50_program_exec's prev).
 	 */
-	if (pc->loop_lvl && pc->br_loop[pc->loop_lvl - 1] == pc->p->exec_tail)
-		pc->br_loop[pc->loop_lvl - 1] = pc->br_join[lvl];
-
 	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */
 
-	*pc->br_join[lvl] = *pc->p->exec_tail;
+	*pc->if_join[lvl] = *pc->p->exec_tail;
 
 	FREE(pc->if_insn[lvl]);
 	FREE(pc->p->exec_tail);
 
-	pc->p->exec_tail = pc->br_join[lvl];
+	pc->p->exec_tail = pc->if_join[lvl];
 	pc->p->exec_tail->next = NULL;
 	set_pred(pc, 0xd, 0, pc->p->exec_tail);
 
 	return TRUE;
 }
 
+static void
+nv50_fp_move_results(struct nv50_pc *pc)
+{
+	struct nv50_reg reg;
+	unsigned i;
+
+	ctor_reg(&reg, P_TEMP, -1, -1);
+
+	for (i = 0; i < pc->result_nr * 4; ++i) {
+		if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
+			continue;
+		if (pc->result[i].rhw != pc->result[i].hw) {
+			reg.hw = pc->result[i].rhw;
+			emit_mov(pc, &reg, &pc->result[i]);
+		}
+	}
+}
+
 static boolean
 nv50_program_tx_insn(struct nv50_pc *pc,
 		     const struct tgsi_full_instruction *inst)
@@ -1934,13 +2211,13 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		emit_arl(pc, dst[0], temp, 4);
 		break;
 	case TGSI_OPCODE_BGNLOOP:
+		pc->loop_brka[pc->loop_lvl] = emit_breakaddr(pc);
 		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
 		terminate_mbb(pc);
 		break;
 	case TGSI_OPCODE_BRK:
-		emit_branch(pc, -1, 0, NULL);
 		assert(pc->loop_lvl > 0);
-		pc->br_loop[pc->loop_lvl - 1] = pc->p->exec_tail;
+		emit_break(pc, -1, 0);
 		break;
 	case TGSI_OPCODE_CEIL:
 		for (c = 0; c < 4; c++) {
@@ -2016,7 +2293,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			emit_mov_immdval(pc, dst[0], 1.0f);
 		break;
 	case TGSI_OPCODE_ELSE:
-		emit_branch(pc, -1, 0, NULL);
+		emit_branch(pc, -1, 0);
 		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
 		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
 		terminate_mbb(pc);
@@ -2028,21 +2305,20 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		if (nv50_kill_branch(pc) == TRUE)
 			break;
 
-		if (pc->br_join[pc->if_lvl]) {
-			pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size;
-			pc->br_join[pc->if_lvl] = NULL;
+		if (pc->if_join[pc->if_lvl]) {
+			pc->if_join[pc->if_lvl]->param.index = pc->p->exec_size;
+			pc->if_join[pc->if_lvl] = NULL;
 		}
 		terminate_mbb(pc);
 		/* emit a NOP as join point, we could set it on the next
 		 * one, but would have to make sure it is long and !immd
 		 */
-		emit_nop(pc);
-		pc->p->exec_tail->inst[1] |= 2;
+		JOIN_ON(emit_nop(pc));
 		break;
 	case TGSI_OPCODE_ENDLOOP:
-		emit_branch(pc, -1, 0, NULL);
-		pc->p->exec_tail->param.index = pc->loop_pos[--pc->loop_lvl];
-		pc->br_loop[pc->loop_lvl]->param.index = pc->p->exec_size;
+		emit_branch(pc, -1, 0)->param.index =
+			pc->loop_pos[--pc->loop_lvl];
+		pc->loop_brka[pc->loop_lvl]->param.index = pc->p->exec_size;
 		terminate_mbb(pc);
 		break;
 	case TGSI_OPCODE_EX2:
@@ -2066,21 +2342,23 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		}
 		break;
 	case TGSI_OPCODE_IF:
-		/* emitting a join_at may not be necessary */
-		assert(pc->if_lvl < MAX_IF_DEPTH);
-		/* set_pred_wr(pc, 1, 0, pc->if_cond); */
+		assert(pc->if_lvl < NV50_MAX_COND_NESTING);
 		emit_cvt(pc, NULL, src[0][0], 0, CVTOP_ABS | CVTOP_RN,
 			 CVT_F32_F32);
-		emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]);
-		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
+		pc->if_join[pc->if_lvl] = emit_joinat(pc);
+		pc->if_insn[pc->if_lvl++] = emit_branch(pc, 0, 2);;
 		terminate_mbb(pc);
 		break;
 	case TGSI_OPCODE_KIL:
+		assert(src[0][0] && src[0][1] && src[0][2] && src[0][3]);
 		emit_kil(pc, src[0][0]);
 		emit_kil(pc, src[0][1]);
 		emit_kil(pc, src[0][2]);
 		emit_kil(pc, src[0][3]);
 		break;
+	case TGSI_OPCODE_KILP:
+		emit_kil(pc, NULL);
+		break;
 	case TGSI_OPCODE_LIT:
 		emit_lit(pc, &dst[0], mask, &src[0][0]);
 		break;
@@ -2137,6 +2415,11 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 	case TGSI_OPCODE_RCP:
 		emit_flop(pc, 0, brdc, src[0][0]);
 		break;
+	case TGSI_OPCODE_RET:
+		if (pc->p->type == PIPE_SHADER_FRAGMENT)
+			nv50_fp_move_results(pc);
+		emit_ret(pc, -1, 0);
+		break;
 	case TGSI_OPCODE_RSQ:
 		emit_flop(pc, 2, brdc, src[0][0]);
 		break;
@@ -2187,11 +2470,19 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		break;
 	case TGSI_OPCODE_TEX:
 		emit_tex(pc, dst, mask, src[0], unit,
-			 inst->Texture.Texture, FALSE);
+			 inst->Texture.Texture, FALSE, 0);
+		break;
+	case TGSI_OPCODE_TXB:
+		emit_tex(pc, dst, mask, src[0], unit,
+			 inst->Texture.Texture, FALSE, -1);
+		break;
+	case TGSI_OPCODE_TXL:
+		emit_tex(pc, dst, mask, src[0], unit,
+			 inst->Texture.Texture, FALSE, 1);
 		break;
 	case TGSI_OPCODE_TXP:
 		emit_tex(pc, dst, mask, src[0], unit,
-			 inst->Texture.Texture, TRUE);
+			 inst->Texture.Texture, TRUE, 0);
 		break;
 	case TGSI_OPCODE_TRUNC:
 		for (c = 0; c < 4; c++) {
@@ -2245,20 +2536,9 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		}
 	}
 
-	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
-		for (c = 0; c < 4; c++) {
-			if (!src[i][c])
-				continue;
-			src[i][c]->mod = 0;
-			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
-				FREE(src[i][c]);
-			else
-			if (src[i][c]->acc < 0 && src[i][c]->type == P_CONST)
-				FREE(src[i][c]); /* indirect constant */
-		}
-	}
-
 	kill_temp_temp(pc);
+	pc->reg_instance_nr = 0;
+
 	return TRUE;
 }
 
@@ -2541,10 +2821,10 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 			const struct tgsi_full_immediate *imm =
 				&tp.FullToken.FullImmediate;
 
-			ctor_immd(pc, imm->u[0].Float,
-				      imm->u[1].Float,
-				      imm->u[2].Float,
-				      imm->u[3].Float);
+			ctor_immd_4f32(pc, imm->u[0].Float,
+				       imm->u[1].Float,
+				       imm->u[2].Float,
+				       imm->u[3].Float);
 		}
 			break;
 		case TGSI_TOKEN_TYPE_DECLARATION:
@@ -2898,24 +3178,6 @@ ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
 }
 
 static void
-nv50_fp_move_results(struct nv50_pc *pc)
-{
-	struct nv50_reg reg;
-	unsigned i;
-
-	ctor_reg(&reg, P_TEMP, -1, -1);
-
-	for (i = 0; i < pc->result_nr * 4; ++i) {
-		if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
-			continue;
-		if (pc->result[i].rhw != pc->result[i].hw) {
-			reg.hw = pc->result[i].rhw;
-			emit_mov(pc, &reg, &pc->result[i]);
-		}
-	}
-}
-
-static void
 nv50_program_fixup_insns(struct nv50_pc *pc)
 {
 	struct nv50_program_exec *e, **bra_list;
@@ -3024,7 +3286,7 @@ nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
 }
 
 static void
-nv50_program_upload_data(struct nv50_context *nv50, float *map,
+nv50_program_upload_data(struct nv50_context *nv50, uint32_t *map,
 			unsigned start, unsigned count, unsigned cbuf)
 {
 	struct nouveau_channel *chan = nv50->screen->base.channel;
@@ -3072,8 +3334,8 @@ nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
 
 	if (p->param_nr) {
 		unsigned cb;
-		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
-					     PIPE_BUFFER_USAGE_CPU_READ);
+		uint32_t *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
+						PIPE_BUFFER_USAGE_CPU_READ);
 
 		if (p->type == PIPE_SHADER_VERTEX)
 			cb = NV50_CB_PVP;