35 files changed, 6118 insertions, 3196 deletions
diff --git a/src/gallium/drivers/nvfx/Makefile b/src/gallium/drivers/nvfx/Makefile
index c1d57ca396..6cbbad699e 100644
--- a/src/gallium/drivers/nvfx/Makefile
+++ b/src/gallium/drivers/nvfx/Makefile
@@ -4,7 +4,7 @@ include $(TOP)/configs/current
 LIBNAME = nvfx
 
 C_SOURCES = \
-	nv04_surface_2d.c \
+	nv04_2d.c \
 	nvfx_buffer.c \
 	nvfx_context.c \
 	nvfx_clear.c \
@@ -14,6 +14,7 @@ C_SOURCES = \
 	nv30_fragtex.c \
 	nv40_fragtex.c \
 	nvfx_miptree.c \
+	nvfx_push.c \
 	nvfx_query.c \
 	nvfx_resource.c \
 	nvfx_screen.c \
diff --git a/src/gallium/drivers/nvfx/SConscript b/src/gallium/drivers/nvfx/SConscript
index 02d931b10e..80e3ef2257 100644
--- a/src/gallium/drivers/nvfx/SConscript
+++ b/src/gallium/drivers/nvfx/SConscript
@@ -9,7 +9,7 @@ env.PrependUnique(delete_existing=1, CPPPATH = [
 nvfx = env.ConvenienceLibrary(
     target = 'nvfx',
     source = [
-        'nv04_surface_2d.c',
+        'nv04_2d.c',
         'nvfx_buffer.c',
         'nvfx_context.c',
         'nvfx_clear.c',
@@ -19,6 +19,7 @@ nvfx = env.ConvenienceLibrary(
         'nv30_fragtex.c',
         'nv40_fragtex.c',
         'nvfx_miptree.c',
+        'nvfx_push.c',
         'nvfx_query.c',
         'nvfx_resource.c',
         'nvfx_screen.c',
diff --git a/src/gallium/drivers/nvfx/nv04_2d.c b/src/gallium/drivers/nvfx/nv04_2d.c
new file mode 100644
index 0000000000..c05312219b
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv04_2d.c
@@ -0,0 +1,1341 @@
+/**************************************************************************
+ *
+ * Copyright 2009 Ben Skeggs
+ * Copyright 2009 Younes Manton
+ * Copyright 2010 Luca Barbieri
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+/* this code has no Mesa or Gallium dependency and can be reused in the classic Mesa driver or DDX */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <nouveau/nouveau_class.h>
+#include <nouveau/nouveau_device.h>
+#include <nouveau/nouveau_pushbuf.h>
+#include <nouveau/nouveau_channel.h>
+#include <nouveau/nouveau_bo.h>
+#include <nouveau/nouveau_notifier.h>
+#include <nouveau/nouveau_grobj.h>
+#include "nv04_2d.h"
+
+/* Local stand-ins for the usual branch-hint and min/max helpers, to avoid depending on Mesa/Gallium */
+#ifdef __GNUC__
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#else
+#define likely(x) !!(x)
+#define unlikely(x) !!(x)
+#endif
+/* NOTE: MIN2/MAX2 evaluate each argument twice; do not pass expressions with side effects */
+#define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
+#define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )
+/* Handles used by the 2D copy/fill paths in this file */
+struct nv04_2d_context
+{
+	struct nouveau_notifier *ntfy;
+	struct nouveau_grobj *surf2d; /* receives the NV04_CONTEXT_SURFACES_2D_* methods */
+	struct nouveau_grobj *swzsurf; /* receives the NV04_SWIZZLED_SURFACE_* methods */
+	struct nouveau_grobj *m2mf; /* receives the NV04_MEMORY_TO_MEMORY_FORMAT_* methods */
+	struct nouveau_grobj *rect; /* fetched by nv04_region_fill_gdirect */
+	struct nouveau_grobj *sifm; /* receives the *_SCALED_IMAGE_FROM_MEMORY_* methods */
+	struct nouveau_grobj *blit; /* receives the point/size methods in nv04_region_copy_blit */
+};
+/* Round value up to the next multiple of alignment, which must be a power of two */
+static inline int align(int value, int alignment)
+{
+   const int mask = alignment - 1;
+   return (value + mask) & ~mask;
+}
+/* Nonzero when x has at most one bit set; note that this also accepts x == 0 */
+static inline int util_is_pot(unsigned x)
+{
+   const unsigned without_lowest_bit = x & (x - 1);
+   return without_lowest_bit == 0;
+}
+
+/*
+ * Integer base-2 logarithm, rounded towards zero.
+ *
+ * Binary search for the highest set bit: each round asks whether anything is
+ * set above the lower half of the current window and, if so, discards that
+ * lower half and credits its width to the result. Written for 32-bit unsigned;
+ * log2i(0) yields 0.
+ */
+static inline unsigned
+log2i(unsigned i)
+{
+	unsigned result = 0;
+
+	/* window halves: 16, 8, 4, 2, 1 bits */
+	for(unsigned half = 16; half != 0; half >>= 1)
+	{
+		const unsigned upper = i >> half;
+		if(upper == 0)
+			continue;
+
+		i = upper;
+		result += half;
+	}
+
+	return result;
+}
+
+//#define NV04_REGION_DEBUG
+// (uncomment the line above to trace the region operations on stderr)
+// Yes, we really want to inline everything, since all the functions are used only once (NOTE(review): only forced when DEBUG is defined -- confirm)
+#if defined(__GNUC__) && defined(DEBUG)
+#define inline __attribute__((always_inline)) inline
+#endif
+/* Interleave the low 12 bits of x and y: bit n of x goes to bit 2n of the result, bit n of y to bit 2n+1 */
+static inline unsigned
+nv04_swizzle_bits_square(unsigned x, unsigned y)
+{
+	unsigned u = (x & 0x001) << 0 |
+		     (x & 0x002) << 1 |
+		     (x & 0x004) << 2 |
+		     (x & 0x008) << 3 |
+		     (x & 0x010) << 4 |
+		     (x & 0x020) << 5 |
+		     (x & 0x040) << 6 |
+		     (x & 0x080) << 7 |
+		     (x & 0x100) << 8 |
+		     (x & 0x200) << 9 |
+		     (x & 0x400) << 10 |
+		     (x & 0x800) << 11;
+
+	unsigned v = (y & 0x001) << 1 |
+		     (y & 0x002) << 2 |
+		     (y & 0x004) << 3 |
+		     (y & 0x008) << 4 |
+		     (y & 0x010) << 5 |
+		     (y & 0x020) << 6 |
+		     (y & 0x040) << 7 |
+		     (y & 0x080) << 8 |
+		     (y & 0x100) << 9 |
+		     (y & 0x200) << 10 |
+		     (y & 0x400) << 11 |
+		     (y & 0x800) << 12;
+	return v | u;
+}
+
+/* A rectangular swizzled texture is a linear run of swizzled square tiles of side min(w, h) */
+static inline unsigned
+nv04_swizzle_bits_2d(unsigned x, unsigned y, unsigned w, unsigned h)
+{
+	unsigned side, in_tile, tile_base;
+	if(h <= 1)
+		return x; /* a single row is already linear */
+
+	side = MIN2(w, h);
+	in_tile = nv04_swizzle_bits_square(x & (side - 1), y & (side - 1));
+	tile_base = ((x | y) & ~(side - 1)) * side;
+	return tile_base | in_tile;
+}
+
+// general 3D texture case: interleaves the bits of x, y and z round-robin, dropping a coordinate once its dimension is used up
+static inline unsigned
+nv04_swizzle_bits(unsigned x, unsigned y, unsigned z, unsigned w, unsigned h, unsigned d)
+{
+	if(d <= 1) // no depth: defer to the 2D layout
+		return nv04_swizzle_bits_2d(x, y, w, h);
+	else
+	{
+		// TODO: autogenerate code for all possible texture sizes (13 * 13 * 13 with dims <= 4096) and do a single indirect call
+		unsigned v = 0;
+		w >>= 1; // halved up front so that a dimension of size 1 contributes no bits
+		h >>= 1;
+		d >>= 1;
+		for(int i = 0;;) // i is the next free bit position in v
+		{
+			int oldi = i;
+			if(likely(w))
+			{
+				v |= (x & 1) << i;
+				x >>= 1;
+				w >>= 1;
+				++i;
+			}
+
+			if(likely(h))
+			{
+				v |= (y & 1) << i;
+				y >>= 1;
+				h >>= 1;
+				++i;
+			}
+
+			if(likely(d))
+			{
+				v |= (z & 1) << i;
+				z >>= 1;
+				d >>= 1;
+				++i;
+			}
+
+			if(i == oldi) // no dimension had bits left this round
+				break;
+		}
+		return v;
+	}
+}
+/* Byte offset, relative to rgn->offset, of the texel at the region's (x, y, z) origin; w and h are not used */
+unsigned
+nv04_region_begin(struct nv04_region* rgn, unsigned w, unsigned h)
+{
+	if(!rgn->pitch) /* swizzled layout */
+		return nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d) << rgn->bpps;
+
+	return rgn->pitch * rgn->y + (rgn->x << rgn->bpps);
+}
+/* Byte offset, relative to rgn->offset, just past the w x h rectangle: end of its last row if linear, one texel past its far corner if swizzled */
+unsigned
+nv04_region_end(struct nv04_region* rgn, unsigned w, unsigned h)
+{
+	if(!rgn->pitch) /* swizzled layout */
+		return (nv04_swizzle_bits(rgn->x + w - 1, rgn->y + h - 1, rgn->z, rgn->w, rgn->h, rgn->d) + 1) << rgn->bpps;
+
+	return rgn->pitch * (rgn->y + h - 1) + ((rgn->x + w) << rgn->bpps);
+}
+
+// *pitch = -1 -> use 3D swizzling for (x, y), *pitch = 0 -> use 2D swizzling, other *pitch -> use linear calculations
+// returns 2 if pixel order is 3D-swizzled and 1 if subrect is 2D-swizzled
+/* *pitch == -1 ret = 0 -> 3D swizzled subrect
+ * *pitch == 0 ret = 0 -> 2D swizzled subrect
+ * *pitch > 0 ret = 0 -> linear subrect
+ * *pitch > 0 ret = 1 -> linear subrect, but with swizzled 3D data inside
+ */
+/* Debug helper: print a description of rgn (bo handle, offset, layout, origin) to stderr, without a trailing newline */
+static inline void
+nv04_region_print(struct nv04_region* rgn)
+{
+	fprintf(stderr, "<%i[%i]> ", rgn->bo->handle, rgn->offset);
+	if(rgn->pitch)
+		fprintf(stderr, "lin %i", rgn->pitch);
+	else
+		fprintf(stderr, "swz %ix%ix%i", rgn->w, rgn->h, rgn->d);
+	fprintf(stderr, " (%i, %i, %i)", rgn->x, rgn->y, rgn->z);
+}
+/* Debug check: the w x h rectangle must end inside the bo, and a swizzled region must have power-of-two w and h */
+static inline void
+nv04_region_assert(struct nv04_region* rgn, unsigned w, unsigned h)
+{
+	unsigned end = rgn->offset + nv04_region_end(rgn, w, h);
+
+	assert(rgn->offset <= (int)rgn->bo->size);
+	assert(end <= rgn->bo->size);
+	(void) end; /* keeps 'end' referenced when the asserts compile to nothing */
+	if(!rgn->pitch) {
+		assert(util_is_pot(rgn->w));
+		assert(util_is_pot(rgn->h));
+	}
+}
+
+/* determine if region can be linearized or fake-linearized for a w x h rectangle;
+ * returns nonzero if so, 0 otherwise */
+static inline int
+nv04_region_is_contiguous(struct nv04_region* rgn, int w, int h)
+{
+	int smallest_surface_side;
+	int smallest_rect_side;
+
+	/* linear: the rows must be packed back to back */
+	if(rgn->pitch)
+		return rgn->pitch == w << rgn->bpps;
+
+	/* whole-surface rectangle of a non-3D texture:
+	 * redundant, but this is the fast path for the common case */
+	if(rgn->d <= 1 && w == rgn->w && h == rgn->h)
+		return 1;
+
+	/* the rectangle must be power-of-two sized, aligned to its own
+	 * size, and must not belong to a 3D texture */
+	if((w & (w - 1)) || (h & (h - 1))
+		|| (rgn->x & (w - 1)) || (rgn->y & (h - 1))
+		|| rgn->d > 1)
+	{
+		return 0;
+	}
+
+	smallest_surface_side = MIN2(rgn->w, rgn->h);
+	smallest_rect_side = MIN2(w, h);
+
+	return (smallest_rect_side == smallest_surface_side)
+		|| (w == h)
+		|| (w == 2 * h);
+}
+
+// reshape a contiguous *w x *h rectangle in place: double the pitch until a row reaches (1 << align) bytes or the height becomes odd, then halve rows wider than 16384 bytes
+static inline void
+nv04_region_contiguous_shape(struct nv04_region* rgn, int* w, int* h, int align)
+{
+	while(!(*h & 1) && (*w << rgn->bpps) < (1 << align)) // fold pairs of rows into one wider row
+	{
+		*w <<= 1;
+		*h >>= 1;
+	}
+
+	while((*w << rgn->bpps) > 16384 && !(*w & 1)) // split over-wide rows again while the width stays even
+	{
+		*w >>= 1;
+		*h <<= 1;
+	}
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tCONTIGUOUS %ix%i\n", *w, *h);
+#endif
+}
+/* Rewrite rgn in place as a linear region whose rows are w texels wide; h is not used */
+static inline void
+nv04_region_linearize_contiguous(struct nv04_region* rgn, unsigned w, unsigned h)
+{
+	int pos;
+	if(rgn->pitch)
+	{
+		/* already linear: fold the origin into the offset */
+		rgn->offset += rgn->y * rgn->pitch + (rgn->x << rgn->bpps);
+		rgn->x = 0;
+		rgn->y = 0;
+	}
+	else
+	{
+		rgn->offset += (rgn->w * rgn->h * rgn->z) << rgn->bpps; /* skip the z slices in front of this one */
+		pos = nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d);
+		rgn->x = pos & (w - 1); /* re-express the swizzled position on a grid of w-texel rows */
+		rgn->y = pos / w;
+	}
+	rgn->pitch = w << rgn->bpps;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tLINEARIZE ");
+	nv04_region_print(rgn);
+	fprintf(stderr, "\n");
+#endif
+}
+
+	/* preserve the offset! */
+	/*
+	rgn->pitch = util_format_get_stride(rgn->format, w);
+	int pos = nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d);
+	rgn->x = pos & (w - 1);
+	rgn->y = pos & ~(w - 1);
+	*/
+
+	/*
+	rgn->offset +=
+	rgn->pitch = util_format_get_stride(rgn->format, w);
+	rgn->x = 0;
+	rgn->y = 0;
+	*/
+
+/* This code will get used for, and always succeed on:
+ * - 4x2 1bpp swizzled texture mipmap levels
+ * - linear regions created by linearization
+ *
+ * This code will get used for, and MAY work for:
+ * - misaligned texture blanket
+ * - linear surfaces created without wide_pitch (in this case, it will only work if we are lucky)
+ *
+ * The general case requires splitting the region in 2. Returns 0 on success, -1 (rgn untouched) if it cannot be aligned.
+ */
+static inline int
+nv04_region_do_align_offset(struct nv04_region* rgn, unsigned w, unsigned h, int shift)
+{
+	if(rgn->pitch > 0)
+	{
+		int delta;
+
+		assert(!(rgn->offset & ((1 << rgn->bpps) - 1))); // fatal!
+		delta = rgn->offset & ((1 << shift) - 1); // bytes by which the offset overshoots the alignment
+
+		if(h <= 1)
+		{
+			// single row: absorb delta into x and pick a fresh, aligned pitch
+			rgn->x += delta >> rgn->bpps;
+			rgn->pitch = align((rgn->x + w) << rgn->bpps, 1 << shift);
+		}
+		else
+		{
+			int newxo = (rgn->x << rgn->bpps) + delta;
+			int dy = newxo / rgn->pitch;
+			newxo -= dy * rgn->pitch;
+			if((newxo + (w << rgn->bpps)) > rgn->pitch)
+			{
+				// TODO: split the region into two rectangles (!) if *really* necessary, unless the hardware actually supports "wrapping" rectangles
+				// this does not happen if the surface is pitch-aligned, which it should always be
+				assert(0);
+				return -1;
+			}
+			rgn->x = newxo >> rgn->bpps;
+			rgn->y += dy;
+		}
+		rgn->offset -= delta; // both cases moved delta into x/y, so it must leave the offset
+	}
+	else
+	{
+		int size;
+		int min;
+		int v;
+
+		// we don't care about the alignment of 3D surfaces since the 2D engine can't use them
+		if(rgn->d > 1)
+			return -1;
+
+		min = MIN2(rgn->w, rgn->h);
+		size = min * min << rgn->bpps; // bytes in one square tile
+
+		// this is unfixable, and should not be happening
+		if(rgn->offset & (size - 1))
+			return -1;
+
+		v = (rgn->offset & ((1 << shift) - 1)) / size; // whole tiles to move from the offset into x or y
+		rgn->offset -= v * size;
+
+		if(rgn->h == min)
+		{
+			unsigned new_w;
+			rgn->x += rgn->h * v;
+			new_w = rgn->w + rgn->h * v;
+
+			while(rgn->w < new_w) // keep the width a power of two
+				rgn->w += rgn->w;
+		}
+		else
+		{
+			unsigned new_h;
+			rgn->y += rgn->w * v;
+			new_h = rgn->h + rgn->w * v;
+
+			while(rgn->h < new_h) // keep the height a power of two
+				rgn->h += rgn->h;
+		}
+	}
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tALIGNED ");
+	nv04_region_print(rgn);
+	fprintf(stderr, "\n");
+#endif
+	return 0;
+}
+
+// make both the pitch and the offset of rgn multiples of (1 << shift); returns 0 on success, -1 on failure
+// will leave the region unchanged if it fails
+static inline int
+nv04_region_align(struct nv04_region* rgn, unsigned w, unsigned h, int shift)
+{
+	const int mask = (1 << shift) - 1;
+	int needs_fixup;
+
+	if(rgn->pitch & mask)
+	{
+		if(h != 1) /* a misaligned pitch can only be repaired for single-row regions */
+			return -1;
+		needs_fixup = 1; /* the offset fixup will fix pitch too in this case */
+	}
+	else
+		needs_fixup = (rgn->offset & mask) != 0;
+
+	if(needs_fixup && nv04_region_do_align_offset(rgn, w, h, shift))
+		return -1;
+	return 0;
+}
+
+/* CPU copy of a w x h texel rectangle from src to dst (any mix of linear and swizzled); this contains 22 different copy loops after preprocessing. unfortunately, it's necessary */
+void
+nv04_region_copy_cpu(struct nv04_region* dst, struct nv04_region* src, int w, int h)
+{
+	uint8_t* mdst;
+	uint8_t* msrc;
+	int size;
+
+	if(dst->bo != src->bo)
+	{
+		nouveau_bo_map(dst->bo, NOUVEAU_BO_WR);
+		nouveau_bo_map(src->bo, NOUVEAU_BO_RD);
+	}
+	else
+		nouveau_bo_map(dst->bo, NOUVEAU_BO_WR | NOUVEAU_BO_RD);
+
+	mdst = (uint8_t*)dst->bo->map + dst->offset;
+	msrc = (uint8_t*)src->bo->map + src->offset;
+
+	size = w << dst->bpps; /* bytes per copied row */
+
+	nv04_region_assert(dst, w, h);
+	nv04_region_assert(src, w, h);
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tRGN_COPY_CPU [%i, %i: %i] ", w, h, dst->bpps);
+	for(int i = 0; i < 2; ++i)
+	{
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+
+//	for(int i = 0; i < 16; ++i)
+//		fprintf(stderr, "%02x ", msrc[i]);
+//	fprintf(stderr, "\n");
+#endif
+
+	// TODO: support overlapping copies!
+	if(src->pitch && dst->pitch)
+	{
+		mdst += dst->y * dst->pitch + (dst->x << dst->bpps);
+		msrc += src->y * src->pitch + (src->x << src->bpps);
+		if(dst->bo != src->bo)
+			goto simple;
+		else if(mdst < msrc)
+		{
+			if(mdst + size <= msrc)
+			{
+simple:
+				for(int iy = 0; iy < h; ++iy)
+				{
+					assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
+					assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size);
+					memcpy(mdst, msrc, size);
+					msrc += src->pitch; mdst += dst->pitch;
+				}
+			}
+			else
+			{
+				for(int iy = 0; iy < h; ++iy)
+				{
+					assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
+					assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size);
+					memmove(mdst, msrc, size);
+					msrc += src->pitch; mdst += dst->pitch;
+				}
+			}
+		}
+		else
+		{
+			/* copy backwards so we don't destroy data we have to read yet:
+			 * the destination starts at or after the source in the same
+			 * bo, so the rows are walked from last to first. Each row is
+			 * addressed through iy instead of stepping the pointers
+			 * forward from row 0, which would silently turn this back
+			 * into a top-down copy. */
+			for(int iy = h - 1; iy >= 0; --iy)
+			{
+				uint8_t* rdst = mdst + iy * dst->pitch;
+				uint8_t* rsrc = msrc + iy * src->pitch;
+
+				assert(rdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
+				assert(rsrc + size <= (uint8_t*)src->bo->map + src->bo->size);
+
+				/* memcpy is only valid when the row does not overlap
+				 * its own source; otherwise fall back to memmove */
+				if(rsrc + size <= rdst)
+					memcpy(rdst, rsrc, size);
+				else
+					memmove(rdst, rsrc, size);
+			}
+		}
+	}
+	else
+	{
+		int* dswx = NULL;
+		int* dswy = NULL;
+		int* sswx = NULL;
+		int* sswy = NULL;
+		int dir;
+
+		if(!dst->pitch)
+		{
+			dswx = alloca(w * sizeof(int));
+			for(int ix = 0; ix < w; ++ix) // we are adding, so z cannot be contributed by both
+				dswx[ix] = nv04_swizzle_bits(dst->x + ix, 0, 0, dst->w, dst->h, dst->d);
+			dswy = alloca(h * sizeof(int));
+			for(int iy = 0; iy < h; ++iy)
+				dswy[iy] = nv04_swizzle_bits(0, dst->y + iy, dst->z, dst->w, dst->h, dst->d);
+		}
+
+		if(!src->pitch)
+		{
+			sswx = alloca(w * sizeof(int));
+			for(int ix = 0; ix < w; ++ix)
+				sswx[ix] = nv04_swizzle_bits(src->x + ix, 0, 0, src->w, src->h, src->d);
+			sswy = alloca(h * sizeof(int));
+			for(int iy = 0; iy < h; ++iy)
+				sswy[iy] = nv04_swizzle_bits(0, src->y + iy, src->z, src->w, src->h, src->d);
+		}
+
+		dir = 1;
+		/* do backwards copies for overlapping swizzled surfaces */
+		if(dst->pitch == src->pitch && dst->offset == src->offset)
+		{
+			if(dst->y > src->y || (dst->y == src->y && dst->x > src->x))
+				dir = -1;
+		}
+
+#define SWIZZLED_COPY_LOOPS
+		if(dir == 1)
+		{
+			int dir = 1;
+#define LOOP_Y for(int iy = 0; iy < h; ++iy)
+#define LOOP_X for(int ix = 0; ix < w; ++ix)
+#include "nv04_2d_loops.h"
+#undef LOOP_X
+#undef LOOP_Y
+		}
+		else
+		{
+			int dir = -1;
+#define LOOP_Y for(int iy = h - 1; iy >= 0; --iy)
+#define LOOP_X for(int ix = w - 1; ix >= 0; --ix)
+#include "nv04_2d_loops.h"
+#undef LOOP_X
+#undef LOOP_Y
+		}
+#undef SWIZZLED_COPY_LOOPS
+	}
+
+	if(src->bo != dst->bo)
+		nouveau_bo_unmap(src->bo);
+	nouveau_bo_unmap(dst->bo);
+}
+
+/* TODO: if the destination is swizzled, we are doing random writes, which causes write combining to fail
+ * the alternative is to read, modify and copy back, which may or may not be faster
+ * loading 3D textures is a common case that hits this and could probably benefit from the temporary
+ * (fills the w x h rectangle of dst with value on the CPU; value is truncated to the texel size) */
+void
+nv04_region_fill_cpu(struct nv04_region* dst, int w, int h, unsigned value)
+{
+	uint8_t* mdst = (nouveau_bo_map(dst->bo, NOUVEAU_BO_WR), (uint8_t*)dst->bo->map + dst->offset);
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tRGN_FILL_CPU ");
+	nv04_region_print(dst);
+	fprintf(stderr, "\n");
+#endif
+
+	nv04_region_assert(dst, w, h);
+
+	if(dst->pitch)
+	{
+		unsigned size = w << dst->bpps; /* bytes per filled row */
+
+#define FILL(T) do { \
+			for(int iy = 0; iy < h; ++iy) \
+			{ \
+				assert((char*)((T*)mdst + w) <= (char*)dst->bo->map + dst->bo->size); \
+				for(int ix = 0; ix < w; ++ix) \
+					((T*)mdst)[ix] = (T)value; \
+				mdst += dst->pitch; \
+			} \
+		} while(0)
+
+		mdst += dst->y * dst->pitch + (dst->x << dst->bpps);
+
+		if(dst->bpps == 0)
+		{
+ms:
+			/* also reached from below when every byte of the texel value is the same */
+			assert(mdst + size * h <= (uint8_t*)dst->bo->map + dst->bo->size);
+			if(size == dst->pitch)
+				memset(mdst, (uint8_t)value, size * h);
+			else
+			{
+				for(int iy = 0; iy < h; ++iy)
+				{
+					assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
+					memset(mdst, (uint8_t)value, size);
+					mdst += dst->pitch;
+				}
+			}
+		}
+		else if(dst->bpps == 1)
+		{
+			if(!((uint8_t)value ^ (uint8_t)(value >> 8)))
+				goto ms;
+			FILL(uint16_t);
+		}
+		else if(dst->bpps == 2)
+		{
+			/* unsigned constant: with a plain int the product overflows for bytes >= 0x80 */
+			if(value == (uint8_t)value * 0x1010101u)
+				goto ms;
+			FILL(uint32_t);
+		}
+		else
+			assert(0);
+#undef FILL
+	}
+	else
+	{
+		int* dswx;
+		int* dswy;
+
+		dswx = alloca(w * sizeof(int));
+		for(int ix = 0; ix < w; ++ix) // the two tables are added, so z must be contributed by dswy only
+			dswx[ix] = nv04_swizzle_bits(dst->x + ix, 0, 0, dst->w, dst->h, dst->d);
+		dswy = alloca(h * sizeof(int));
+		for(int iy = 0; iy < h; ++iy)
+			dswy[iy] = nv04_swizzle_bits(0, dst->y + iy, dst->z, dst->w, dst->h, dst->d);
+
+#define FILL(T) do { \
+			T tvalue = (T)value; \
+			for(int iy = 0; iy < h; ++iy) \
+			{ \
+				T* pdst = (T*)mdst + dswy[iy]; \
+				for(int ix = 0; ix < w; ++ix) \
+				{ \
+					assert((uint8_t*)&pdst[dswx[ix] + 1] <= (uint8_t*)dst->bo->map + dst->bo->size); \
+					pdst[dswx[ix]] = tvalue; \
+				} \
+			} \
+		} while(0)
+
+		if(dst->bpps == 0)
+			FILL(uint8_t);
+		else if(dst->bpps == 1)
+			FILL(uint16_t);
+		else if(dst->bpps == 2)
+			FILL(uint32_t);
+		else
+			assert(0 && "unhandled bpp");
+#undef FILL
+	}
+
+	nouveau_bo_unmap(dst->bo);
+}
+/* Emit the commands copying a w x h rectangle from linear src into 2D-swizzled dst, one chunk of at most 1024x1024 texels at a time */
+static void
+nv04_region_copy_swizzle(struct nv04_2d_context *ctx,
+			  struct nv04_region* dst,
+			  struct nv04_region* src,
+			  int w, int h, int cs2d_format, int sifm_format)
+{
+	struct nouveau_channel *chan = ctx->swzsurf->channel;
+	struct nouveau_grobj *swzsurf = ctx->swzsurf;
+	struct nouveau_grobj *sifm = ctx->sifm;
+	/* Max width & height may not be the same on all HW, but must be POT */
+	unsigned max_shift = 10;
+	unsigned cw = 1 << max_shift;
+	unsigned ch = 1 << max_shift;
+	unsigned sx = dst->x >> max_shift;
+	unsigned sy = dst->y >> max_shift;
+	unsigned ex = (dst->x + w - 1) >> max_shift;
+	unsigned ey = (dst->y + h - 1) >> max_shift;
+	unsigned chunks = (ex - sx + 1) * (ey - sy + 1);
+	unsigned chunk_size;
+	if(dst->w < cw)
+		cw = dst->w;
+	if(dst->h < ch)
+		ch = dst->h;
+	chunk_size = cw * ch << dst->bpps;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tRGN_COPY_SWIZZLE [%i, %i: %i] ", w, h, dst->bpps);
+	for(int i = 0; i < 2; ++i)
+	{
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+#endif
+
+	nv04_region_assert(dst, w, h);
+	nv04_region_assert(src, w, h);
+
+	MARK_RING (chan, 8 + chunks * 17, 2 + chunks * 2); /* 8 dwords of setup, then 17 dwords and 2 relocations per chunk */
+
+	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1);
+	OUT_RELOCo(chan, dst->bo,
+			NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_FORMAT, 1);
+	OUT_RING  (chan, cs2d_format |
+			 log2i(cw) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT |
+			 log2i(ch) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT);
+
+	BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1);
+	OUT_RELOCo(chan, src->bo,
+			 NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1);
+	OUT_RING  (chan, swzsurf->handle);
+
+	assert(!(dst->offset & 63));
+
+	for (int cy = sy; cy <= ey; ++cy) {
+	  int ry = MAX2(0, (int)(dst->y - ch * cy)); /* first row of the rectangle inside this chunk */
+	  int rh = MIN2((int)ch, (int)(dst->y - ch * cy + h)) - ry;
+	  for (int cx = sx; cx <= ex; ++cx) {
+	    int rx = MAX2(0, (int)(dst->x - cw * cx)); /* first column of the rectangle inside this chunk */
+	    int rw = MIN2((int)cw, (int)(dst->x - cw * cx + w)) - rx;
+	    unsigned dst_offset;
+	    unsigned src_offset;
+
+	    BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
+
+	    dst_offset = dst->offset + (nv04_swizzle_bits_2d(cx * cw, cy * ch, dst->w, dst->h) << dst->bpps);
+	    assert(dst_offset <= dst->bo->size);
+	    assert(dst_offset + chunk_size <= dst->bo->size);
+	    OUT_RELOCl(chan, dst->bo, dst_offset,
+			    NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	    BEGIN_RING(chan, sifm, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
+	    OUT_RING  (chan, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
+	    OUT_RING  (chan, sifm_format);
+	    OUT_RING  (chan, NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
+	    OUT_RING  (chan, rx | (ry << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT));
+	    OUT_RING  (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | rw);
+	    OUT_RING  (chan, rx | (ry << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT));
+	    OUT_RING  (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | rw);
+	    OUT_RING  (chan, 1 << 20);
+	    OUT_RING  (chan, 1 << 20);
+
+	    BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_SIZE, 4);
+	    OUT_RING  (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_SIZE_H_SHIFT | align(rw, 8));
+	    OUT_RING  (chan, src->pitch |
+			     NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER |
+			     NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE);
+	    src_offset = src->offset + (cy * ch + ry + src->y - dst->y) * src->pitch + ((cx * cw + rx + src->x - dst->x) << src->bpps);
+	    assert(src_offset <= src->bo->size);
+	    assert(src_offset + (src->pitch * (rh - 1)) + (rw << src->bpps) <= src->bo->size);
+	    OUT_RELOCl(chan, src->bo, src_offset,
+			     NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	    OUT_RING  (chan, 0);
+	  }
+	}
+}
+/* Open an m2mf sequence: MARK_RING for the setup plus `commands` nv04_copy_m2mf_body calls, then emit srcbo and dstbo as the two DMA buffer objects */
+static inline void
+nv04_copy_m2mf_begin(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, struct nouveau_bo* srcbo, unsigned commands)
+{
+	struct nouveau_channel *chan = ctx->m2mf->channel;
+	struct nouveau_grobj *m2mf = ctx->m2mf;
+	MARK_RING (chan, 3 + commands * 9, 2 + commands * 2); /* 3 dwords here, 9 dwords and 2 relocations per body */
+	BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2);
+	OUT_RELOCo(chan, srcbo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCo(chan, dstbo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+}
+/* Emit one m2mf transfer of `lines` rows of `size` bytes, then advance *psrcoff and *pdstoff past the rows just queued */
+static inline void
+nv04_copy_m2mf_body(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, int* pdstoff, unsigned dstpitch, struct nouveau_bo* srcbo, int* psrcoff, unsigned srcpitch, unsigned size, unsigned lines)
+{
+	struct nouveau_channel *chan = ctx->m2mf->channel;
+	struct nouveau_grobj *m2mf = ctx->m2mf;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\t\t\tCOPY_M2MF_BODY [%i, %i] <%i[%u]> lin %u <- <%i[%u]> lin %u\n", size, lines, dstbo->handle, *pdstoff, dstpitch, srcbo->handle, *psrcoff, srcpitch);
+#endif
+
+	BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
+	OUT_RELOCl(chan, srcbo, *psrcoff,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RELOCl(chan, dstbo, *pdstoff,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR);
+	OUT_RING  (chan, srcpitch);
+	OUT_RING  (chan, dstpitch);
+	OUT_RING  (chan, size);
+	OUT_RING  (chan, lines);
+	OUT_RING  (chan, 0x0101);
+	OUT_RING  (chan, 0);
+
+	*psrcoff += srcpitch * lines;
+	*pdstoff += dstpitch * lines;
+}
+/* Copy h rows of `size` bytes with m2mf, split into transfers of at most max_lines rows and, when a pitch exceeds max_pitch, at most max_pitch bytes per row */
+static void
+nv04_copy_m2mf(struct nv04_2d_context *ctx,
+		struct nouveau_bo* dstbo, int dstoff, unsigned dstpitch,
+		struct nouveau_bo* srcbo, int srcoff, unsigned srcpitch,
+		unsigned size, unsigned h)
+{
+	unsigned max_pitch = 32767;
+	unsigned max_lines = 2047;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\t\tCOPY_M2MF [%i, %i] <%i[%i]> lin %u <- <%i[%i]> lin %u\n", size, h, dstbo->handle, dstoff, dstpitch, srcbo->handle, srcoff, srcpitch);
+#endif
+
+	if(srcpitch <= max_pitch && dstpitch <= max_pitch)
+	{
+		/* both pitches fit: only the row count has to be split */
+		unsigned full_pages = h / max_lines;
+		unsigned leftover_lines = h - full_pages * max_lines;
+
+		nv04_copy_m2mf_begin(ctx, dstbo, srcbo, full_pages + !!leftover_lines);
+
+		for(unsigned i = 0; i < full_pages; ++i)
+			nv04_copy_m2mf_body(ctx, dstbo, &dstoff, dstpitch, srcbo, &srcoff, srcpitch, size, max_lines);
+
+		if(leftover_lines)
+			nv04_copy_m2mf_body(ctx, dstbo, &dstoff, dstpitch, srcbo, &srcoff, srcpitch, size, leftover_lines);
+	}
+	else
+	{
+		/* a pitch is too large: copy each row by itself, as `lines` pieces of max_pitch bytes plus a remainder */
+		unsigned lines = size / max_pitch;
+		unsigned leftover = size - lines * max_pitch;
+		unsigned full_pages = lines / max_lines;
+		unsigned leftover_lines = lines - full_pages * max_lines;
+		unsigned srcgap = srcpitch - size;
+		unsigned dstgap = dstpitch - size;
+
+		nv04_copy_m2mf_begin(ctx, dstbo, srcbo, h * (full_pages + !!leftover_lines + !!leftover));
+
+		for(unsigned i = 0; i < h; ++i)
+		{
+			for(unsigned j = 0; j < full_pages; ++j)
+				nv04_copy_m2mf_body(ctx, dstbo, &dstoff, max_pitch, srcbo, &srcoff, max_pitch, max_pitch, max_lines);
+
+			if(leftover_lines)
+				nv04_copy_m2mf_body(ctx, dstbo, &dstoff, max_pitch, srcbo, &srcoff, max_pitch, max_pitch, leftover_lines);
+
+			if(leftover)
+				nv04_copy_m2mf_body(ctx, dstbo, &dstoff, leftover, srcbo, &srcoff, leftover, leftover, 1);
+
+			srcoff += srcgap; /* skip the padding between the end of this row and the next one */
+			dstoff += dstgap;
+		}
+	}
+}
+/* Copy `size` bytes from srcbo at srcoff to dstbo at dstoff with m2mf, expressed as a single row whose pitch equals size */
+void
+nv04_memcpy(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, int dstoff, struct nouveau_bo* srcbo, int srcoff, unsigned size)
+{
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tMEMCPY [%i] <%i[%i]> <- <%i[%i]>\n", size, dstbo->handle, dstoff, srcbo->handle, srcoff);
+#endif
+
+	nv04_copy_m2mf(ctx, dstbo, dstoff, size, srcbo, srcoff, size, size, 1);
+}
+/* Copy a w x h rectangle between two linear regions with m2mf; the region origins are folded into the byte offsets passed down */
+static void
+nv04_region_copy_m2mf(struct nv04_2d_context *ctx, struct nv04_region *dst, struct nv04_region *src, int w, int h)
+{
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tRGN_COPY_M2MF [%i, %i: %i] ", w, h, dst->bpps);
+	for(int i = 0; i < 2; ++i)
+	{
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+#endif
+
+	nv04_region_assert(dst, w, h);
+	nv04_region_assert(src, w, h);
+	assert(src->pitch); /* both regions must be linear */
+	assert(dst->pitch);
+
+	nv04_copy_m2mf(ctx,
+			dst->bo, dst->offset + dst->y * dst->pitch + (dst->x << dst->bpps), dst->pitch,
+			src->bo, src->offset + src->y * src->pitch + (src->x << src->bpps), src->pitch,
+			w << src->bpps, h);
+}
+/* Copy a w x h rectangle between two linear regions with the surf2d and blit objects; both pitches must be nonzero multiples of 64 */
+static inline void
+nv04_region_copy_blit(struct nv04_2d_context *ctx, struct nv04_region* dst, struct nv04_region* src, int w, int h, int format)
+{
+	struct nouveau_channel *chan = ctx->surf2d->channel;
+	struct nouveau_grobj *surf2d = ctx->surf2d;
+	struct nouveau_grobj *blit = ctx->blit;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tRGN_COPY_BLIT [%i, %i: %i] ", w, h, dst->bpps);
+	for(int i = 0; i < 2; ++i)
+	{
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+#endif
+
+	assert(!(src->pitch & 63) && src->pitch);
+	assert(!(dst->pitch & 63) && dst->pitch);
+	nv04_region_assert(dst, w, h);
+	nv04_region_assert(src, w, h);
+
+	MARK_RING (chan, 12, 4); /* 12 dwords and 4 relocations follow */
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RELOCo(chan, src->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
+	OUT_RING  (chan, format);
+	OUT_RING  (chan, (dst->pitch << 16) | src->pitch);
+	OUT_RELOCl(chan, src->bo, src->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	BEGIN_RING(chan, blit, 0x0300, 3);
+	OUT_RING  (chan, (src->y << 16) | src->x);
+	OUT_RING  (chan, (dst->y << 16) | dst->x);
+	OUT_RING  (chan, ( h << 16) |  w);
+}
+
+/* THEOREM: a non-linearizable swizzled destination is always 64-byte-aligned, except for 4x2 mipmap levels of swizzled 1bpp surfaces
+ * HYPOTHESIS:
+ * 1. The first mipmap level is 64-byte-aligned
+ * PROOF:
+ * 1. Thus, all mipmap levels whose parent is 64 bytes or more in size are 64-byte-aligned too.
+ * 2. At 1bpp, the smallest levels with a <= 32-byte parent are either Nx1 or 1xN or of size <= 8, thus 4x2, 2x2 or 2x4
+ * 3. Nx1, 1xN, 2x4 and 2x2 have all their subrects linearizable. 4x2 does not.
+ * 4. At 2/4bpp or more, the smallest levels with a 32-byte parent are 1xN, Nx1 or 2x2
+ *
+ * However, nv04_region_align handles that.
+ */
+
+// copy a w x h rectangle from src to dst with the 2D engine if possible: 0 -> done, 1 -> do with 3D engine or CPU, -1 -> do with CPU
+// dst and src may be modified, and the possibly modified version should be passed to nv04_region_copy_cpu if necessary
+int
+nv04_region_copy_2d(struct nv04_2d_context *ctx, struct nv04_region* dst, struct nv04_region* src,
+		int w, int h, int cs2d_format, int sifm_format, int dst_to_gpu, int src_on_gpu)
+{
+	assert(src->bpps == dst->bpps);
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "RGN_COPY%s [%i, %i: %i] ", (cs2d_format >= 0) ? "_2D" : "_NO2D", w, h, dst->bpps);
+	for(int i = 0; i < 2; ++i)
+	{
+		int gpu = i ? src_on_gpu : dst_to_gpu;
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, " %s", gpu ? "gpu" : "cpu");
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+#endif
+
+	// if they are contiguous and either both swizzled or both linear, reshape
+	if(!dst->pitch == !src->pitch
+		&& nv04_region_is_contiguous(dst, w, h)
+		&& nv04_region_is_contiguous(src, w, h))
+	{
+		nv04_region_contiguous_shape(dst, &w, &h, 6);
+		nv04_region_linearize_contiguous(dst, w, h);
+		nv04_region_linearize_contiguous(src, w, h);
+	}
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tOPT ");
+	for(int i = 0; i < 2; ++i)
+	{
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+#endif
+
+	/* if the destination is not for GPU _and_ source is on CPU, use CPU */
+	/* if the destination is not for GPU _or_ source is on CPU, use CPU only if we think it's faster than the GPU */
+	/* TODO: benchmark to find out in which cases exactly we should prefer the CPU */
+	 if((!dst_to_gpu && !src_on_gpu)
+		|| (!dst->pitch && dst->d > 1)
+		/* 3D swizzled destination are unwritable by the GPU, and 2D swizzled ones are readable only by the 3D engine */
+	 )
+		 return -1;
+	/* there is no known way to read 2D/3D-swizzled surfaces with the 2D engine
+	 * ask the caller to use the 3D engine
+	 * If a format cannot be sampled from the 3D engine there is no point in making it swizzled, so we must not do so
+	 */
+	 else if(!src->pitch)
+	 {
+#ifdef NV04_REGION_DEBUG
+		fprintf(stderr, "\tCOPY_ENG3D\n");
+#endif
+		 return 1;
+	 }
+	/* Setup transfer to swizzle the texture to vram if needed */
+	else
+	{
+		if (!dst->pitch)
+		{
+			if(cs2d_format < 0 || sifm_format < 0 || !dst_to_gpu)
+			{
+#ifdef NV04_REGION_DEBUG
+				fprintf(stderr, "\tCOPY_ENG3D\n");
+#endif
+				return 1;
+			}
+			if(nv04_region_align(dst, w, h, 6)) /* not inside assert(): the call fixes up dst and must run under NDEBUG too */
+			{
+				assert(0); /* not expected to happen, see the THEOREM above */
+				return -1;
+			}
+			nv04_region_copy_swizzle(ctx, dst, src, w, h, cs2d_format, sifm_format);
+			return 0;
+		}
+		else
+		{
+			/* NV_CONTEXT_SURFACES_2D has buffer alignment restrictions, fallback
+			 * to NV_MEMORY_TO_MEMORY_FORMAT in this case.
+			 * TODO: is this also true for the source? possibly not
+			 */
+
+			if ((cs2d_format < 0)
+				|| !dst_to_gpu
+				|| nv04_region_align(src, w, h, 6)
+				|| nv04_region_align(dst, w, h, 6)
+				)
+				nv04_region_copy_m2mf(ctx, dst, src, w, h);
+			else
+				nv04_region_copy_blit(ctx, dst, src, w, h, cs2d_format);
+
+			return 0;
+		}
+	}
+}
+/* Emit the commands for a solid fill of the w x h rectangle at (dst->x, dst->y) in dst with value, using the GDI rectangle object. */
+static inline void
+nv04_region_fill_gdirect(struct nv04_2d_context *ctx, struct nv04_region* dst, int w, int h, unsigned value)
+{
+	struct nouveau_channel *chan = ctx->surf2d->channel;
+	struct nouveau_grobj *surf2d = ctx->surf2d;
+	struct nouveau_grobj *rect = ctx->rect;
+	int cs2d_format, gdirect_format;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tFILL_GDIRECT\n");
+#endif
+
+	assert(!(dst->pitch & 63) && dst->pitch); /* pitch must be nonzero (0 means swizzled) and a multiple of 64 */
+	nv04_region_assert(dst, w, h);
+
+	if(dst->bpps == 0) /* bpps is the bpp shift; each case pairs a rectangle color format with a surface format */
+	{
+		gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
+		cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
+	}
+	else if(dst->bpps == 1)
+	{
+		gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5;
+		cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y16;
+	}
+	else if(dst->bpps == 2)
+	{
+		gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
+		cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y32;
+	}
+	else
+	{
+		assert(0); /* other element sizes are not handled by this path */
+		gdirect_format = 0;
+		cs2d_format = 0;
+	}
+
+	MARK_RING (chan, 15, 4); /* 15 = 5 method headers + 10 data words below, 4 = the OUT_RELOC* calls; presumably the space MARK_RING reserves — confirm */
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); /* both DMA image slots are pointed at dst->bo */
+	OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
+	OUT_RING  (chan, cs2d_format);
+	OUT_RING  (chan, (dst->pitch << 16) | dst->pitch); /* the same pitch goes in both halves of the word */
+	OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); /* and the same offset in both offset slots */
+	OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT, 1);
+	OUT_RING  (chan, gdirect_format);
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR1_A, 1);
+	OUT_RING  (chan, value); /* the fill value, passed through unchanged */
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(0), 2);
+	OUT_RING  (chan, (dst->x << 16) | dst->y); /* origin: x in the high 16 bits, y in the low 16 bits */
+	OUT_RING  (chan, ( w << 16) |  h); /* size: w in the high 16 bits, h in the low 16 bits */
+}
+/* Fill a w x h rectangle of dst with value: 0 -> done, 1 -> use the 3D engine, -1 -> not possible here (presumably CPU, as for copies) */
+int
+nv04_region_fill_2d(struct nv04_2d_context *ctx, struct nv04_region *dst,
+		  int w, int h, unsigned value)
+{
+	if(w == 0 || h == 0)
+		return 0;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "FILL [%i, %i: %i] ", w, h, dst->bpps);
+	nv04_region_print(dst);
+	fprintf(stderr, " <- 0x%x\n", value);
+#endif
+
+	/* if the region is contiguous, reshape and linearize it first */
+	if(nv04_region_is_contiguous(dst, w, h))
+	{
+		nv04_region_contiguous_shape(dst, &w, &h, 6);
+		nv04_region_linearize_contiguous(dst, w, h);
+	}
+
+	// TODO: maybe do intermediate copies for some cases instead of using the 3D engine/CPU
+	/* GdiRect doesn't work together with swzsurf, so the 3D engine, or an intermediate copy, is the only option here */
+	if(dst->pitch == 0)
+	{
+#ifdef NV04_REGION_DEBUG
+		fprintf(stderr, "\tFILL_ENG3D\n");
+#endif
+		return 1;
+	}
+
+	/* linear destination: GdiRect is usable only when the region can be aligned */
+	if(nv04_region_align(dst, w, h, 6) != 0)
+		return -1;
+	nv04_region_fill_gdirect(ctx, dst, w, h, value);
+	return 0;
+}
+/* Free every object owned by ctx and then ctx itself; a NULL ctx is ignored, as with free(). */
+void
+nv04_2d_context_takedown(struct nv04_2d_context *ctx)
+{
+	if(!ctx)
+		return;
+	nouveau_notifier_free(&ctx->ntfy);
+	nouveau_grobj_free(&ctx->m2mf);
+	nouveau_grobj_free(&ctx->surf2d);
+	nouveau_grobj_free(&ctx->swzsurf);
+	nouveau_grobj_free(&ctx->rect);
+	nouveau_grobj_free(&ctx->blit);
+	nouveau_grobj_free(&ctx->sifm);
+	free(ctx);
+}
+/* Allocate a 2D context on chan: the notifier plus the m2mf, surf2d, blit, rect, swzsurf and sifm objects, with their one-time setup. Returns NULL on failure. */
+struct nv04_2d_context *
+nv04_2d_context_init(struct nouveau_channel* chan)
+{
+	struct nv04_2d_context *ctx = calloc(1, sizeof(struct nv04_2d_context));
+	unsigned handle = 0x88000000, class; /* handle is post-incremented so every object gets its own value */
+	int ret;
+
+	if (!ctx)
+		return NULL;
+
+	ret = nouveau_notifier_alloc(chan, handle++, 1, &ctx->ntfy); /* its handle is given to m2mf, blit and rect below */
+	if (ret) {
+		nv04_2d_context_takedown(ctx); /* frees whatever was allocated so far, and ctx itself */
+		return NULL;
+	}
+
+	ret = nouveau_grobj_alloc(chan, handle++, 0x0039, &ctx->m2mf); /* NOTE(review): 0x0039 is presumably the memory-to-memory-format class — confirm */
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	BEGIN_RING(chan, ctx->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+
+	if (chan->device->chipset < 0x10) /* the 2D context surface class depends on the chipset */
+		class = NV04_CONTEXT_SURFACES_2D;
+	else
+		class = NV10_CONTEXT_SURFACES_2D;
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->surf2d);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	BEGIN_RING(chan, ctx->surf2d,
+			 NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RING  (chan, chan->vram->handle); /* both DMA image slots start out on the channel's vram object */
+	OUT_RING  (chan, chan->vram->handle);
+
+	if (chan->device->chipset < 0x10) /* same chipset split for the image blit class */
+		class = NV04_IMAGE_BLIT;
+	else
+		class = NV12_IMAGE_BLIT;
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->blit);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+	BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_SURFACE, 1);
+	OUT_RING  (chan, ctx->surf2d->handle); /* blit uses surf2d as its surface */
+	BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_OPERATION, 1);
+	OUT_RING  (chan, NV01_IMAGE_BLIT_OPERATION_SRCCOPY);
+
+	ret = nouveau_grobj_alloc(chan, handle++, NV04_GDI_RECTANGLE_TEXT,
+				  &ctx->rect);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_SURFACE, 1);
+	OUT_RING  (chan, ctx->surf2d->handle); /* rect also uses surf2d as its surface */
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_OPERATION, 1);
+	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY);
+	BEGIN_RING(chan, ctx->rect,
+			 NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT, 1);
+	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE);
+
+	switch (chan->device->chipset & 0xf0) { /* swizzled surface class, chosen by the high nibble of the chipset id */
+	case 0x00:
+	case 0x10:
+		class = NV04_SWIZZLED_SURFACE;
+		break;
+	case 0x20:
+		class = NV20_SWIZZLED_SURFACE;
+		break;
+	case 0x30:
+		class = NV30_SWIZZLED_SURFACE;
+		break;
+	case 0x40:
+	case 0x60:
+		class = NV40_SWIZZLED_SURFACE;
+		break;
+	default:
+		/* Famous last words: this really can't happen.. */
+		assert(0);
+		break; /* NOTE(review): with NDEBUG, class keeps the image blit value assigned above — confirm this is acceptable */
+	}
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->swzsurf);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	/* all the Gallium MARK_RING calculations assume no autobinding, so do that now */
+	if(ctx->swzsurf->bound == NOUVEAU_GROBJ_UNBOUND)
+		nouveau_grobj_autobind(ctx->swzsurf);
+
+	switch (chan->device->chipset & 0xf0) { /* scaled-image-from-memory class, same selection; unlisted families fall back to NV04 */
+	case 0x10:
+	case 0x20:
+		class = NV10_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	case 0x30:
+		class = NV30_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	case 0x40:
+	case 0x60:
+		class = NV40_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	default:
+		class = NV04_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	}
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->sifm);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	/* all the Gallium MARK_RING calculations assume no autobinding, so do that now */
+	if(ctx->sifm->bound == NOUVEAU_GROBJ_UNBOUND)
+		nouveau_grobj_autobind(ctx->sifm);
+
+	return ctx; /* the caller releases it with nv04_2d_context_takedown */
+}
diff --git a/src/gallium/drivers/nvfx/nv04_2d.h b/src/gallium/drivers/nvfx/nv04_2d.h
new file mode 100644
index 0000000000..e638b8c874
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv04_2d.h
@@ -0,0 +1,87 @@
+/**************************************************************************
+ *
+ * Copyright 2009 Ben Skeggs
+ * Copyright 2009 Younes Manton
+ * Copyright 2010 Luca Barbieri
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+/* this code has no Mesa or Gallium dependency and can be reused in the classic Mesa driver or DDX */
+
+#ifndef __NV04_2D_H__
+#define __NV04_2D_H__
+
+struct nv04_2d_context;
+struct nouveau_channel;
+struct nouveau_bo;
+
+// NOTE: all functions taking this as a parameter will CLOBBER it (except for ->bo)
+struct nv04_region {
+	struct nouveau_bo* bo; // buffer object that holds the surface
+	int offset; // offset of the surface inside bo, as passed to OUT_RELOCl (presumably in bytes — confirm)
+	unsigned pitch; // 0 -> swizzled
+	unsigned bpps; // bpp shift (0, 1, 2; 3, 4 for fp/compressed)
+	unsigned x, y, z; // origin of the rectangle; x and y are emitted as the fill point (z presumably the slice of a 3D surface — confirm)
+	unsigned w, h, d; // presumably the size of the whole surface, d > 1 being treated as 3D — confirm
+};
+
+void
+nv04_memcpy(struct nv04_2d_context *ctx,
+		struct nouveau_bo* dstbo, int dstoff,
+		struct nouveau_bo* srcbo, int srcoff,
+		unsigned size);
+
+unsigned
+nv04_region_begin(struct nv04_region* rgn, unsigned w, unsigned h);
+
+unsigned
+nv04_region_end(struct nv04_region* rgn, unsigned w, unsigned h);
+
+void
+nv04_2d_context_takedown(struct nv04_2d_context *pctx);
+
+struct nv04_2d_context *
+nv04_2d_context_init(struct nouveau_channel* chan);
+
+void
+nv04_region_copy_cpu(struct nv04_region* dst, struct nv04_region* src, int w, int h);
+
+void
+nv04_region_fill_cpu(struct nv04_region* dst, int w, int h, unsigned value);
+
+int /* 0 -> done, 1 -> do with 3D engine or CPU, -1 -> do with CPU; dst and src may be modified */
+nv04_region_copy_2d(struct nv04_2d_context *ctx,
+		struct nv04_region* dst, struct nv04_region* src,
+		int w, int h,
+		int cs2d_format, int sifm_format,
+		int dst_to_gpu, int src_on_gpu);
+
+int /* 0 -> done, 1 -> use the 3D engine; -1 presumably means "use the CPU", as for nv04_region_copy_2d — confirm */
+nv04_region_fill_2d(struct nv04_2d_context *ctx,
+		struct nv04_region *dst,
+                int w, int h,
+                unsigned value);
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nv04_2d_loops.h b/src/gallium/drivers/nvfx/nv04_2d_loops.h
new file mode 100644
index 0000000000..3a6787c071
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv04_2d_loops.h
@@ -0,0 +1,70 @@
+#ifndef T /* first pass (T undefined): dispatch on dst->bpps and re-include this file once per element type */
+{
+	if(dst->bpps == 0) /* each re-inclusion below supplies the statement for the preceding if/else */
+#define T uint8_t
+#include "nv04_2d_loops.h"
+#undef T
+	else if(dst->bpps == 1)
+#define T uint16_t
+#include "nv04_2d_loops.h"
+#undef T
+	else if(dst->bpps == 2)
+#define T uint32_t
+#include "nv04_2d_loops.h"
+#undef T
+	else
+		assert(0); /* only bpps 0, 1 and 2 are handled here */
+}
+#else /* second pass (T defined): emit the loop body for element type T */
+#ifdef SWIZZLED_COPY_LOOPS /* LOOP_X/LOOP_Y, ix/iy, dir, h, mdst/msrc and the dsw*/ssw* arrays come from the including code */
+{
+	if(!dst->pitch) /* pitch 0: swizzled destination, addressed through dswx/dswy */
+	{
+		if(!src->pitch) /* swizzled source as well, addressed through sswx/sswy */
+		{
+			LOOP_Y
+			{
+				T* pdst = (T*)mdst + dswy[iy];
+				T* psrc = (T*)msrc + sswy[iy];
+				LOOP_X
+				{
+					assert((char*)&psrc[sswx[ix] + 1] <= ((char*)src->bo->map + src->bo->size)); /* element must end inside the mapped bo */
+					assert((char*)&pdst[dswx[ix] + 1] <= ((char*)dst->bo->map + dst->bo->size));
+					pdst[dswx[ix]] = psrc[sswx[ix]];
+				}
+			}
+		}
+		else /* source has a pitch: walk it row by row */
+		{
+			T* psrc = (T*)(msrc + ((dir > 0) ? src->y : (src->y + h - 1)) * src->pitch) + src->x; /* first row when dir > 0, otherwise the last row */
+			LOOP_Y
+			{
+				T* pdst = (T*)mdst + dswy[iy];
+				LOOP_X
+				{
+					assert((char*)&psrc[ix + 1] <= ((char*)src->bo->map + src->bo->size));
+					assert((char*)&pdst[dswx[ix] + 1] <= ((char*)dst->bo->map + dst->bo->size));
+					pdst[dswx[ix]] = psrc[ix];
+				}
+				psrc = (T*)((char*)psrc + dir * src->pitch); /* step one row of src->pitch bytes in direction dir */
+			}
+		}
+	}
+	else /* destination has a pitch; the source is read through sswx/sswy */
+	{
+		T* pdst = (T*)(mdst + ((dir > 0) ? dst->y : (dst->y + h - 1)) * dst->pitch) + dst->x; /* first row when dir > 0, otherwise the last row */
+		LOOP_Y
+		{
+			T* psrc = (T*)msrc + sswy[iy];
+			LOOP_X
+			{
+				assert((char*)&psrc[sswx[ix] + 1] <= ((char*)src->bo->map + src->bo->size));
+				assert((char*)&pdst[ix + 1] <= ((char*)dst->bo->map + dst->bo->size));
+				pdst[ix] = psrc[sswx[ix]];
+			}
+			pdst = (T*)((char*)pdst + dir * dst->pitch); /* step one row of dst->pitch bytes in direction dir */
+		}
+	}
+}
+#endif /* SWIZZLED_COPY_LOOPS */
+#endif /* T */
diff --git a/src/gallium/drivers/nvfx/nv04_surface_2d.c b/src/gallium/drivers/nvfx/nv04_surface_2d.c
deleted file mode 100644
index 7acbb505df..0000000000
--- a/src/gallium/drivers/nvfx/nv04_surface_2d.c
+++ /dev/null
@@ -1,532 +0,0 @@
-#include "pipe/p_context.h"
-#include "pipe/p_format.h"
-#include "util/u_format.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-
-#include "nouveau/nouveau_winsys.h"
-#include "nouveau/nouveau_util.h"
-#include "nouveau/nouveau_screen.h"
-#include "nv04_surface_2d.h"
-
-static INLINE int
-nv04_surface_format(enum pipe_format format)
-{
-	switch (format) {
-	case PIPE_FORMAT_A8_UNORM:
-	case PIPE_FORMAT_L8_UNORM:
-	case PIPE_FORMAT_I8_UNORM:
-		return NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
-	case PIPE_FORMAT_R16_SNORM:
-	case PIPE_FORMAT_B5G6R5_UNORM:
-	case PIPE_FORMAT_Z16_UNORM:
-	case PIPE_FORMAT_L8A8_UNORM:
-		return NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5;
-	case PIPE_FORMAT_B8G8R8X8_UNORM:
-	case PIPE_FORMAT_B8G8R8A8_UNORM:
-		return NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8;
-	case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
-	case PIPE_FORMAT_X8Z24_UNORM:
-		return NV04_CONTEXT_SURFACES_2D_FORMAT_Y32;
-	default:
-		return -1;
-	}
-}
-
-static INLINE int
-nv04_rect_format(enum pipe_format format)
-{
-	switch (format) {
-	case PIPE_FORMAT_A8_UNORM:
-		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
-	case PIPE_FORMAT_B5G6R5_UNORM:
-	case PIPE_FORMAT_L8A8_UNORM:
-	case PIPE_FORMAT_Z16_UNORM:
-		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5;
-	case PIPE_FORMAT_B8G8R8X8_UNORM:
-	case PIPE_FORMAT_B8G8R8A8_UNORM:
-	case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
-	case PIPE_FORMAT_X8Z24_UNORM:
-		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
-	default:
-		return -1;
-	}
-}
-
-static INLINE int
-nv04_scaled_image_format(enum pipe_format format)
-{
-	switch (format) {
-	case PIPE_FORMAT_A8_UNORM:
-	case PIPE_FORMAT_L8_UNORM:
-	case PIPE_FORMAT_I8_UNORM:
-		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_Y8;
-	case PIPE_FORMAT_B5G5R5A1_UNORM:
-		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5;
-	case PIPE_FORMAT_B8G8R8A8_UNORM:
-		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8;
-	case PIPE_FORMAT_B8G8R8X8_UNORM:
-		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8;
-	case PIPE_FORMAT_B5G6R5_UNORM:
-	case PIPE_FORMAT_R16_SNORM:
-	case PIPE_FORMAT_L8A8_UNORM:
-		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5;
-	default:
-		return -1;
-	}
-}
-
-static INLINE unsigned
-nv04_swizzle_bits_square(unsigned x, unsigned y)
-{
-	unsigned u = (x & 0x001) << 0 |
-	             (x & 0x002) << 1 |
-	             (x & 0x004) << 2 |
-	             (x & 0x008) << 3 |
-	             (x & 0x010) << 4 |
-	             (x & 0x020) << 5 |
-	             (x & 0x040) << 6 |
-	             (x & 0x080) << 7 |
-	             (x & 0x100) << 8 |
-	             (x & 0x200) << 9 |
-	             (x & 0x400) << 10 |
-	             (x & 0x800) << 11;
-
-	unsigned v = (y & 0x001) << 1 |
-	             (y & 0x002) << 2 |
-	             (y & 0x004) << 3 |
-	             (y & 0x008) << 4 |
-	             (y & 0x010) << 5 |
-	             (y & 0x020) << 6 |
-	             (y & 0x040) << 7 |
-	             (y & 0x080) << 8 |
-	             (y & 0x100) << 9 |
-	             (y & 0x200) << 10 |
-	             (y & 0x400) << 11 |
-	             (y & 0x800) << 12;
-	return v | u;
-}
-
-/* rectangular swizzled textures are linear concatenations of swizzled square tiles */
-static INLINE unsigned
-nv04_swizzle_bits(unsigned x, unsigned y, unsigned w, unsigned h)
-{
-	unsigned s = MIN2(w, h);
-	unsigned m = s - 1;
-	return (((x | y) & ~m) * s) | nv04_swizzle_bits_square(x & m, y & m);
-}
-
-static int
-nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
-			  struct pipe_surface *dst, int dx, int dy,
-			  struct pipe_surface *src, int sx, int sy,
-			  int w, int h)
-{
-	struct nouveau_channel *chan = ctx->swzsurf->channel;
-	struct nouveau_grobj *swzsurf = ctx->swzsurf;
-	struct nouveau_grobj *sifm = ctx->sifm;
-	struct nouveau_bo *src_bo = ctx->buf(src);
-	struct nouveau_bo *dst_bo = ctx->buf(dst);
-	const unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
-        /* Max width & height may not be the same on all HW, but must be POT */
-	const unsigned max_w = 1024;
-	const unsigned max_h = 1024;
-	unsigned sub_w = w > max_w ? max_w : w;
-	unsigned sub_h = h > max_h ? max_h : h;
-	unsigned x;
-	unsigned y;
-
-        /* Swizzled surfaces must be POT  */
-	assert(util_is_pot(dst->width) && util_is_pot(dst->height));
-
-        /* If area is too large to copy in one shot we must copy it in POT chunks to meet alignment requirements */
-	assert(sub_w == w || util_is_pot(sub_w));
-	assert(sub_h == h || util_is_pot(sub_h));
-
-	MARK_RING (chan, 8 + ((w+sub_w)/sub_w)*((h+sub_h)/sub_h)*17, 2 +
-			 ((w+sub_w)/sub_w)*((h+sub_h)/sub_h)*2);
-
-	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1);
-	OUT_RELOCo(chan, dst_bo,
-	                 NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-
-	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_FORMAT, 1);
-	OUT_RING  (chan, nv04_surface_format(dst->format) |
-	                 log2i(dst->width) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT |
-	                 log2i(dst->height) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT);
-
-	BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1);
-	OUT_RELOCo(chan, src_bo,
-	                 NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
-	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1);
-	OUT_RING  (chan, swzsurf->handle);
-
-	for (y = 0; y < h; y += sub_h) {
-	  sub_h = MIN2(sub_h, h - y);
-
-	  for (x = 0; x < w; x += sub_w) {
-	    sub_w = MIN2(sub_w, w - x);
-
-	    assert(!(dst->offset & 63));
-
-	    BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
-	    OUT_RELOCl(chan, dst_bo, dst->offset,
-                             NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-
-	    BEGIN_RING(chan, sifm, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
-	    OUT_RING  (chan, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
-	    OUT_RING  (chan, nv04_scaled_image_format(src->format));
-	    OUT_RING  (chan, NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
-	    OUT_RING  (chan, (x + dx) | ((y + dy) << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT));
-	    OUT_RING  (chan, sub_h << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | sub_w);
-	    OUT_RING  (chan, (x + dx) | ((y + dy) << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT));
-	    OUT_RING  (chan, sub_h << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | sub_w);
-	    OUT_RING  (chan, 1 << 20);
-	    OUT_RING  (chan, 1 << 20);
-
-	    BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_SIZE, 4);
-	    OUT_RING  (chan, sub_h << NV03_SCALED_IMAGE_FROM_MEMORY_SIZE_H_SHIFT | sub_w);
-	    OUT_RING  (chan, src_pitch |
-			     NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER |
-			     NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE);
-	    OUT_RELOCl(chan, src_bo, src->offset + (sy+y) * src_pitch + (sx+x) * util_format_get_blocksize(src->texture->format),
-                             NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
-	    OUT_RING  (chan, 0);
-	  }
-	}
-
-	return 0;
-}
-
-static int
-nv04_surface_copy_m2mf(struct nv04_surface_2d *ctx,
-		       struct pipe_surface *dst, int dx, int dy,
-		       struct pipe_surface *src, int sx, int sy, int w, int h)
-{
-	struct nouveau_channel *chan = ctx->m2mf->channel;
-	struct nouveau_grobj *m2mf = ctx->m2mf;
-	struct nouveau_bo *src_bo = ctx->buf(src);
-	struct nouveau_bo *dst_bo = ctx->buf(dst);
-	unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
-	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
-	unsigned dst_offset = dst->offset + dy * dst_pitch +
-	                      dx * util_format_get_blocksize(dst->texture->format);
-	unsigned src_offset = src->offset + sy * src_pitch +
-	                      sx * util_format_get_blocksize(src->texture->format);
-
-	MARK_RING (chan, 3 + ((h / 2047) + 1) * 9, 2 + ((h / 2047) + 1) * 2);
-	BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2);
-	OUT_RELOCo(chan, src_bo,
-		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
-	OUT_RELOCo(chan, dst_bo,
-		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-
-	while (h) {
-		int count = (h > 2047) ? 2047 : h;
-
-		BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
-		OUT_RELOCl(chan, src_bo, src_offset,
-			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-		OUT_RELOCl(chan, dst_bo, dst_offset,
-			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR);
-		OUT_RING  (chan, src_pitch);
-		OUT_RING  (chan, dst_pitch);
-		OUT_RING  (chan, w * util_format_get_blocksize(src->texture->format));
-		OUT_RING  (chan, count);
-		OUT_RING  (chan, 0x0101);
-		OUT_RING  (chan, 0);
-
-		h -= count;
-		src_offset += src_pitch * count;
-		dst_offset += dst_pitch * count;
-	}
-
-	return 0;
-}
-
-static int
-nv04_surface_copy_blit(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
-		       int dx, int dy, struct pipe_surface *src, int sx, int sy,
-		       int w, int h)
-{
-	struct nouveau_channel *chan = ctx->surf2d->channel;
-	struct nouveau_grobj *surf2d = ctx->surf2d;
-	struct nouveau_grobj *blit = ctx->blit;
-	struct nouveau_bo *src_bo = ctx->buf(src);
-	struct nouveau_bo *dst_bo = ctx->buf(dst);
-	unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
-	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
-	int format;
-
-	format = nv04_surface_format(dst->format);
-	if (format < 0)
-		return 1;
-
-	MARK_RING (chan, 12, 4);
-	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
-	OUT_RELOCo(chan, src_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
-	OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
-	OUT_RING  (chan, format);
-	OUT_RING  (chan, (dst_pitch << 16) | src_pitch);
-	OUT_RELOCl(chan, src_bo, src->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
-	OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-
-	BEGIN_RING(chan, blit, 0x0300, 3);
-	OUT_RING  (chan, (sy << 16) | sx);
-	OUT_RING  (chan, (dy << 16) | dx);
-	OUT_RING  (chan, ( h << 16) |  w);
-
-	return 0;
-}
-
-static void
-nv04_surface_copy(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
-		  int dx, int dy, struct pipe_surface *src, int sx, int sy,
-		  int w, int h)
-{
-	int src_linear = src->texture->flags & NVFX_RESOURCE_FLAG_LINEAR;
-	int dst_linear = dst->texture->flags & NVFX_RESOURCE_FLAG_LINEAR;
-
-	assert(src->format == dst->format);
-
-	/* Setup transfer to swizzle the texture to vram if needed */
-        if (src_linear && !dst_linear && w > 1 && h > 1) {
-           nv04_surface_copy_swizzle(ctx, dst, dx, dy, src, sx, sy, w, h);
-           return;
-        }
-
-        /* Use M2MF instead of the blitter since it always works
-         * Any possible performance drop is likely to be not very significant
-         * and dwarfed anyway by the current buffer management problems
-         */
-        nv04_surface_copy_m2mf(ctx, dst, dx, dy, src, sx, sy, w, h);
-}
-
-static void
-nv04_surface_fill(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
-		  int dx, int dy, int w, int h, unsigned value)
-{
-	struct nouveau_channel *chan = ctx->surf2d->channel;
-	struct nouveau_grobj *surf2d = ctx->surf2d;
-	struct nouveau_grobj *rect = ctx->rect;
-	struct nouveau_bo *dst_bo = ctx->buf(dst);
-	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
-	int cs2d_format, gdirect_format;
-
-	cs2d_format = nv04_surface_format(dst->format);
-	assert(cs2d_format >= 0);
-
-	gdirect_format = nv04_rect_format(dst->format);
-	assert(gdirect_format >= 0);
-
-	MARK_RING (chan, 16, 4);
-	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
-	OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
-	OUT_RING  (chan, cs2d_format);
-	OUT_RING  (chan, (dst_pitch << 16) | dst_pitch);
-	OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-
-	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT, 1);
-	OUT_RING  (chan, gdirect_format);
-	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR1_A, 1);
-	OUT_RING  (chan, value);
-	BEGIN_RING(chan, rect,
-		   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(0), 2);
-	OUT_RING  (chan, (dx << 16) | dy);
-	OUT_RING  (chan, ( w << 16) |  h);
-}
-
-void
-nv04_surface_2d_takedown(struct nv04_surface_2d **pctx)
-{
-	struct nv04_surface_2d *ctx;
-
-	if (!pctx || !*pctx)
-		return;
-	ctx = *pctx;
-	*pctx = NULL;
-
-	nouveau_notifier_free(&ctx->ntfy);
-	nouveau_grobj_free(&ctx->m2mf);
-	nouveau_grobj_free(&ctx->surf2d);
-	nouveau_grobj_free(&ctx->swzsurf);
-	nouveau_grobj_free(&ctx->rect);
-	nouveau_grobj_free(&ctx->blit);
-	nouveau_grobj_free(&ctx->sifm);
-
-	FREE(ctx);
-}
-
-struct nv04_surface_2d *
-nv04_surface_2d_init(struct nouveau_screen *screen)
-{
-	struct nv04_surface_2d *ctx = CALLOC_STRUCT(nv04_surface_2d);
-	struct nouveau_channel *chan = screen->channel;
-	unsigned handle = 0x88000000, class;
-	int ret;
-
-	if (!ctx)
-		return NULL;
-
-	ret = nouveau_notifier_alloc(chan, handle++, 1, &ctx->ntfy);
-	if (ret) {
-		nv04_surface_2d_takedown(&ctx);
-		return NULL;
-	}
-
-	ret = nouveau_grobj_alloc(chan, handle++, 0x0039, &ctx->m2mf);
-	if (ret) {
-		nv04_surface_2d_takedown(&ctx);
-		return NULL;
-	}
-
-	BEGIN_RING(chan, ctx->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 1);
-	OUT_RING  (chan, ctx->ntfy->handle);
-
-	if (chan->device->chipset < 0x10)
-		class = NV04_CONTEXT_SURFACES_2D;
-	else
-		class = NV10_CONTEXT_SURFACES_2D;
-
-	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->surf2d);
-	if (ret) {
-		nv04_surface_2d_takedown(&ctx);
-		return NULL;
-	}
-
-	BEGIN_RING(chan, ctx->surf2d,
-			 NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
-	OUT_RING  (chan, chan->vram->handle);
-	OUT_RING  (chan, chan->vram->handle);
-
-	if (chan->device->chipset < 0x10)
-		class = NV04_IMAGE_BLIT;
-	else
-		class = NV12_IMAGE_BLIT;
-
-	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->blit);
-	if (ret) {
-		nv04_surface_2d_takedown(&ctx);
-		return NULL;
-	}
-
-	BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_DMA_NOTIFY, 1);
-	OUT_RING  (chan, ctx->ntfy->handle);
-	BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_SURFACE, 1);
-	OUT_RING  (chan, ctx->surf2d->handle);
-	BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_OPERATION, 1);
-	OUT_RING  (chan, NV01_IMAGE_BLIT_OPERATION_SRCCOPY);
-
-	ret = nouveau_grobj_alloc(chan, handle++, NV04_GDI_RECTANGLE_TEXT,
-				  &ctx->rect);
-	if (ret) {
-		nv04_surface_2d_takedown(&ctx);
-		return NULL;
-	}
-
-	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY, 1);
-	OUT_RING  (chan, ctx->ntfy->handle);
-	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_SURFACE, 1);
-	OUT_RING  (chan, ctx->surf2d->handle);
-	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_OPERATION, 1);
-	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY);
-	BEGIN_RING(chan, ctx->rect,
-			 NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT, 1);
-	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE);
-
-	switch (chan->device->chipset & 0xf0) {
-	case 0x00:
-	case 0x10:
-		class = NV04_SWIZZLED_SURFACE;
-		break;
-	case 0x20:
-		class = NV20_SWIZZLED_SURFACE;
-		break;
-	case 0x30:
-		class = NV30_SWIZZLED_SURFACE;
-		break;
-	case 0x40:
-	case 0x60:
-		class = NV40_SWIZZLED_SURFACE;
-		break;
-	default:
-		/* Famous last words: this really can't happen.. */
-		assert(0);
-		break;
-	}
-
-	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->swzsurf);
-	if (ret) {
-		nv04_surface_2d_takedown(&ctx);
-		return NULL;
-	}
-
-	switch (chan->device->chipset & 0xf0) {
-	case 0x10:
-	case 0x20:
-		class = NV10_SCALED_IMAGE_FROM_MEMORY;
-		break;
-	case 0x30:
-		class = NV30_SCALED_IMAGE_FROM_MEMORY;
-		break;
-	case 0x40:
-	case 0x60:
-		class = NV40_SCALED_IMAGE_FROM_MEMORY;
-		break;
-	default:
-		class = NV04_SCALED_IMAGE_FROM_MEMORY;
-		break;
-	}
-
-	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->sifm);
-	if (ret) {
-		nv04_surface_2d_takedown(&ctx);
-		return NULL;
-	}
-
-	ctx->copy = nv04_surface_copy;
-	ctx->fill = nv04_surface_fill;
-	return ctx;
-}
-
-struct nv04_surface*
-nv04_surface_wrap_for_render(struct pipe_screen *pscreen,
-			     struct nv04_surface_2d* eng2d, struct nv04_surface* ns)
-{
-	struct pipe_resource templ;
-	struct pipe_resource* temp_tex;
-	struct nv04_surface* temp_ns;
-	int temp_flags;
-
-	temp_flags = ns->base.usage;
-
-	ns->base.usage = 0;
-
-	memset(&templ, 0, sizeof(templ));
-	templ.format = ns->base.texture->format;
-	templ.target = PIPE_TEXTURE_2D;
-	templ.width0 = ns->base.width;
-	templ.height0 = ns->base.height;
-	templ.depth0 = 1;
-	templ.last_level = 0;
-
-	// TODO: this is probably wrong and we should specifically handle multisampling somehow once it is implemented
-	templ.nr_samples = ns->base.texture->nr_samples;
-
-	templ.bind = ns->base.texture->bind | PIPE_BIND_RENDER_TARGET;
-
-	temp_tex = pscreen->resource_create(pscreen, &templ);
-	temp_ns = (struct nv04_surface*)pscreen->get_tex_surface(pscreen, temp_tex, 0, 0, 0, temp_flags);
-	temp_ns->backing = ns;
-
-	if(1) /* hmm */
-		eng2d->copy(eng2d, &temp_ns->backing->base,
-			    0, 0, &ns->base,
-			    0, 0, ns->base.width, ns->base.height);
-
-	return temp_ns;
-}
diff --git a/src/gallium/drivers/nvfx/nv04_surface_2d.h b/src/gallium/drivers/nvfx/nv04_surface_2d.h
deleted file mode 100644
index 2123c3ed08..0000000000
--- a/src/gallium/drivers/nvfx/nv04_surface_2d.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef __NV04_SURFACE_2D_H__
-#define __NV04_SURFACE_2D_H__
-
-#include "pipe/p_state.h"
-
-struct nouveau_screen;
-
-struct nv04_surface {
-	struct pipe_surface base;
-	unsigned pitch;
-	struct nv04_surface* backing;
-};
-
-struct nv04_surface_2d {
-	struct nouveau_notifier *ntfy;
-	struct nouveau_grobj *surf2d;
-	struct nouveau_grobj *swzsurf;
-	struct nouveau_grobj *m2mf;
-	struct nouveau_grobj *rect;
-	struct nouveau_grobj *blit;
-	struct nouveau_grobj *sifm;
-
-	struct nouveau_bo *(*buf)(struct pipe_surface *);
-
-	void (*copy)(struct nv04_surface_2d *, struct pipe_surface *dst,
-		     int dx, int dy, struct pipe_surface *src, int sx, int sy,
-		     int w, int h);
-	void (*fill)(struct nv04_surface_2d *, struct pipe_surface *dst,
-		     int dx, int dy, int w, int h, unsigned value);
-};
-
-struct nv04_surface_2d *
-nv04_surface_2d_init(struct nouveau_screen *screen);
-
-void
-nv04_surface_2d_takedown(struct nv04_surface_2d **);
-
-struct nv04_surface*
-nv04_surface_wrap_for_render(struct pipe_screen *pscreen, struct nv04_surface_2d* eng2d, struct nv04_surface* ns);
-
-#define NVFX_RESOURCE_FLAG_LINEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
-
-#endif
diff --git a/src/gallium/drivers/nvfx/nv30_fragtex.c b/src/gallium/drivers/nvfx/nv30_fragtex.c
index dec073ac90..0c3d43fd57 100644
--- a/src/gallium/drivers/nvfx/nv30_fragtex.c
+++ b/src/gallium/drivers/nvfx/nv30_fragtex.c
@@ -1,7 +1,6 @@
 #include "util/u_format.h"
 
 #include "nvfx_context.h"
-#include "nouveau/nouveau_util.h"
 #include "nvfx_tex.h"
 #include "nvfx_resource.h"
 
@@ -10,138 +9,109 @@ nv30_sampler_state_init(struct pipe_context *pipe,
 			  struct nvfx_sampler_state *ps,
 			  const struct pipe_sampler_state *cso)
 {
-	if (cso->max_anisotropy >= 8) {
-		ps->en |= NV34TCL_TX_ENABLE_ANISO_8X;
-	} else
-	if (cso->max_anisotropy >= 4) {
-		ps->en |= NV34TCL_TX_ENABLE_ANISO_4X;
-	} else
-	if (cso->max_anisotropy >= 2) {
-		ps->en |= NV34TCL_TX_ENABLE_ANISO_2X;
-	}
+	float limit;
 
+	if (cso->max_anisotropy >= 2)
 	{
-		float limit;
+		if (cso->max_anisotropy >= 8)
+			ps->en |= NV34TCL_TX_ENABLE_ANISO_8X;
+		else if (cso->max_anisotropy >= 4)
+			ps->en |= NV34TCL_TX_ENABLE_ANISO_4X;
+		else if (cso->max_anisotropy >= 2)
+			ps->en |= NV34TCL_TX_ENABLE_ANISO_2X;
+	}
 
-		limit = CLAMP(cso->lod_bias, -16.0, 15.0);
-		ps->filt |= (int)(cso->lod_bias * 256.0) & 0x1fff;
+	limit = CLAMP(cso->lod_bias, -16.0, 15.0 + (255.0 / 256.0));
+	ps->filt |= (int)(limit * 256.0) & 0x1fff; /* clamped bias scaled by 256, masked to 13 bits */
 
-		limit = CLAMP(cso->max_lod, 0.0, 15.0);
-		ps->en |= (int)(limit) << 14 /*NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT*/;
+	ps->max_lod = (int)CLAMP(cso->max_lod, 0.0, 15.0);
+	ps->min_lod = (int)CLAMP(cso->min_lod, 0.0, 15.0);
 
-		limit = CLAMP(cso->min_lod, 0.0, 15.0);
-		ps->en |= (int)(limit) << 26 /*NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT*/;
-	}
+	ps->en |= NV34TCL_TX_ENABLE_ENABLE;
 }
 
-#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w)                        \
-{                                                                              \
-  TRUE,                                                                        \
-  PIPE_FORMAT_##m,                                                             \
-  NV34TCL_TX_FORMAT_FORMAT_##tf,                                               \
-  (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y |           \
-   NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w |           \
-   NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y |           \
-   NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w)            \
-}
-
-struct nv30_texture_format {
-	boolean defined;
-	uint	pipe;
-	int     format;
-	int     swizzle;
-};
-
-static struct nv30_texture_format
-nv30_texture_formats[] = {
-	_(B8G8R8X8_UNORM, A8R8G8B8,   S1,   S1,   S1,  ONE, X, Y, Z, W),
-	_(B8G8R8A8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W),
-	_(B5G5R5A1_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W),
-	_(B4G4R4A4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W),
-	_(B5G6R5_UNORM  , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W),
-	_(L8_UNORM      , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X),
-	_(A8_UNORM      , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X),
-	_(I8_UNORM      , L8      ,   S1,   S1,   S1,   S1, X, X, X, X),
-	_(L8A8_UNORM    , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y),
-	_(Z16_UNORM     , R5G6B5  ,   S1,   S1,   S1,  ONE, X, X, X, X),
-	_(S8_USCALED_Z24_UNORM   , A8R8G8B8,   S1,   S1,   S1,  ONE, X, X, X, X),
-	_(DXT1_RGB      , DXT1    ,   S1,   S1,   S1,  ONE, X, Y, Z, W),
-	_(DXT1_RGBA     , DXT1    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
-	_(DXT3_RGBA     , DXT3    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
-	_(DXT5_RGBA     , DXT5    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
-	{},
-};
-
-static struct nv30_texture_format *
-nv30_fragtex_format(uint pipe_format)
+void
+nv30_sampler_view_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_view *sv)
 {
-	struct nv30_texture_format *tf = nv30_texture_formats;
-
-	while (tf->defined) {
-		if (tf->pipe == pipe_format)
-			return tf;
-		tf++;
-	}
-
-	NOUVEAU_ERR("unknown texture format %s\n", util_format_name(pipe_format));
-	return NULL;
+	struct pipe_resource* pt = sv->base.texture;
+	struct nvfx_texture_format *tf = &nvfx_texture_formats[sv->base.format];
+	unsigned txf;
+	unsigned level = pt->target == PIPE_TEXTURE_CUBE ? 0 : sv->base.first_level;
+
+	assert(tf->fmt[0] >= 0);
+
+	txf = sv->u.init_fmt;
+	txf |= (level != sv->base.last_level ? NV34TCL_TX_FORMAT_MIPMAP : 0);
+	txf |= util_logbase2(u_minify(pt->width0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT;
+	txf |= util_logbase2(u_minify(pt->height0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT;
+	txf |= util_logbase2(u_minify(pt->depth0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT;
+	txf |=  0x10000;
+
+	sv->u.nv30.fmt[0] = tf->fmt[0] | txf;
+	sv->u.nv30.fmt[1] = tf->fmt[1] | txf;
+	sv->u.nv30.fmt[2] = tf->fmt[2] | txf;
+	sv->u.nv30.fmt[3] = tf->fmt[3] | txf;
+
+	sv->swizzle  |= (nvfx_subresource_pitch(pt, 0) << NV34TCL_TX_SWIZZLE_RECT_PITCH_SHIFT);
+
+	if(pt->height0 <= 1 || util_format_is_compressed(sv->base.format))
+		sv->u.nv30.rect = -1;
+	else
+		sv->u.nv30.rect = !!(pt->flags & NVFX_RESOURCE_FLAG_LINEAR);
+
+	sv->lod_offset = sv->base.first_level - level;
+	sv->max_lod_limit = sv->base.last_level - level;
 }
 
-
 void
 nv30_fragtex_set(struct nvfx_context *nvfx, int unit)
 {
 	struct nvfx_sampler_state *ps = nvfx->tex_sampler[unit];
-	struct nvfx_miptree *nv30mt = (struct nvfx_miptree *)nvfx->fragment_sampler_views[unit]->texture;
-	struct pipe_resource *pt = &nv30mt->base.base;
-	struct nouveau_bo *bo = nv30mt->base.bo;
-	struct nv30_texture_format *tf;
+	struct nvfx_sampler_view* sv = (struct nvfx_sampler_view*)nvfx->fragment_sampler_views[unit];
+	struct nouveau_bo *bo = ((struct nvfx_miptree *)sv->base.texture)->base.bo;
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
-	uint32_t txf, txs;
+	unsigned txf;
 	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+	unsigned use_rect;
+	unsigned max_lod = MIN2(ps->max_lod + sv->lod_offset, sv->max_lod_limit);
+	unsigned min_lod = MIN2(ps->min_lod + sv->lod_offset, max_lod) ;
 
-	tf = nv30_fragtex_format(pt->format);
-	if (!tf)
-		return;
-
-	txf  = tf->format;
-	txf |= ((pt->last_level>0) ? NV34TCL_TX_FORMAT_MIPMAP : 0);
-	txf |= log2i(pt->width0) << NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT;
-	txf |= log2i(pt->height0) << NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT;
-	txf |= log2i(pt->depth0) << NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT;
-	txf |= NV34TCL_TX_FORMAT_NO_BORDER | 0x10000;
-
-	switch (pt->target) {
-	case PIPE_TEXTURE_CUBE:
-		txf |= NV34TCL_TX_FORMAT_CUBIC;
-		/* fall-through */
-	case PIPE_TEXTURE_2D:
-		txf |= NV34TCL_TX_FORMAT_DIMS_2D;
-		break;
-	case PIPE_TEXTURE_3D:
-		txf |= NV34TCL_TX_FORMAT_DIMS_3D;
-		break;
-	case PIPE_TEXTURE_1D:
-		txf |= NV34TCL_TX_FORMAT_DIMS_1D;
-		break;
-	default:
-		NOUVEAU_ERR("Unknown target %d\n", pt->target);
-		return;
+	if(sv->u.nv30.rect < 0)
+	{
+		/* in the case of compressed or 1D textures, we can get away with this,
+		 * since the layout is the same
+		 */
+		use_rect = ps->fmt;
+	}
+	else
+	{
+		static boolean warned = FALSE;
+		if( !!ps->fmt != sv->u.nv30.rect && !warned) {
+			warned = TRUE;
+			fprintf(stderr,
+					"Unimplemented: coordinate normalization mismatch. Possible reasons:\n"
+					"1. ARB_texture_non_power_of_two is being used despite the fact it isn't supported\n"
+					"2. The state tracker is not using the appropriate coordinate normalization\n"
+					"3. The state tracker is not supported\n");
+		}
+
+		use_rect  = sv->u.nv30.rect;
 	}
 
-	txs = tf->swizzle;
+	txf = sv->u.nv30.fmt[ps->compare + (use_rect ? 2 : 0)];
 
 	MARK_RING(chan, 9, 2);
 	OUT_RING(chan, RING_3D(NV34TCL_TX_OFFSET(unit), 8));
-	OUT_RELOC(chan, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
-	OUT_RELOC(chan, bo, txf, tex_flags | NOUVEAU_BO_OR,
-		      NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
-	OUT_RING(chan, ps->wrap);
-	OUT_RING(chan, NV34TCL_TX_ENABLE_ENABLE | ps->en);
-	OUT_RING(chan, txs);
-	OUT_RING(chan, ps->filt | 0x2000 /*voodoo*/);
-	OUT_RING(chan, (pt->width0 << NV34TCL_TX_NPOT_SIZE_W_SHIFT) |
-		       pt->height0);
+	OUT_RELOC(chan, bo, sv->offset, tex_flags | NOUVEAU_BO_LOW, 0, 0);
+	OUT_RELOC(chan, bo, txf,
+		tex_flags | NOUVEAU_BO_OR,
+		NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
+	OUT_RING(chan, (ps->wrap & sv->wrap_mask) | sv->wrap);
+	OUT_RING(chan, ps->en | (min_lod << NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT) | (max_lod << NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT));
+	OUT_RING(chan, sv->swizzle);
+	OUT_RING(chan, ps->filt | sv->filt);
+	OUT_RING(chan, sv->npot_size);
 	OUT_RING(chan, ps->bcol);
 
 	nvfx->hw_txf[unit] = txf;
diff --git a/src/gallium/drivers/nvfx/nv30_vertprog.h b/src/gallium/drivers/nvfx/nv30_vertprog.h
index ec0444c07f..9a68f5c1fb 100644
--- a/src/gallium/drivers/nvfx/nv30_vertprog.h
+++ b/src/gallium/drivers/nvfx/nv30_vertprog.h
@@ -68,7 +68,7 @@
 #define NV30_VP_INST_DEST_TEMP_ID_SHIFT        16
 #define NV30_VP_INST_DEST_TEMP_ID_MASK        (0x0F << 16)
 #define NV30_VP_INST_COND_UPDATE_ENABLE        (1<<15)
-#define NV30_VP_INST_VEC_DEST_TEMP_MASK      (0xF << 16)
+#define NV30_VP_INST_VEC_DEST_TEMP_MASK      (0x1F << 16)
 #define NV30_VP_INST_COND_TEST_ENABLE        (1<<14)
 #define NV30_VP_INST_COND_SHIFT          11
 #define NV30_VP_INST_COND_MASK          (0x07 << 11)
@@ -111,7 +111,7 @@
 #define NV30_VP_INST_SRC2H_SHIFT        0    /*NV20*/
 #define NV30_VP_INST_SRC2H_MASK          (0x7FF << 0)  /* NV30_VP_SRC2_HIGH_MASK >> 4*/
 #define NV30_VP_INST_IADDR_SHIFT        2
-#define NV30_VP_INST_IADDR_MASK          (0xF <<  28)   /* NV30_VP_SRC2_LOW_MASK << 28 */
+#define NV30_VP_INST_IADDR_MASK          (0x1FF <<  2)   /* 9-bit field at bits 10:2, matching NV30_VP_INST_IADDR_SHIFT */
 
 /* DWORD 3 */
 #define NV30_VP_INST_SRC2L_SHIFT        28    /*NV20*/
@@ -125,7 +125,7 @@
 #define NV30_VP_INST_VDEST_WRITEMASK_SHIFT      12    /*NV20*/
 #define NV30_VP_INST_VDEST_WRITEMASK_MASK      (0x0F << 12)  /*NV20*/
 #define NV30_VP_INST_DEST_SHIFT        2
-#define NV30_VP_INST_DEST_MASK        (0x0F <<  2)
+#define NV30_VP_INST_DEST_MASK        (0x1F <<  2)
 #  define NV30_VP_INST_DEST_POS  0
 #  define NV30_VP_INST_DEST_BFC0  1
 #  define NV30_VP_INST_DEST_BFC1  2
@@ -133,7 +133,8 @@
 #  define NV30_VP_INST_DEST_COL1  4
 #  define NV30_VP_INST_DEST_FOGC  5
 #  define NV30_VP_INST_DEST_PSZ   6
-#  define NV30_VP_INST_DEST_TC(n)  (8+n)
+#  define NV30_VP_INST_DEST_TC(n)  (8+(n))
+#  define NV30_VP_INST_DEST_CLP(n) (17 + (n))
 
 /* Useful to split the source selection regs into their pieces */
 #define NV30_VP_SRC0_HIGH_SHIFT                                                6
diff --git a/src/gallium/drivers/nvfx/nv40_fragtex.c b/src/gallium/drivers/nvfx/nv40_fragtex.c
index 0068b1ba54..106ce71a07 100644
--- a/src/gallium/drivers/nvfx/nv40_fragtex.c
+++ b/src/gallium/drivers/nvfx/nv40_fragtex.c
@@ -8,168 +8,97 @@ nv40_sampler_state_init(struct pipe_context *pipe,
 			  struct nvfx_sampler_state *ps,
 			  const struct pipe_sampler_state *cso)
 {
+	float limit;
 	if (cso->max_anisotropy >= 2) {
 		/* no idea, binary driver sets it, works without it.. meh.. */
 		ps->wrap |= (1 << 5);
 
-		if (cso->max_anisotropy >= 16) {
+		if (cso->max_anisotropy >= 16)
 			ps->en |= NV40TCL_TEX_ENABLE_ANISO_16X;
-		} else
-		if (cso->max_anisotropy >= 12) {
+		else if (cso->max_anisotropy >= 12)
 			ps->en |= NV40TCL_TEX_ENABLE_ANISO_12X;
-		} else
-		if (cso->max_anisotropy >= 10) {
+		else if (cso->max_anisotropy >= 10)
 			ps->en |= NV40TCL_TEX_ENABLE_ANISO_10X;
-		} else
-		if (cso->max_anisotropy >= 8) {
+		else if (cso->max_anisotropy >= 8)
 			ps->en |= NV40TCL_TEX_ENABLE_ANISO_8X;
-		} else
-		if (cso->max_anisotropy >= 6) {
+		else if (cso->max_anisotropy >= 6)
 			ps->en |= NV40TCL_TEX_ENABLE_ANISO_6X;
-		} else
-		if (cso->max_anisotropy >= 4) {
+		else if (cso->max_anisotropy >= 4)
 			ps->en |= NV40TCL_TEX_ENABLE_ANISO_4X;
-		} else {
+		else
 			ps->en |= NV40TCL_TEX_ENABLE_ANISO_2X;
-		}
 	}
 
-	{
-		float limit;
+	limit = CLAMP(cso->lod_bias, -16.0, 15.0 + (255.0 / 256.0));
+	ps->filt |= (int)(limit * 256.0) & 0x1fff; /* clamped bias scaled by 256, masked to 13 bits */
 
-		limit = CLAMP(cso->lod_bias, -16.0, 15.0);
-		ps->filt |= (int)(cso->lod_bias * 256.0) & 0x1fff;
+	ps->max_lod = (int)(CLAMP(cso->max_lod, 0.0, 15.0 + (255.0 / 256.0)) * 256.0);
+	ps->min_lod = (int)(CLAMP(cso->min_lod, 0.0, 15.0 + (255.0 / 256.0)) * 256.0);
 
-		limit = CLAMP(cso->max_lod, 0.0, 15.0);
-		ps->en |= (int)(limit * 256.0) << 7;
-
-		limit = CLAMP(cso->min_lod, 0.0, 15.0);
-		ps->en |= (int)(limit * 256.0) << 19;
-	}
-}
-
-#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sx,sy,sz,sw)            \
-{                                                                              \
-  TRUE,                                                                        \
-  PIPE_FORMAT_##m,                                                             \
-  NV40TCL_TEX_FORMAT_FORMAT_##tf,                                              \
-  (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y |         \
-   NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w |         \
-   NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y |         \
-   NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w),         \
-  ((NV34TCL_TX_FILTER_SIGNED_RED*sx) | (NV34TCL_TX_FILTER_SIGNED_GREEN*sy) |       \
-   (NV34TCL_TX_FILTER_SIGNED_BLUE*sz) | (NV34TCL_TX_FILTER_SIGNED_ALPHA*sw))       \
+	ps->en |= NV40TCL_TEX_ENABLE_ENABLE;
 }
 
-struct nv40_texture_format {
-	boolean defined;
-	uint	pipe;
-	int     format;
-	int     swizzle;
-	int     sign;
-};
-
-static struct nv40_texture_format
-nv40_texture_formats[] = {
-	_(B8G8R8X8_UNORM, A8R8G8B8,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
-	_(B8G8R8A8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
-	_(B5G5R5A1_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
-	_(B4G4R4A4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
-	_(B5G6R5_UNORM  , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
-	_(L8_UNORM      , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
-	_(A8_UNORM      , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X, 0, 0, 0, 0),
-	_(R16_SNORM     , A16     , ZERO, ZERO,   S1,  ONE, X, X, X, Y, 1, 1, 1, 1),
-	_(I8_UNORM      , L8      ,   S1,   S1,   S1,   S1, X, X, X, X, 0, 0, 0, 0),
-	_(L8A8_UNORM    , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y, 0, 0, 0, 0),
-	_(Z16_UNORM     , Z16     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
-	_(S8_USCALED_Z24_UNORM   , Z24     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
-	_(DXT1_RGB      , DXT1    ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
-	_(DXT1_RGBA     , DXT1    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
-	_(DXT3_RGBA     , DXT3    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
-	_(DXT5_RGBA     , DXT5    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
-	{},
-};
-
-static struct nv40_texture_format *
-nv40_fragtex_format(uint pipe_format)
+void
+nv40_sampler_view_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_view *sv)
 {
-	struct nv40_texture_format *tf = nv40_texture_formats;
-
-	while (tf->defined) {
-		if (tf->pipe == pipe_format)
-			return tf;
-		tf++;
+	struct pipe_resource* pt = sv->base.texture;
+	struct nvfx_miptree* mt = (struct nvfx_miptree*)pt;
+	struct nvfx_texture_format *tf = &nvfx_texture_formats[sv->base.format];
+	unsigned txf;
+	unsigned level = pt->target == PIPE_TEXTURE_CUBE ? 0 : sv->base.first_level;
+	assert(tf->fmt[4] >= 0);
+
+	txf = sv->u.init_fmt;
+	txf |= 0x8000;
+	if(pt->target == PIPE_TEXTURE_CUBE)
+		txf |= ((pt->last_level + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT);
+	else
+		txf |= (((sv->base.last_level - sv->base.first_level) + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT);
+
+	if (!mt->linear_pitch)
+		sv->u.nv40.npot_size2 = 0;
+	else {
+		sv->u.nv40.npot_size2  = mt->linear_pitch;
+		txf |= NV40TCL_TEX_FORMAT_LINEAR;
 	}
 
-	NOUVEAU_ERR("unknown texture format %s\n", util_format_name(pipe_format));
-	return NULL;
-}
+	sv->u.nv40.fmt[0] = tf->fmt[4] | txf;
+	sv->u.nv40.fmt[1] = tf->fmt[5] | txf;
 
+	sv->u.nv40.npot_size2 |= (u_minify(pt->depth0, level) << NV40TCL_TEX_SIZE1_DEPTH_SHIFT);
+
+	sv->lod_offset = (sv->base.first_level - level) * 256;
+	sv->max_lod_limit = (sv->base.last_level - level) * 256;
+}
 
 void
 nv40_fragtex_set(struct nvfx_context *nvfx, int unit)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
 	struct nvfx_sampler_state *ps = nvfx->tex_sampler[unit];
-	struct nvfx_miptree *nv40mt = (struct nvfx_miptree *)nvfx->fragment_sampler_views[unit]->texture;
-	struct nouveau_bo *bo = nv40mt->base.bo;
-	struct pipe_resource *pt = &nv40mt->base.base;
-	struct nv40_texture_format *tf;
-
-	uint32_t txf, txs, txp;
+	struct nvfx_sampler_view* sv = (struct nvfx_sampler_view*)nvfx->fragment_sampler_views[unit];
+	struct nouveau_bo *bo = ((struct nvfx_miptree *)sv->base.texture)->base.bo;
 	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+	unsigned txf;
+	unsigned max_lod = MIN2(ps->max_lod + sv->lod_offset, sv->max_lod_limit);
+	unsigned min_lod = MIN2(ps->min_lod + sv->lod_offset, max_lod);
 
-	tf = nv40_fragtex_format(pt->format);
-	if (!tf)
-		assert(0);
-
-	txf  = ps->fmt;
-	txf |= tf->format | 0x8000;
-	txf |= ((pt->last_level + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT);
-
-	if (1) /* XXX */
-		txf |= NV34TCL_TX_FORMAT_NO_BORDER;
-
-	switch (pt->target) {
-	case PIPE_TEXTURE_CUBE:
-		txf |= NV34TCL_TX_FORMAT_CUBIC;
-		/* fall-through */
-	case PIPE_TEXTURE_2D:
-		txf |= NV34TCL_TX_FORMAT_DIMS_2D;
-		break;
-	case PIPE_TEXTURE_3D:
-		txf |= NV34TCL_TX_FORMAT_DIMS_3D;
-		break;
-	case PIPE_TEXTURE_1D:
-		txf |= NV34TCL_TX_FORMAT_DIMS_1D;
-		break;
-	default:
-		NOUVEAU_ERR("Unknown target %d\n", pt->target);
-		return;
-	}
-
-	if (!(pt->flags & NVFX_RESOURCE_FLAG_LINEAR)) {
-		txp = 0;
-	} else {
-		txp  = nv40mt->level[0].pitch;
-		txf |= NV40TCL_TEX_FORMAT_LINEAR;
-	}
-
-	txs = tf->swizzle;
+	txf = sv->u.nv40.fmt[ps->compare] | ps->fmt;
 
-	MARK_RING(chan, 11 + 2 * !unit, 2);
+	MARK_RING(chan, 11, 2);
 	OUT_RING(chan, RING_3D(NV34TCL_TX_OFFSET(unit), 8));
-	OUT_RELOC(chan, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
+	OUT_RELOC(chan, bo, sv->offset, tex_flags | NOUVEAU_BO_LOW, 0, 0);
 	OUT_RELOC(chan, bo, txf, tex_flags | NOUVEAU_BO_OR,
 			NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
-	OUT_RING(chan, ps->wrap);
-	OUT_RING(chan, NV40TCL_TEX_ENABLE_ENABLE | ps->en);
-	OUT_RING(chan, txs);
-	OUT_RING(chan, ps->filt | tf->sign | 0x2000 /*voodoo*/);
-	OUT_RING(chan, (pt->width0 << NV34TCL_TX_NPOT_SIZE_W_SHIFT) | pt->height0);
+	OUT_RING(chan, (ps->wrap & sv->wrap_mask) | sv->wrap);
+	OUT_RING(chan, ps->en | (min_lod << 19) | (max_lod << 7));
+	OUT_RING(chan, sv->swizzle);
+	OUT_RING(chan, ps->filt | sv->filt);
+	OUT_RING(chan, sv->npot_size);
 	OUT_RING(chan, ps->bcol);
 	OUT_RING(chan, RING_3D(NV40TCL_TEX_SIZE1(unit), 1));
-	OUT_RING(chan, (pt->depth0 << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp);
+	OUT_RING(chan, sv->u.nv40.npot_size2);
 
 	nvfx->hw_txf[unit] = txf;
 	nvfx->hw_samplers |= (1 << unit);
diff --git a/src/gallium/drivers/nvfx/nv40_vertprog.h b/src/gallium/drivers/nvfx/nv40_vertprog.h
index 7337293bab..3d0a1fe3d1 100644
--- a/src/gallium/drivers/nvfx/nv40_vertprog.h
+++ b/src/gallium/drivers/nvfx/nv40_vertprog.h
@@ -44,7 +44,7 @@
 #define NV40_VP_INST_SRC1_ABS                                          (1 << 22)
 #define NV40_VP_INST_SRC0_ABS                                          (1 << 21)
 #define NV40_VP_INST_VEC_DEST_TEMP_SHIFT                                      15
-#define NV40_VP_INST_VEC_DEST_TEMP_MASK                             (0x1F << 15)
+#define NV40_VP_INST_VEC_DEST_TEMP_MASK                             (0x3F << 15)
 #define NV40_VP_INST_COND_TEST_ENABLE                                  (1 << 13)
 #define NV40_VP_INST_COND_SHIFT                                               10
 #define NV40_VP_INST_COND_MASK                                       (0x7 << 10)
@@ -100,7 +100,7 @@
 #define NV40_VP_INST_SRC2H_SHIFT                                               0
 #define NV40_VP_INST_SRC2H_MASK                                      (0x3F << 0)
 #define NV40_VP_INST_IADDRH_SHIFT                                              0
-#define NV40_VP_INST_IADDRH_MASK                                     (0x1F << 0)
+#define NV40_VP_INST_IADDRH_MASK                                     (0x3F << 0)
 
 /* ---- OPCODE BITS 31:0 / data DWORD 3 --- */
 #define NV40_VP_INST_IADDRL_SHIFT                                             29
diff --git a/src/gallium/drivers/nvfx/nvfx_buffer.c b/src/gallium/drivers/nvfx/nvfx_buffer.c
index 05b824b8f7..041099e0e5 100644
--- a/src/gallium/drivers/nvfx/nvfx_buffer.c
+++ b/src/gallium/drivers/nvfx/nvfx_buffer.c
@@ -6,115 +6,39 @@
 #include "nouveau/nouveau_screen.h"
 #include "nouveau/nouveau_winsys.h"
 #include "nvfx_resource.h"
+#include "nvfx_screen.h"
 
-
-/* Currently using separate implementations for buffers and textures,
- * even though gallium has a unified abstraction of these objects.
- * Eventually these should be combined, and mechanisms like transfers
- * be adapted to work for both buffer and texture uploads.
- */
-static void nvfx_buffer_destroy(struct pipe_screen *pscreen,
+void nvfx_buffer_destroy(struct pipe_screen *pscreen,
 				struct pipe_resource *presource)
 {
-	struct nvfx_resource *buffer = nvfx_resource(presource);
+	struct nvfx_buffer *buffer = nvfx_buffer(presource);
 
-	nouveau_screen_bo_release(pscreen, buffer->bo);
+	if(!(buffer->base.base.flags & NVFX_RESOURCE_FLAG_USER))
+		align_free(buffer->data);
+	nouveau_screen_bo_release(pscreen, buffer->base.bo);
 	FREE(buffer);
 }
 
-
-
-
-/* Utility functions for transfer create/destroy are hooked in and
- * just record the arguments to those functions.
- */
-static void *
-nvfx_buffer_transfer_map( struct pipe_context *pipe,
-			  struct pipe_transfer *transfer )
-{
-	struct nvfx_resource *buffer = nvfx_resource(transfer->resource);
-	uint8_t *map;
-
-	map = nouveau_screen_bo_map_range( pipe->screen,
-					   buffer->bo,
-					   transfer->box.x,
-					   transfer->box.width,
-					   nouveau_screen_transfer_flags(transfer->usage) );
-	if (map == NULL)
-		return NULL;
-	
-	return map + transfer->box.x;
-}
-
-
-
-static void nvfx_buffer_transfer_flush_region( struct pipe_context *pipe,
-					       struct pipe_transfer *transfer,
-					       const struct pipe_box *box)
-{
-	struct nvfx_resource *buffer = nvfx_resource(transfer->resource);
-
-	nouveau_screen_bo_map_flush_range(pipe->screen,
-					  buffer->bo,
-					  transfer->box.x + box->x,
-					  box->width);
-}
-
-static void nvfx_buffer_transfer_unmap( struct pipe_context *pipe,
-					struct pipe_transfer *transfer )
-{
-	struct nvfx_resource *buffer = nvfx_resource(transfer->resource);
-
-	nouveau_screen_bo_unmap(pipe->screen, buffer->bo);
-}
-
-
-
-
-struct u_resource_vtbl nvfx_buffer_vtbl = 
-{
-	u_default_resource_get_handle,      /* get_handle */
-	nvfx_buffer_destroy,		     /* resource_destroy */
-	NULL,			    /* is_resource_referenced */
-	u_default_get_transfer,	     /* get_transfer */
-	u_default_transfer_destroy,	     /* transfer_destroy */
-	nvfx_buffer_transfer_map,	     /* transfer_map */
-	nvfx_buffer_transfer_flush_region,  /* transfer_flush_region */
-	nvfx_buffer_transfer_unmap,	     /* transfer_unmap */
-	u_default_transfer_inline_write   /* transfer_inline_write */
-};
-
-
-
 struct pipe_resource *
 nvfx_buffer_create(struct pipe_screen *pscreen,
 		   const struct pipe_resource *template)
 {
-	struct nvfx_resource *buffer;
+	struct nvfx_screen* screen = nvfx_screen(pscreen);
+	struct nvfx_buffer* buffer;
 
-	buffer = CALLOC_STRUCT(nvfx_resource);
+	buffer = CALLOC_STRUCT(nvfx_buffer);
 	if (!buffer)
 		return NULL;
 
-	buffer->base = *template;
-	buffer->vtbl = &nvfx_buffer_vtbl;
-	pipe_reference_init(&buffer->base.reference, 1);
-	buffer->base.screen = pscreen;
-
-	buffer->bo = nouveau_screen_bo_new(pscreen,
-					   16,
-					   buffer->base.usage,
-					   buffer->base.bind,
-					   buffer->base.width0);
-
-	if (buffer->bo == NULL)
-		goto fail;
-
-	return &buffer->base;
+	buffer->base.base = *template;
+	buffer->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+	pipe_reference_init(&buffer->base.base.reference, 1);
+	buffer->base.base.screen = pscreen;
+	buffer->size = util_format_get_stride(template->format, template->width0);
+	buffer->bytes_to_draw_until_static = buffer->size * screen->static_reuse_threshold;
+	buffer->data = align_malloc(buffer->size, 16); /* NOTE(review): result is not checked for NULL before the buffer is returned */
 
-fail:
-	FREE(buffer);
-	return NULL;
+	return &buffer->base.base;
 }
 
 
@@ -124,30 +48,49 @@ nvfx_user_buffer_create(struct pipe_screen *pscreen,
 			unsigned bytes,
 			unsigned usage)
 {
-	struct nvfx_resource *buffer;
+	struct nvfx_screen* screen = nvfx_screen(pscreen);
+	struct nvfx_buffer* buffer;
 
-	buffer = CALLOC_STRUCT(nvfx_resource);
+	buffer = CALLOC_STRUCT(nvfx_buffer);
 	if (!buffer)
 		return NULL;
 
-	pipe_reference_init(&buffer->base.reference, 1);
-	buffer->vtbl = &nvfx_buffer_vtbl;
-	buffer->base.screen = pscreen;
-	buffer->base.format = PIPE_FORMAT_R8_UNORM;
-	buffer->base.usage = PIPE_USAGE_IMMUTABLE;
-	buffer->base.bind = usage;
-	buffer->base.width0 = bytes;
-	buffer->base.height0 = 1;
-	buffer->base.depth0 = 1;
-
-	buffer->bo = nouveau_screen_bo_user(pscreen, ptr, bytes);
-	if (!buffer->bo)
-		goto fail;
-	
-	return &buffer->base;
-
-fail:
-	FREE(buffer);
-	return NULL;
+	pipe_reference_init(&buffer->base.base.reference, 1);
+	buffer->base.base.flags = NVFX_RESOURCE_FLAG_LINEAR | NVFX_RESOURCE_FLAG_USER;
+	buffer->base.base.screen = pscreen;
+	buffer->base.base.format = PIPE_FORMAT_R8_UNORM;
+	buffer->base.base.usage = PIPE_USAGE_IMMUTABLE;
+	buffer->base.base.bind = usage;
+	buffer->base.base.width0 = bytes;
+	buffer->base.base.height0 = 1;
+	buffer->base.base.depth0 = 1;
+	buffer->data = ptr;
+	buffer->size = bytes;
+	buffer->bytes_to_draw_until_static = bytes * screen->static_reuse_threshold;
+	buffer->dirty_end = bytes;
+
+	return &buffer->base.base;
 }
 
+void nvfx_buffer_upload(struct nvfx_buffer* buffer)
+{
+	unsigned dirty = buffer->dirty_end - buffer->dirty_begin;
+	if(!buffer->base.bo)
+	{
+		buffer->base.bo = nouveau_screen_bo_new(buffer->base.base.screen,
+					   16,
+					   buffer->base.base.usage,
+					   buffer->base.base.bind,
+					   buffer->base.base.width0);
+	}
+
+	if(dirty)
+	{
+		// TODO: may want to use a temporary in some cases
+		nouveau_bo_map(buffer->base.bo, NOUVEAU_BO_WR
+				| (buffer->dirty_unsynchronized ? NOUVEAU_BO_NOSYNC : 0));
+		memcpy((uint8_t*)buffer->base.bo->map + buffer->dirty_begin, buffer->data + buffer->dirty_begin, dirty);
+		nouveau_bo_unmap(buffer->base.bo);
+		buffer->dirty_begin = buffer->dirty_end = 0;
+	}
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_context.c b/src/gallium/drivers/nvfx/nvfx_context.c
index 7218abff22..5a2fa14c88 100644
--- a/src/gallium/drivers/nvfx/nvfx_context.c
+++ b/src/gallium/drivers/nvfx/nvfx_context.c
@@ -1,5 +1,6 @@
 #include "draw/draw_context.h"
 #include "pipe/p_defines.h"
+#include "util/u_framebuffer.h"
 
 #include "nvfx_context.h"
 #include "nvfx_screen.h"
@@ -14,6 +15,7 @@ nvfx_flush(struct pipe_context *pipe, unsigned flags,
 	struct nouveau_channel *chan = screen->base.channel;
 	struct nouveau_grobj *eng3d = screen->eng3d;
 
+	/* XXX: we need to actually be intelligent here */
 	if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
 		BEGIN_RING(chan, eng3d, 0x1fd8, 1);
 		OUT_RING  (chan, 2);
@@ -31,8 +33,22 @@ nvfx_destroy(struct pipe_context *pipe)
 {
 	struct nvfx_context *nvfx = nvfx_context(pipe);
 
+	if(nvfx->dummy_fs)
+		pipe->delete_fs_state(pipe, nvfx->dummy_fs);
+
+	for(unsigned i = 0; i < nvfx->vtxbuf_nr; ++i)
+		pipe_resource_reference(&nvfx->vtxbuf[i].buffer, 0);
+	pipe_resource_reference(&nvfx->idxbuf.buffer, 0);
+	util_unreference_framebuffer_state(&nvfx->framebuffer);
+	for(unsigned i = 0; i < PIPE_MAX_SAMPLERS; ++i)
+		pipe_sampler_view_reference(&nvfx->fragment_sampler_views[i], 0);
+
 	if (nvfx->draw)
 		draw_destroy(nvfx->draw);
+
+	if(nvfx->screen->cur_ctx == nvfx)
+		nvfx->screen->cur_ctx = NULL;
+
 	FREE(nvfx);
 }
 
@@ -59,14 +75,21 @@ nvfx_create(struct pipe_screen *pscreen, void *priv)
 	nvfx->pipe.clear = nvfx_clear;
 	nvfx->pipe.flush = nvfx_flush;
 
-	screen->base.channel->user_private = nvfx;
-
 	nvfx->is_nv4x = screen->is_nv4x;
+	/* TODO: it seems that nv30 might have fixed function clipping usable with vertex programs
+	 * However, my code for that doesn't work, so use vp clipping for all cards, which works.
+	 */
+	nvfx->use_vp_clipping = TRUE;
 
 	nvfx_init_query_functions(nvfx);
 	nvfx_init_surface_functions(nvfx);
 	nvfx_init_state_functions(nvfx);
+	nvfx_init_sampling_functions(nvfx);
+	nvfx_init_vbo_functions(nvfx);
+	nvfx_init_fragprog_functions(nvfx);
+	nvfx_init_vertprog_functions(nvfx);
 	nvfx_init_resource_functions(&nvfx->pipe);
+	nvfx_init_transfer_functions(&nvfx->pipe);
 
 	/* Create, configure, and install fallback swtnl path */
 	nvfx->draw = draw_create(&nvfx->pipe);
@@ -78,6 +101,12 @@ nvfx_create(struct pipe_screen *pscreen, void *priv)
 
 	/* set these to that we init them on first validation */
 	nvfx->state.scissor_enabled = ~0;
-	nvfx->state.stipple_enabled = ~0;
+	nvfx->hw_pointsprite_control = -1;
+	nvfx->hw_vp_output = -1;
+	nvfx->use_vertex_buffers = -1;
+	nvfx->relocs_needed = NVFX_RELOCATE_ALL;
+
+	LIST_INITHEAD(&nvfx->render_cache);
+
 	return &nvfx->pipe;
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_context.h b/src/gallium/drivers/nvfx/nvfx_context.h
index 89f94c10bd..4c654bfa8b 100644
--- a/src/gallium/drivers/nvfx/nvfx_context.h
+++ b/src/gallium/drivers/nvfx/nvfx_context.h
@@ -11,8 +11,10 @@
 #include "util/u_memory.h"
 #include "util/u_math.h"
 #include "util/u_inlines.h"
+#include "util/u_double_list.h"
 
 #include "draw/draw_vertex.h"
+#include "util/u_blitter.h"
 
 #include "nouveau/nouveau_winsys.h"
 #include "nouveau/nouveau_gldefs.h"
@@ -42,17 +44,26 @@
 #define NVFX_NEW_SR		(1 << 13)
 #define NVFX_NEW_VERTCONST	(1 << 14)
 #define NVFX_NEW_FRAGCONST	(1 << 15)
+#define NVFX_NEW_INDEX	(1 << 16)
+#define NVFX_NEW_SPRITE  (1 << 17)
+
+#define NVFX_RELOCATE_FRAMEBUFFER (1 << 0)
+#define NVFX_RELOCATE_FRAGTEX (1 << 1)
+#define NVFX_RELOCATE_FRAGPROG (1 << 2)
+#define NVFX_RELOCATE_VTXBUF (1 << 3)
+#define NVFX_RELOCATE_IDXBUF (1 << 4)
+#define NVFX_RELOCATE_ALL 0x1f
 
 struct nvfx_rasterizer_state {
 	struct pipe_rasterizer_state pipe;
 	unsigned sb_len;
-	uint32_t sb[32];
+	uint32_t sb[34];
 };
 
 struct nvfx_zsa_state {
 	struct pipe_depth_stencil_alpha_state pipe;
 	unsigned sb_len;
-	uint32_t sb[26];
+	uint32_t sb[24];
 };
 
 struct nvfx_blend_state {
@@ -64,13 +75,57 @@ struct nvfx_blend_state {
 
 struct nvfx_state {
 	unsigned scissor_enabled;
-	unsigned stipple_enabled;
 	unsigned fp_samplers;
+	unsigned render_temps;
+};
+
+struct nvfx_per_vertex_element {
+	unsigned idx;
+        unsigned vertex_buffer_index;
+        unsigned src_offset;
+};
+
+struct nvfx_low_frequency_element {
+	unsigned idx;
+	unsigned vertex_buffer_index;
+	unsigned src_offset;
+        void (*fetch_rgba_float)(float *dst, const uint8_t *src, unsigned i, unsigned j);
+        unsigned ncomp;
+};
+
+struct nvfx_per_instance_element {
+	struct nvfx_low_frequency_element base;
+	unsigned instance_divisor;
+};
+
+struct nvfx_per_vertex_buffer_info
+{
+	unsigned vertex_buffer_index;
+	unsigned per_vertex_size;
 };
 
 struct nvfx_vtxelt_state {
 	struct pipe_vertex_element pipe[16];
 	unsigned num_elements;
+	unsigned vtxfmt[16];
+
+	unsigned num_per_vertex_buffer_infos;
+	struct nvfx_per_vertex_buffer_info per_vertex_buffer_info[16];
+
+	unsigned num_per_vertex;
+	struct nvfx_per_vertex_element per_vertex[16];
+
+	unsigned num_per_instance;
+	struct nvfx_per_instance_element per_instance[16];
+
+	unsigned num_constant;
+	struct nvfx_low_frequency_element constant[16];
+
+	boolean needs_translate;
+	struct translate* translate;
+
+	unsigned vertex_length;
+	unsigned max_vertices_per_packet;
 };
 
 struct nvfx_render_target {
@@ -86,8 +141,11 @@ struct nvfx_context {
 	struct nvfx_screen *screen;
 
 	unsigned is_nv4x; /* either 0 or ~0 */
+	boolean use_vp_clipping;
 
 	struct draw_context *draw;
+	struct blitter_context* blitter;
+	struct list_head render_cache;
 
 	/* HW state derived from pipe states */
 	struct nvfx_state state;
@@ -111,7 +169,7 @@ struct nvfx_context {
 	unsigned stipple[32];
 	struct pipe_clip_state clip;
 	struct nvfx_vertex_program *vertprog;
-	struct nvfx_fragment_program *fragprog;
+	struct nvfx_pipe_fragment_program *fragprog;
 	struct pipe_resource *constbuf[PIPE_SHADER_TYPES];
 	unsigned constbuf_nr[PIPE_SHADER_TYPES];
 	struct nvfx_rasterizer_state *rasterizer;
@@ -122,23 +180,34 @@ struct nvfx_context {
 	struct pipe_viewport_state viewport;
 	struct pipe_framebuffer_state framebuffer;
 	struct pipe_index_buffer idxbuf;
-	struct pipe_resource *idxbuf_buffer;
-	unsigned idxbuf_format;
 	struct nvfx_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS];
 	struct pipe_sampler_view *fragment_sampler_views[PIPE_MAX_SAMPLERS];
+	struct nvfx_pipe_fragment_program* dummy_fs;
+
 	unsigned nr_samplers;
 	unsigned nr_textures;
 	unsigned dirty_samplers;
 	struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
 	unsigned vtxbuf_nr;
 	struct nvfx_vtxelt_state *vtxelt;
+	int base_vertex;
+	boolean use_index_buffer;
+	/* -1 = hardware input setup is outdated
+	 * 0 = hardware input setup is for inline vertices
+	 * 1 = hardware input setup is for hardware vertices
+	 */
+	int use_vertex_buffers;
 
-	unsigned vbo_bo;
 	unsigned hw_vtxelt_nr;
 	uint8_t hw_samplers;
 	uint32_t hw_txf[8];
 	struct nvfx_render_target hw_rt[4];
 	struct nvfx_render_target hw_zeta;
+	int hw_pointsprite_control;
+	int hw_vp_output;
+	struct nvfx_fragment_program* hw_fragprog;
+
+	unsigned relocs_needed;
 };
 
 static INLINE struct nvfx_context *
@@ -175,15 +244,12 @@ extern void nvfx_clear(struct pipe_context *pipe, unsigned buffers,
 
 /* nvfx_draw.c */
 extern struct draw_stage *nvfx_draw_render_stage(struct nvfx_context *nvfx);
-extern void nvfx_draw_elements_swtnl(struct pipe_context *pipe,
-                                     struct pipe_resource *idxbuf,
-                                     unsigned ib_size, int ib_bias,
-                                     unsigned mode,
-                                     unsigned start, unsigned count);
+extern void nvfx_draw_vbo_swtnl(struct pipe_context *pipe, const struct pipe_draw_info* info);
 extern void nvfx_vtxfmt_validate(struct nvfx_context *nvfx);
 
 /* nvfx_fb.c */
-extern void nvfx_state_framebuffer_validate(struct nvfx_context *nvfx);
+extern int nvfx_framebuffer_prepare(struct nvfx_context *nvfx);
+extern void nvfx_framebuffer_validate(struct nvfx_context *nvfx, unsigned prepare_result);
 void
 nvfx_framebuffer_relocate(struct nvfx_context *nvfx);
 
@@ -191,19 +257,24 @@ nvfx_framebuffer_relocate(struct nvfx_context *nvfx);
 extern void nvfx_fragprog_destroy(struct nvfx_context *,
 				    struct nvfx_fragment_program *);
 extern void nvfx_fragprog_validate(struct nvfx_context *nvfx);
-extern void
-nvfx_fragprog_relocate(struct nvfx_context *nvfx);
+extern void nvfx_fragprog_relocate(struct nvfx_context *nvfx);
+extern void nvfx_init_fragprog_functions(struct nvfx_context *nvfx);
 
 /* nvfx_fragtex.c */
+extern void nvfx_init_sampling_functions(struct nvfx_context *nvfx);
 extern void nvfx_fragtex_validate(struct nvfx_context *nvfx);
-extern void
-nvfx_fragtex_relocate(struct nvfx_context *nvfx);
+extern void nvfx_fragtex_relocate(struct nvfx_context *nvfx);
+
+struct nvfx_sampler_view;
 
 /* nv30_fragtex.c */
 extern void
 nv30_sampler_state_init(struct pipe_context *pipe,
 			  struct nvfx_sampler_state *ps,
 			  const struct pipe_sampler_state *cso);
+extern void
+nv30_sampler_view_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_view *sv);
 extern void nv30_fragtex_set(struct nvfx_context *nvfx, int unit);
 
 /* nv40_fragtex.c */
@@ -211,6 +282,9 @@ extern void
 nv40_sampler_state_init(struct pipe_context *pipe,
 			  struct nvfx_sampler_state *ps,
 			  const struct pipe_sampler_state *cso);
+extern void
+nv40_sampler_view_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_view *sv);
 extern void nv40_fragtex_set(struct nvfx_context *nvfx, int unit);
 
 /* nvfx_state.c */
@@ -225,23 +299,75 @@ extern void nvfx_state_sr_validate(struct nvfx_context *nvfx);
 extern void nvfx_state_zsa_validate(struct nvfx_context *nvfx);
 
 /* nvfx_state_emit.c */
-extern void nvfx_state_relocate(struct nvfx_context *nvfx);
+extern void nvfx_state_relocate(struct nvfx_context *nvfx, unsigned relocs);
 extern boolean nvfx_state_validate(struct nvfx_context *nvfx);
 extern boolean nvfx_state_validate_swtnl(struct nvfx_context *nvfx);
-extern void nvfx_state_emit(struct nvfx_context *nvfx);
+
+static inline void
+nvfx_state_emit(struct nvfx_context *nvfx)
+{
+        unsigned relocs = NVFX_RELOCATE_FRAMEBUFFER | NVFX_RELOCATE_FRAGTEX | NVFX_RELOCATE_FRAGPROG;
+        if (nvfx->render_mode == HW) /* vertex/index buffer groups are candidates only in HW render mode */
+        {
+                relocs |= NVFX_RELOCATE_VTXBUF;
+                if(nvfx->use_index_buffer)
+                        relocs |= NVFX_RELOCATE_IDXBUF;
+        }
+        /* keep only the groups flagged in relocs_needed; nvfx_state_relocate is called only if any remain */
+        relocs &= nvfx->relocs_needed;
+        if(relocs)
+                nvfx_state_relocate(nvfx, relocs);
+}
 
 /* nvfx_transfer.c */
-extern void nvfx_init_transfer_functions(struct nvfx_context *nvfx);
+extern void nvfx_init_transfer_functions(struct pipe_context *pipe);
 
 /* nvfx_vbo.c */
 extern boolean nvfx_vbo_validate(struct nvfx_context *nvfx);
 extern void nvfx_vbo_relocate(struct nvfx_context *nvfx);
+extern void nvfx_idxbuf_validate(struct nvfx_context* nvfx);
+extern void nvfx_idxbuf_relocate(struct nvfx_context* nvfx);
 extern void nvfx_draw_vbo(struct pipe_context *pipe,
                           const struct pipe_draw_info *info);
+extern void nvfx_init_vbo_functions(struct nvfx_context *nvfx);
+extern unsigned nvfx_vertex_formats[];
 
 /* nvfx_vertprog.c */
 extern boolean nvfx_vertprog_validate(struct nvfx_context *nvfx);
 extern void nvfx_vertprog_destroy(struct nvfx_context *,
 				  struct nvfx_vertex_program *);
+extern void nvfx_init_vertprog_functions(struct nvfx_context *nvfx);
+
+/* nvfx_push.c */
+extern void nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info);
+
+/* Emit v[0..ncomp-1] for vertex attribute attrib (ncomp 1..4; other values emit nothing). Caller must WAIT_RING(chan, ncomp + 1) or equivalent beforehand! */
+static inline void nvfx_emit_vtx_attr(struct nouveau_channel* chan, unsigned attrib, const float* v, unsigned ncomp)
+{
+	switch (ncomp) {
+	case 4:
+		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_4F_X(attrib), 4));
+		OUT_RING(chan, fui(v[0]));
+		OUT_RING(chan, fui(v[1]));
+		OUT_RING(chan,  fui(v[2]));
+		OUT_RING(chan,  fui(v[3]));
+		break;
+	case 3:
+		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_3F_X(attrib), 3));
+		OUT_RING(chan,  fui(v[0]));
+		OUT_RING(chan,  fui(v[1]));
+		OUT_RING(chan,  fui(v[2]));
+		break;
+	case 2:
+		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_2F_X(attrib), 2));
+		OUT_RING(chan,  fui(v[0]));
+		OUT_RING(chan,  fui(v[1]));
+		break;
+	case 1:
+		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_1F(attrib), 1));
+		OUT_RING(chan,  fui(v[0]));
+		break;
+	}
+}
 
 #endif
diff --git a/src/gallium/drivers/nvfx/nvfx_draw.c b/src/gallium/drivers/nvfx/nvfx_draw.c
index 22cff370b7..2601d5b8e2 100644
--- a/src/gallium/drivers/nvfx/nvfx_draw.c
+++ b/src/gallium/drivers/nvfx/nvfx_draw.c
@@ -9,6 +9,7 @@
 #include "draw/draw_pipe.h"
 
 #include "nvfx_context.h"
+#include "nvfx_resource.h"
 
 /* Simple, but crappy, swtnl path, hopefully we wont need to hit this very
  * often at all.  Uses "quadro style" vertex submission + a fixed vertex
@@ -39,30 +40,21 @@ nvfx_render_vertex(struct nvfx_context *nvfx, const struct vertex_header *v)
 		unsigned idx = nvfx->swtnl.draw[i];
 		unsigned hw = nvfx->swtnl.hw[i];
 
+		WAIT_RING(chan, 5);
 		switch (nvfx->swtnl.emit[i]) {
 		case EMIT_OMIT:
 			break;
 		case EMIT_1F:
-			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_1F(hw), 1);
-			OUT_RING  (chan, fui(v->data[idx][0]));
+			nvfx_emit_vtx_attr(chan, hw, v->data[idx], 1);
 			break;
 		case EMIT_2F:
-			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_2F_X(hw), 2);
-			OUT_RING  (chan, fui(v->data[idx][0]));
-			OUT_RING  (chan, fui(v->data[idx][1]));
+			nvfx_emit_vtx_attr(chan, hw, v->data[idx], 2);
 			break;
 		case EMIT_3F:
-			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_3F_X(hw), 3);
-			OUT_RING  (chan, fui(v->data[idx][0]));
-			OUT_RING  (chan, fui(v->data[idx][1]));
-			OUT_RING  (chan, fui(v->data[idx][2]));
+			nvfx_emit_vtx_attr(chan, hw, v->data[idx], 3);
 			break;
 		case EMIT_4F:
-			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4);
-			OUT_RING  (chan, fui(v->data[idx][0]));
-			OUT_RING  (chan, fui(v->data[idx][1]));
-			OUT_RING  (chan, fui(v->data[idx][2]));
-			OUT_RING  (chan, fui(v->data[idx][3]));
+			nvfx_emit_vtx_attr(chan, hw, v->data[idx], 4);
 			break;
 		case 0xff:
 			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4);
@@ -231,15 +223,9 @@ nvfx_draw_render_stage(struct nvfx_context *nvfx)
 }
 
 void
-nvfx_draw_elements_swtnl(struct pipe_context *pipe,
-			 struct pipe_resource *idxbuf,
-			 unsigned idxbuf_size, int idxbuf_bias,
-			 unsigned mode, unsigned start, unsigned count)
+nvfx_draw_vbo_swtnl(struct pipe_context *pipe, const struct pipe_draw_info* info)
 {
 	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS];
-	struct pipe_transfer *ib_transfer = NULL;
-	struct pipe_transfer *cb_transfer = NULL;
 	unsigned i;
 	void *map;
 
@@ -247,47 +233,28 @@ nvfx_draw_elements_swtnl(struct pipe_context *pipe,
 		return;
 	nvfx_state_emit(nvfx);
 
+	/* these must be passed without adding the offsets */
 	for (i = 0; i < nvfx->vtxbuf_nr; i++) {
-		map = pipe_buffer_map(pipe, nvfx->vtxbuf[i].buffer,
-                                      PIPE_TRANSFER_READ,
-				      &vb_transfer[i]);
+		map = nvfx_buffer(nvfx->vtxbuf[i].buffer)->data;
 		draw_set_mapped_vertex_buffer(nvfx->draw, i, map);
 	}
 
-	if (idxbuf) {
-		map = pipe_buffer_map(pipe, idxbuf,
-				      PIPE_TRANSFER_READ,
-				      &ib_transfer);
-		draw_set_mapped_element_buffer(nvfx->draw, idxbuf_size, idxbuf_bias, map);
-	} else {
-		draw_set_mapped_element_buffer(nvfx->draw, 0, 0, NULL);
-	}
+	map = NULL;
+	if (info->indexed && nvfx->idxbuf.buffer)
+		map = nvfx_buffer(nvfx->idxbuf.buffer)->data;
+	draw_set_mapped_index_buffer(nvfx->draw, map);
 
 	if (nvfx->constbuf[PIPE_SHADER_VERTEX]) {
 		const unsigned nr = nvfx->constbuf_nr[PIPE_SHADER_VERTEX];
 
-		map = pipe_buffer_map(pipe,
-				      nvfx->constbuf[PIPE_SHADER_VERTEX],
-				      PIPE_TRANSFER_READ,
-				      &cb_transfer);
+		map = nvfx_buffer(nvfx->constbuf[PIPE_SHADER_VERTEX])->data;
 		draw_set_mapped_constant_buffer(nvfx->draw, PIPE_SHADER_VERTEX, 0,
                                                 map, nr);
 	}
 
-	draw_arrays(nvfx->draw, mode, start, count);
-
-	for (i = 0; i < nvfx->vtxbuf_nr; i++)
-		pipe_buffer_unmap(pipe, nvfx->vtxbuf[i].buffer, vb_transfer[i]);
-
-	if (idxbuf)
-		pipe_buffer_unmap(pipe, idxbuf, ib_transfer);
-
-	if (nvfx->constbuf[PIPE_SHADER_VERTEX])
-		pipe_buffer_unmap(pipe, nvfx->constbuf[PIPE_SHADER_VERTEX],
-				  cb_transfer);
+	draw_vbo(nvfx->draw, info);
 
 	draw_flush(nvfx->draw);
-	pipe->flush(pipe, 0, NULL);
 }
 
 static INLINE void
@@ -305,19 +272,19 @@ emit_attrib(struct nvfx_context *nvfx, unsigned hw, unsigned emit,
 void
 nvfx_vtxfmt_validate(struct nvfx_context *nvfx)
 {
-	struct nvfx_fragment_program *fp = nvfx->fragprog;
+	struct nvfx_pipe_fragment_program *pfp = nvfx->fragprog;
 	unsigned colour = 0, texcoords = 0, fog = 0, i;
 
 	/* Determine needed fragprog inputs */
-	for (i = 0; i < fp->info.num_inputs; i++) {
-		switch (fp->info.input_semantic_name[i]) {
+	for (i = 0; i < pfp->info.num_inputs; i++) {
+		switch (pfp->info.input_semantic_name[i]) {
 		case TGSI_SEMANTIC_POSITION:
 			break;
 		case TGSI_SEMANTIC_COLOR:
-			colour |= (1 << fp->info.input_semantic_index[i]);
+			colour |= (1 << pfp->info.input_semantic_index[i]);
 			break;
 		case TGSI_SEMANTIC_GENERIC:
-			texcoords |= (1 << fp->info.input_semantic_index[i]);
+			texcoords |= (1 << pfp->info.input_semantic_index[i]);
 			break;
 		case TGSI_SEMANTIC_FOG:
 			fog = 1;
diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
index ee41f03b9b..275672a31f 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -2,25 +2,31 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
 #include "util/u_inlines.h"
+#include "util/u_debug.h"
 
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_ureg.h"
 
 #include "nvfx_context.h"
 #include "nvfx_shader.h"
+#include "nvfx_resource.h"
 
 #define MAX_CONSTS 128
 #define MAX_IMM 32
+
 struct nvfx_fpc {
+	struct nvfx_pipe_fragment_program* pfp;
 	struct nvfx_fragment_program *fp;
 
-	uint attrib_map[PIPE_MAX_SHADER_INPUTS];
-
-	unsigned r_temps;
-	unsigned r_temps_discard;
-	struct nvfx_sreg r_result[PIPE_MAX_SHADER_OUTPUTS];
-	struct nvfx_sreg *r_temp;
+	unsigned max_temps;
+	unsigned long long r_temps;
+	unsigned long long r_temps_discard;
+	struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS];
+	struct nvfx_reg *r_temp;
+	unsigned sprite_coord_temp;
 
 	int num_regs;
 
@@ -33,34 +39,40 @@ struct nvfx_fpc {
 	} consts[MAX_CONSTS];
 	int nr_consts;
 
-	struct nvfx_sreg imm[MAX_IMM];
+	struct nvfx_reg imm[MAX_IMM];
 	unsigned nr_imm;
+
+	unsigned char generic_to_slot[256]; /* slot index for each GENERIC input semantic index */
+
+	struct util_dynarray if_stack;
+	//struct util_dynarray loop_stack;
+	struct util_dynarray label_relocs;
 };
 
-static INLINE struct nvfx_sreg
+static INLINE struct nvfx_reg
 temp(struct nvfx_fpc *fpc)
 {
-	int idx = ffs(~fpc->r_temps) - 1;
+	int idx = __builtin_ctzll(~fpc->r_temps);
 
-	if (idx < 0) {
+	if (idx >= fpc->max_temps) {
 		NOUVEAU_ERR("out of temps!!\n");
 		assert(0);
-		return nvfx_sr(NVFXSR_TEMP, 0);
+		return nvfx_reg(NVFXSR_TEMP, 0);
 	}
 
-	fpc->r_temps |= (1 << idx);
-	fpc->r_temps_discard |= (1 << idx);
-	return nvfx_sr(NVFXSR_TEMP, idx);
+	fpc->r_temps |= (1ULL << idx);
+	fpc->r_temps_discard |= (1ULL << idx);
+	return nvfx_reg(NVFXSR_TEMP, idx);
 }
 
 static INLINE void
 release_temps(struct nvfx_fpc *fpc)
 {
 	fpc->r_temps &= ~fpc->r_temps_discard;
-	fpc->r_temps_discard = 0;
+	fpc->r_temps_discard = 0ULL;
 }
 
-static INLINE struct nvfx_sreg
+static INLINE struct nvfx_reg
 constant(struct nvfx_fpc *fpc, int pipe, float vals[4])
 {
 	int idx;
@@ -72,16 +84,9 @@ constant(struct nvfx_fpc *fpc, int pipe, float vals[4])
 	fpc->consts[idx].pipe = pipe;
 	if (pipe == -1)
 		memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float));
-	return nvfx_sr(NVFXSR_CONST, idx);
+	return nvfx_reg(NVFXSR_CONST, idx);
 }
 
-#define arith(cc,s,o,d,m,s0,s1,s2) \
-	nvfx_fp_arith((cc), (s), NVFX_FP_OP_OPCODE_##o, \
-			(d), (m), (s0), (s1), (s2))
-#define tex(cc,s,o,u,d,m,s0,s1,s2) \
-	nvfx_fp_tex((cc), (s), NVFX_FP_OP_OPCODE_##o, (u), \
-		    (d), (m), (s0), none, none)
-
 static void
 grow_insns(struct nvfx_fpc *fpc, int size)
 {
@@ -92,23 +97,29 @@ grow_insns(struct nvfx_fpc *fpc, int size)
 }
 
 static void
-emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_sreg src)
+emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_src src)
 {
 	struct nvfx_fragment_program *fp = fpc->fp;
 	uint32_t *hw = &fp->insn[fpc->inst_offset];
 	uint32_t sr = 0;
 
-	switch (src.type) {
+	switch (src.reg.type) {
 	case NVFXSR_INPUT:
 		sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
-		hw[0] |= (src.index << NVFX_FP_OP_INPUT_SRC_SHIFT);
+		hw[0] |= (src.reg.index << NVFX_FP_OP_INPUT_SRC_SHIFT);
 		break;
 	case NVFXSR_OUTPUT:
 		sr |= NVFX_FP_REG_SRC_HALF;
 		/* fall-through */
 	case NVFXSR_TEMP:
 		sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT);
-		sr |= (src.index << NVFX_FP_REG_SRC_SHIFT);
+		sr |= (src.reg.index << NVFX_FP_REG_SRC_SHIFT);
+		break;
+	case NVFXSR_RELOCATED:
+		sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT);
+		sr |= (fpc->sprite_coord_temp << NVFX_FP_REG_SRC_SHIFT);
+		//printf("adding relocation at %x for %x\n", fpc->inst_offset, src.reg.index);
+		util_dynarray_append(&fpc->fp->slot_relocations[src.reg.index], unsigned, fpc->inst_offset + pos + 1);
 		break;
 	case NVFXSR_CONST:
 		if (!fpc->have_const) {
@@ -117,18 +128,18 @@ emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_sreg src)
 		}
 
 		hw = &fp->insn[fpc->inst_offset];
-		if (fpc->consts[src.index].pipe >= 0) {
+		if (fpc->consts[src.reg.index].pipe >= 0) {
 			struct nvfx_fragment_program_data *fpd;
 
 			fp->consts = realloc(fp->consts, ++fp->nr_consts *
 					     sizeof(*fpd));
 			fpd = &fp->consts[fp->nr_consts - 1];
 			fpd->offset = fpc->inst_offset + 4;
-			fpd->index = fpc->consts[src.index].pipe;
+			fpd->index = fpc->consts[src.reg.index].pipe;
 			memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4);
 		} else {
 			memcpy(&fp->insn[fpc->inst_offset + 4],
-				fpc->consts[src.index].vals,
+				fpc->consts[src.reg.index].vals,
 				sizeof(uint32_t) * 4);
 		}
 
@@ -156,7 +167,7 @@ emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_sreg src)
 }
 
 static void
-emit_dst(struct nvfx_fpc *fpc, struct nvfx_sreg dst)
+emit_dst(struct nvfx_fpc *fpc, struct nvfx_reg dst)
 {
 	struct nvfx_fragment_program *fp = fpc->fp;
 	uint32_t *hw = &fp->insn[fpc->inst_offset];
@@ -184,9 +195,7 @@ emit_dst(struct nvfx_fpc *fpc, struct nvfx_sreg dst)
 }
 
 static void
-nvfx_fp_arith(struct nvfx_fpc *fpc, int sat, int op,
-	      struct nvfx_sreg dst, int mask,
-	      struct nvfx_sreg s0, struct nvfx_sreg s1, struct nvfx_sreg s2)
+nvfx_fp_emit(struct nvfx_fpc *fpc, struct nvfx_insn insn)
 {
 	struct nvfx_fragment_program *fp = fpc->fp;
 	uint32_t *hw;
@@ -197,68 +206,225 @@ nvfx_fp_arith(struct nvfx_fpc *fpc, int sat, int op,
 	hw = &fp->insn[fpc->inst_offset];
 	memset(hw, 0, sizeof(uint32_t) * 4);
 
-	if (op == NVFX_FP_OP_OPCODE_KIL)
+	if (insn.op == NVFX_FP_OP_OPCODE_KIL)
 		fp->fp_control |= NV34TCL_FP_CONTROL_USES_KIL;
-	hw[0] |= (op << NVFX_FP_OP_OPCODE_SHIFT);
-	hw[0] |= (mask << NVFX_FP_OP_OUTMASK_SHIFT);
-	hw[2] |= (dst.dst_scale << NVFX_FP_OP_DST_SCALE_SHIFT);
+	hw[0] |= (insn.op << NVFX_FP_OP_OPCODE_SHIFT);
+	hw[0] |= (insn.mask << NVFX_FP_OP_OUTMASK_SHIFT);
+	hw[2] |= (insn.scale << NVFX_FP_OP_DST_SCALE_SHIFT);
 
-	if (sat)
+	if (insn.sat)
 		hw[0] |= NVFX_FP_OP_OUT_SAT;
 
-	if (dst.cc_update)
+	if (insn.cc_update)
 		hw[0] |= NVFX_FP_OP_COND_WRITE_ENABLE;
-	hw[1] |= (dst.cc_test << NVFX_FP_OP_COND_SHIFT);
-	hw[1] |= ((dst.cc_swz[0] << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
-		  (dst.cc_swz[1] << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
-		  (dst.cc_swz[2] << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
-		  (dst.cc_swz[3] << NVFX_FP_OP_COND_SWZ_W_SHIFT));
-
-	emit_dst(fpc, dst);
-	emit_src(fpc, 0, s0);
-	emit_src(fpc, 1, s1);
-	emit_src(fpc, 2, s2);
+	hw[1] |= (insn.cc_test << NVFX_FP_OP_COND_SHIFT);
+	hw[1] |= ((insn.cc_swz[0] << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+		  (insn.cc_swz[1] << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
+		  (insn.cc_swz[2] << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
+		  (insn.cc_swz[3] << NVFX_FP_OP_COND_SWZ_W_SHIFT));
+
+	if(insn.unit >= 0)
+	{
+		hw[0] |= (insn.unit << NVFX_FP_OP_TEX_UNIT_SHIFT);
+		fp->samplers |= (1 << insn.unit);
+	}
+
+	emit_dst(fpc, insn.dst);
+	emit_src(fpc, 0, insn.src[0]);
+	emit_src(fpc, 1, insn.src[1]);
+	emit_src(fpc, 2, insn.src[2]);
 }
 
+#define arith(s,o,d,m,s0,s1,s2) \
+       nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, -1, \
+                       (d), (m), (s0), (s1), (s2))
+
+#define tex(s,o,u,d,m,s0,s1,s2) \
+	nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, (u), \
+                   (d), (m), (s0), none, none)
+
+/* IF src.x != 0, as TGSI specifies */
 static void
-nvfx_fp_tex(struct nvfx_fpc *fpc, int sat, int op, int unit,
-	    struct nvfx_sreg dst, int mask,
-	    struct nvfx_sreg s0, struct nvfx_sreg s1, struct nvfx_sreg s2)
+nv40_fp_if(struct nvfx_fpc *fpc, struct nvfx_src src)
 {
-	struct nvfx_fragment_program *fp = fpc->fp;
+	const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
+	struct nvfx_insn insn = arith(0, MOV, none.reg, NVFX_FP_MASK_X, src, none, none);
+	uint32_t *hw;
+	insn.cc_update = 1;
+	nvfx_fp_emit(fpc, insn);
 
-	nvfx_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2);
+	fpc->inst_offset = fpc->fp->insn_len;
+	grow_insns(fpc, 4);
+	hw = &fpc->fp->insn[fpc->inst_offset];
+	/* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+	hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
+		NV40_FP_OP_OUT_NONE |
+		(NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
+	/* Use .xxxx swizzle so that we check only src[0].x*/
+	hw[1] = (0 << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+			(0 << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
+			(0 << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
+			(0 << NVFX_FP_OP_COND_SWZ_W_SHIFT) |
+			(NVFX_FP_OP_COND_NE << NVFX_FP_OP_COND_SHIFT);
+	hw[2] = 0; /* | NV40_FP_OP_OPCODE_IS_BRANCH | else_offset */
+	hw[3] = 0; /* | endif_offset */
+	util_dynarray_append(&fpc->if_stack, unsigned, fpc->inst_offset);
+}
+
+/* Emit a CAL to label "target"; a relocation for the hw[2] call offset is recorded in label_relocs */
+static void
+nv40_fp_cal(struct nvfx_fpc *fpc, unsigned target)
+{
+        struct nvfx_relocation reloc;
+        uint32_t *hw;
+        fpc->inst_offset = fpc->fp->insn_len;
+        grow_insns(fpc, 4);
+        hw = &fpc->fp->insn[fpc->inst_offset];
+        /* only the opcode is set here: no OUT_NONE or precision bits, unlike IF/REP */
+        hw[0] = (NV40_FP_OP_BRA_OPCODE_CAL << NVFX_FP_OP_OPCODE_SHIFT);
+        /* identity swizzle with the TR condition (presumably "always true"), i.e. not a test of src[0].x */
+        hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
+                        (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+        hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */
+        hw[3] = 0;
+        reloc.target = target;
+        reloc.location = fpc->inst_offset + 2;
+        util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
+}
+
+static void
+nv40_fp_ret(struct nvfx_fpc *fpc)
+{
+	uint32_t *hw;
+	fpc->inst_offset = fpc->fp->insn_len;
+	grow_insns(fpc, 4);
+	hw = &fpc->fp->insn[fpc->inst_offset];
+	/* emits a RET: only the opcode is set in hw[0] (no OUT_NONE or precision bits) */
+	hw[0] = (NV40_FP_OP_BRA_OPCODE_RET << NVFX_FP_OP_OPCODE_SHIFT);
+	/* identity swizzle with the TR condition (presumably "always true"), i.e. not a test of src[0].x */
+	hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
+			(NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+	hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* no offset relocation is recorded for RET */
+	hw[3] = 0;
+}
 
-	fp->insn[fpc->inst_offset] |= (unit << NVFX_FP_OP_TEX_UNIT_SHIFT);
-	fp->samplers |= (1 << unit);
+static void
+nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target)
+{
+        struct nvfx_relocation reloc;
+        uint32_t *hw;
+        fpc->inst_offset = fpc->fp->insn_len;
+        grow_insns(fpc, 4);
+        hw = &fpc->fp->insn[fpc->inst_offset];
+        /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+        hw[0] = (NV40_FP_OP_BRA_OPCODE_REP << NVFX_FP_OP_OPCODE_SHIFT) |
+                        NV40_FP_OP_OUT_NONE |
+                        (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
+        /* identity swizzle with the TR condition (presumably "always true"), i.e. not a test of src[0].x */
+        hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
+                        (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+        hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH |
+                        (count << NV40_FP_OP_REP_COUNT1_SHIFT) |
+                        (count << NV40_FP_OP_REP_COUNT2_SHIFT) |
+                        (count << NV40_FP_OP_REP_COUNT3_SHIFT);
+        hw[3] = 0; /* | end_offset */
+        reloc.target = target;
+        reloc.location = fpc->inst_offset + 3;
+        util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
+        //util_dynarray_append(&fpc->loop_stack, unsigned, target);
 }
 
-static INLINE struct nvfx_sreg
+/* warning: this only works forward, and probably only if not inside any IF */
+static void
+nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target)
+{
+        struct nvfx_relocation reloc;
+        uint32_t *hw;
+        fpc->inst_offset = fpc->fp->insn_len;
+        grow_insns(fpc, 4);
+        hw = &fpc->fp->insn[fpc->inst_offset];
+        /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+        hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
+                NV40_FP_OP_OUT_NONE |
+                (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
+        /* IF with identity swizzle and the FL condition (presumably "never true"); else and endif offsets both relocate to target */
+        hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+                        (NVFX_FP_OP_COND_FL << NVFX_FP_OP_COND_SHIFT);
+        hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | else_offset */
+        hw[3] = 0; /* | endif_offset */
+        reloc.target = target;
+        reloc.location = fpc->inst_offset + 2;
+        util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
+        reloc.target = target;
+        reloc.location = fpc->inst_offset + 3;
+        util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
+}
+
+static void
+nv40_fp_brk(struct nvfx_fpc *fpc)
+{
+	uint32_t *hw;
+	fpc->inst_offset = fpc->fp->insn_len;
+	grow_insns(fpc, 4);
+	hw = &fpc->fp->insn[fpc->inst_offset];
+	/* emits a BRK: opcode plus OUT_NONE; no precision bits are set here */
+	hw[0] = (NV40_FP_OP_BRA_OPCODE_BRK << NVFX_FP_OP_OPCODE_SHIFT) |
+		NV40_FP_OP_OUT_NONE;
+	/* identity swizzle with the TR condition (presumably "always true"), i.e. not a test of src[0].x */
+	hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+			(NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+	hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH;
+	hw[3] = 0;
+}
+
+static INLINE struct nvfx_src
 tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
 {
-	struct nvfx_sreg src = { 0 };
+	struct nvfx_src src;
 
 	switch (fsrc->Register.File) {
 	case TGSI_FILE_INPUT:
-		src = nvfx_sr(NVFXSR_INPUT,
-			      fpc->attrib_map[fsrc->Register.Index]);
+		if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_POSITION) {
+			assert(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0);
+			src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_POSITION);
+		} else if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_COLOR) {
+			if(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0)
+				src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_COL0);
+			else if(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 1)
+				src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_COL1);
+			else
+				assert(0);
+		} else if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FOG) {
+			assert(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0);
+			src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_FOGC);
+		} else if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FACE) {
+			/* TODO: check this has the correct values */
+			/* XXX: what do we do for nv30 here (assuming it lacks facing)?!  */
+			assert(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0);
+			src.reg = nvfx_reg(NVFXSR_INPUT, NV40_FP_OP_INPUT_SRC_FACING);
+		} else {
+			assert(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_GENERIC);
+			src.reg = nvfx_reg(NVFXSR_RELOCATED, fpc->generic_to_slot[fpc->pfp->info.input_semantic_index[fsrc->Register.Index]]);
+		}
 		break;
 	case TGSI_FILE_CONSTANT:
-		src = constant(fpc, fsrc->Register.Index, NULL);
+		src.reg = constant(fpc, fsrc->Register.Index, NULL);
 		break;
 	case TGSI_FILE_IMMEDIATE:
 		assert(fsrc->Register.Index < fpc->nr_imm);
-		src = fpc->imm[fsrc->Register.Index];
+		src.reg = fpc->imm[fsrc->Register.Index];
 		break;
 	case TGSI_FILE_TEMPORARY:
-		src = fpc->r_temp[fsrc->Register.Index];
+		src.reg = fpc->r_temp[fsrc->Register.Index];
 		break;
 	/* NV40 fragprog result regs are just temps, so this is simple */
 	case TGSI_FILE_OUTPUT:
-		src = fpc->r_result[fsrc->Register.Index];
+		src.reg = fpc->r_result[fsrc->Register.Index];
 		break;
 	default:
 		NOUVEAU_ERR("bad src file\n");
+		src.reg.index = 0;
+		src.reg.type = 0;
 		break;
 	}
 
@@ -271,7 +437,7 @@ tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
 	return src;
 }
 
-static INLINE struct nvfx_sreg
+static INLINE struct nvfx_reg
 tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
 	switch (fdst->Register.File) {
 	case TGSI_FILE_OUTPUT:
@@ -279,10 +445,10 @@ tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
 	case TGSI_FILE_TEMPORARY:
 		return fpc->r_temp[fdst->Register.Index];
 	case TGSI_FILE_NULL:
-		return nvfx_sr(NVFXSR_NONE, 0);
+		return nvfx_reg(NVFXSR_NONE, 0);
 	default:
 		NOUVEAU_ERR("bad dst file %d\n", fdst->Register.File);
-		return nvfx_sr(NVFXSR_NONE, 0);
+		return nvfx_reg(NVFXSR_NONE, 0);
 	}
 }
 
@@ -302,8 +468,10 @@ static boolean
 nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
 				const struct tgsi_full_instruction *finst)
 {
-	const struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0);
-	struct nvfx_sreg src[3], dst, tmp;
+	const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
+	struct nvfx_insn insn;
+	struct nvfx_src src[3], tmp, tmp2;
+	struct nvfx_reg dst;
 	int mask, sat, unit = 0;
 	int ai = -1, ci = -1, ii = -1;
 	int i;
@@ -331,9 +499,8 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
 				ai = fsrc->Register.Index;
 				src[i] = tgsi_src(fpc, fsrc);
 			} else {
-				src[i] = temp(fpc);
-				arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL,
-				      tgsi_src(fpc, fsrc), none, none);
+				src[i] = nvfx_src(temp(fpc));
+				nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
 			}
 			break;
 		case TGSI_FILE_CONSTANT:
@@ -342,9 +509,8 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
 				ci = fsrc->Register.Index;
 				src[i] = tgsi_src(fpc, fsrc);
 			} else {
-				src[i] = temp(fpc);
-				arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL,
-				      tgsi_src(fpc, fsrc), none, none);
+				src[i] = nvfx_src(temp(fpc));
+				nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
 			}
 			break;
 		case TGSI_FILE_IMMEDIATE:
@@ -353,9 +519,8 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
 				ii = fsrc->Register.Index;
 				src[i] = tgsi_src(fpc, fsrc);
 			} else {
-				src[i] = temp(fpc);
-				arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL,
-				      tgsi_src(fpc, fsrc), none, none);
+				src[i] = nvfx_src(temp(fpc));
+				nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
 			}
 			break;
 		case TGSI_FILE_TEMPORARY:
@@ -378,277 +543,345 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
 
 	switch (finst->Instruction.Opcode) {
 	case TGSI_OPCODE_ABS:
-		arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none);
+		nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, abs(src[0]), none, none));
 		break;
 	case TGSI_OPCODE_ADD:
-		arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_CMP:
-		tmp = nvfx_sr(NVFXSR_NONE, 0);
-		tmp.cc_update = 1;
-		arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
-		dst.cc_test = NVFX_COND_GE;
-		arith(fpc, sat, MOV, dst, mask, src[2], none, none);
-		dst.cc_test = NVFX_COND_LT;
-		arith(fpc, sat, MOV, dst, mask, src[1], none, none);
+		insn = arith(0, MOV, none.reg, mask, src[0], none, none);
+		insn.cc_update = 1;
+		nvfx_fp_emit(fpc, insn);
+
+		insn = arith(sat, MOV, dst, mask, src[2], none, none);
+		insn.cc_test = NVFX_COND_GE;
+		nvfx_fp_emit(fpc, insn);
+
+		insn = arith(sat, MOV, dst, mask, src[1], none, none);
+		insn.cc_test = NVFX_COND_LT;
+		nvfx_fp_emit(fpc, insn);
 		break;
 	case TGSI_OPCODE_COS:
-		arith(fpc, sat, COS, dst, mask, src[0], none, none);
+		nvfx_fp_emit(fpc, arith(sat, COS, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_DDX:
 		if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
-			tmp = temp(fpc);
-			arith(fpc, sat, DDX, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y,
-			      swz(src[0], Z, W, Z, W), none, none);
-			arith(fpc, 0, MOV, tmp, NVFX_FP_MASK_Z | NVFX_FP_MASK_W,
-			      swz(tmp, X, Y, X, Y), none, none);
-			arith(fpc, sat, DDX, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0],
-			      none, none);
-			arith(fpc, 0, MOV, dst, mask, tmp, none, none);
+			tmp = nvfx_src(temp(fpc));
+			nvfx_fp_emit(fpc, arith(sat, DDX, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, swz(src[0], Z, W, Z, W), none, none));
+			nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, swz(tmp, X, Y, X, Y), none, none));
+			nvfx_fp_emit(fpc, arith(sat, DDX, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none));
+			nvfx_fp_emit(fpc, arith(0, MOV, dst, mask, tmp, none, none));
 		} else {
-			arith(fpc, sat, DDX, dst, mask, src[0], none, none);
+			nvfx_fp_emit(fpc, arith(sat, DDX, dst, mask, src[0], none, none));
 		}
 		break;
 	case TGSI_OPCODE_DDY:
 		if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
-			tmp = temp(fpc);
-			arith(fpc, sat, DDY, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y,
-			      swz(src[0], Z, W, Z, W), none, none);
-			arith(fpc, 0, MOV, tmp, NVFX_FP_MASK_Z | NVFX_FP_MASK_W,
-			      swz(tmp, X, Y, X, Y), none, none);
-			arith(fpc, sat, DDY, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0],
-			      none, none);
-			arith(fpc, 0, MOV, dst, mask, tmp, none, none);
+			tmp = nvfx_src(temp(fpc));
+			nvfx_fp_emit(fpc, arith(sat, DDY, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, swz(src[0], Z, W, Z, W), none, none));
+			nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, swz(tmp, X, Y, X, Y), none, none));
+			nvfx_fp_emit(fpc, arith(sat, DDY, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none));
+			nvfx_fp_emit(fpc, arith(0, MOV, dst, mask, tmp, none, none));
 		} else {
-			arith(fpc, sat, DDY, dst, mask, src[0], none, none);
+			nvfx_fp_emit(fpc, arith(sat, DDY, dst, mask, src[0], none, none));
 		}
 		break;
+	case TGSI_OPCODE_DP2: /* dst = src0.x * src1.x + src0.y * src1.y */
+		tmp = nvfx_src(temp(fpc));
+		nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], src[1], none));
+		nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, swz(tmp, X, X, X, X), swz(tmp, Y, Y, Y, Y), none)); /* final write to dst must honour the saturate modifier, as DPH does */
+		break;
 	case TGSI_OPCODE_DP3:
-		arith(fpc, sat, DP3, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, DP3, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_DP4:
-		arith(fpc, sat, DP4, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, DP4, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_DPH:
-		tmp = temp(fpc);
-		arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_X, src[0], src[1], none);
-		arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X),
-		      swz(src[1], W, W, W, W), none);
+		tmp = nvfx_src(temp(fpc));
+		nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_X, src[0], src[1], none));
+		nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, swz(tmp, X, X, X, X), swz(src[1], W, W, W, W), none));
 		break;
 	case TGSI_OPCODE_DST:
-		arith(fpc, sat, DST, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, DST, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_EX2:
-		arith(fpc, sat, EX2, dst, mask, src[0], none, none);
+		nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_FLR:
-		arith(fpc, sat, FLR, dst, mask, src[0], none, none);
+		nvfx_fp_emit(fpc, arith(sat, FLR, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_FRC:
-		arith(fpc, sat, FRC, dst, mask, src[0], none, none);
+		nvfx_fp_emit(fpc, arith(sat, FRC, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_KILP:
-		arith(fpc, 0, KIL, none, 0, none, none, none);
+		nvfx_fp_emit(fpc, arith(0, KIL, none.reg, 0, none, none, none));
 		break;
 	case TGSI_OPCODE_KIL:
-		dst = nvfx_sr(NVFXSR_NONE, 0);
-		dst.cc_update = 1;
-		arith(fpc, 0, MOV, dst, NVFX_FP_MASK_ALL, src[0], none, none);
-		dst.cc_update = 0; dst.cc_test = NVFX_COND_LT;
-		arith(fpc, 0, KIL, dst, 0, none, none, none);
+		insn = arith(0, MOV, none.reg, NVFX_FP_MASK_ALL, src[0], none, none);
+		insn.cc_update = 1;
+		nvfx_fp_emit(fpc, insn);
+
+		insn = arith(0, KIL, none.reg, 0, none, none, none);
+		insn.cc_test = NVFX_COND_LT;
+		nvfx_fp_emit(fpc, insn);
 		break;
 	case TGSI_OPCODE_LG2:
-		arith(fpc, sat, LG2, dst, mask, src[0], none, none);
+		nvfx_fp_emit(fpc, arith(sat, LG2, dst, mask, src[0], none, none));
 		break;
 //	case TGSI_OPCODE_LIT:
 	case TGSI_OPCODE_LRP:
 		if(!nvfx->is_nv4x)
-			arith(fpc, sat, LRP_NV30, dst, mask, src[0], src[1], src[2]);
+			nvfx_fp_emit(fpc, arith(sat, LRP_NV30, dst, mask, src[0], src[1], src[2]));
 		else {
-			tmp = temp(fpc);
-			arith(fpc, 0, MAD, tmp, mask, neg(src[0]), src[2], src[2]);
-			arith(fpc, sat, MAD, dst, mask, src[0], src[1], tmp);
+			tmp = nvfx_src(temp(fpc));
+			nvfx_fp_emit(fpc, arith(0, MAD, tmp.reg, mask, neg(src[0]), src[2], src[2]));
+			nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, src[0], src[1], tmp));
 		}
 		break;
 	case TGSI_OPCODE_MAD:
-		arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]);
+		nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, src[0], src[1], src[2]));
 		break;
 	case TGSI_OPCODE_MAX:
-		arith(fpc, sat, MAX, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, MAX, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_MIN:
-		arith(fpc, sat, MIN, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, MIN, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_MOV:
-		arith(fpc, sat, MOV, dst, mask, src[0], none, none);
+		nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_MUL:
-		arith(fpc, sat, MUL, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, MUL, dst, mask, src[0], src[1], none));
+		break;
+	case TGSI_OPCODE_NOP:
 		break;
 	case TGSI_OPCODE_POW:
 		if(!nvfx->is_nv4x)
-			arith(fpc, sat, POW_NV30, dst, mask, src[0], src[1], none);
+			nvfx_fp_emit(fpc, arith(sat, POW_NV30, dst, mask, src[0], src[1], none));
 		else {
-			tmp = temp(fpc);
-			arith(fpc, 0, LG2, tmp, NVFX_FP_MASK_X,
-			      swz(src[0], X, X, X, X), none, none);
-			arith(fpc, 0, MUL, tmp, NVFX_FP_MASK_X, swz(tmp, X, X, X, X),
-			      swz(src[1], X, X, X, X), none);
-			arith(fpc, sat, EX2, dst, mask,
-			      swz(tmp, X, X, X, X), none, none);
+			tmp = nvfx_src(temp(fpc));
+			nvfx_fp_emit(fpc, arith(0, LG2, tmp.reg, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
+			nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X, swz(tmp, X, X, X, X), swz(src[1], X, X, X, X), none));
+			nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, swz(tmp, X, X, X, X), none, none));
 		}
 		break;
 	case TGSI_OPCODE_RCP:
-		arith(fpc, sat, RCP, dst, mask, src[0], none, none);
-		break;
-	case TGSI_OPCODE_RET:
-		assert(0);
+		nvfx_fp_emit(fpc, arith(sat, RCP, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_RFL:
 		if(!nvfx->is_nv4x)
-			arith(fpc, 0, RFL_NV30, dst, mask, src[0], src[1], none);
+			nvfx_fp_emit(fpc, arith(0, RFL_NV30, dst, mask, src[0], src[1], none));
 		else {
-			tmp = temp(fpc);
-			arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_X, src[0], src[0], none);
-			arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_Y, src[0], src[1], none);
-			arith(fpc, 0, DIV, scale(tmp, 2X), NVFX_FP_MASK_Z,
-			      swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none);
-			arith(fpc, sat, MAD, dst, mask,
-			      swz(tmp, Z, Z, Z, Z), src[0], neg(src[1]));
+			tmp = nvfx_src(temp(fpc));
+			nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_X, src[0], src[0], none));
+			nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_Y, src[0], src[1], none));
+			insn = arith(0, DIV, tmp.reg, NVFX_FP_MASK_Z, swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none);
+			insn.scale = NVFX_FP_OP_DST_SCALE_2X;
+			nvfx_fp_emit(fpc, insn);
+			nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, swz(tmp, Z, Z, Z, Z), src[0], neg(src[1])));
 		}
 		break;
 	case TGSI_OPCODE_RSQ:
 		if(!nvfx->is_nv4x)
-			arith(fpc, sat, RSQ_NV30, dst, mask, abs(swz(src[0], X, X, X, X)), none, none);
+			nvfx_fp_emit(fpc, arith(sat, RSQ_NV30, dst, mask, abs(swz(src[0], X, X, X, X)), none, none));
 		else {
-			tmp = temp(fpc);
-			arith(fpc, 0, LG2, scale(tmp, INV_2X), NVFX_FP_MASK_X,
-			      abs(swz(src[0], X, X, X, X)), none, none);
-			arith(fpc, sat, EX2, dst, mask,
-			      neg(swz(tmp, X, X, X, X)), none, none);
+			tmp = nvfx_src(temp(fpc));
+			insn = arith(0, LG2, tmp.reg, NVFX_FP_MASK_X, abs(swz(src[0], X, X, X, X)), none, none);
+			insn.scale = NVFX_FP_OP_DST_SCALE_INV_2X;
+			nvfx_fp_emit(fpc, insn);
+			nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, neg(swz(tmp, X, X, X, X)), none, none));
 		}
 		break;
 	case TGSI_OPCODE_SCS:
 		/* avoid overwriting the source */
 		if(src[0].swz[NVFX_SWZ_X] != NVFX_SWZ_X)
 		{
-			if (mask & NVFX_FP_MASK_X) {
-				arith(fpc, sat, COS, dst, NVFX_FP_MASK_X,
-				      swz(src[0], X, X, X, X), none, none);
-			}
-			if (mask & NVFX_FP_MASK_Y) {
-				arith(fpc, sat, SIN, dst, NVFX_FP_MASK_Y,
-				      swz(src[0], X, X, X, X), none, none);
-			}
+			if (mask & NVFX_FP_MASK_X)
+				nvfx_fp_emit(fpc, arith(sat, COS, dst, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
+			if (mask & NVFX_FP_MASK_Y)
+				nvfx_fp_emit(fpc, arith(sat, SIN, dst, NVFX_FP_MASK_Y, swz(src[0], X, X, X, X), none, none));
 		}
 		else
 		{
-			if (mask & NVFX_FP_MASK_Y) {
-				arith(fpc, sat, SIN, dst, NVFX_FP_MASK_Y,
-				      swz(src[0], X, X, X, X), none, none);
-			}
-			if (mask & NVFX_FP_MASK_X) {
-				arith(fpc, sat, COS, dst, NVFX_FP_MASK_X,
-				      swz(src[0], X, X, X, X), none, none);
-			}
+			if (mask & NVFX_FP_MASK_Y)
+				nvfx_fp_emit(fpc, arith(sat, SIN, dst, NVFX_FP_MASK_Y, swz(src[0], X, X, X, X), none, none));
+			if (mask & NVFX_FP_MASK_X)
+				nvfx_fp_emit(fpc, arith(sat, COS, dst, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
 		}
 		break;
 	case TGSI_OPCODE_SEQ:
-		arith(fpc, sat, SEQ, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, SEQ, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SFL:
-		arith(fpc, sat, SFL, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, SFL, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SGE:
-		arith(fpc, sat, SGE, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, SGE, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SGT:
-		arith(fpc, sat, SGT, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, SGT, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SIN:
-		arith(fpc, sat, SIN, dst, mask, src[0], none, none);
+		nvfx_fp_emit(fpc, arith(sat, SIN, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_SLE:
-		arith(fpc, sat, SLE, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, SLE, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SLT:
-		arith(fpc, sat, SLT, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, SLT, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SNE:
-		arith(fpc, sat, SNE, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, SNE, dst, mask, src[0], src[1], none));
+		break;
+	case TGSI_OPCODE_SSG: /* sign(x) = (x > 0) - (x < 0), comparing against constant slot 0 */
+		tmp = nvfx_src(temp(fpc));
+		tmp2 = nvfx_src(temp(fpc));
+		nvfx_fp_emit(fpc, arith(0, SGT, tmp.reg, mask, src[0], nvfx_src(nvfx_reg(NVFXSR_CONST, 0)), none)); /* tmp = (x > c[0]) */
+		nvfx_fp_emit(fpc, arith(0, SLT, tmp2.reg, mask, src[0], nvfx_src(nvfx_reg(NVFXSR_CONST, 0)), none)); /* tmp2 = (x < c[0]); must not clobber tmp */
+		nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, tmp, neg(tmp2), none));
+		break;
 	case TGSI_OPCODE_STR:
-		arith(fpc, sat, STR, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, STR, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SUB:
-		arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none);
+		nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], neg(src[1]), none));
 		break;
 	case TGSI_OPCODE_TEX:
-		tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none);
+		nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none));
 		break;
-	case TGSI_OPCODE_TXB:
-		tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none);
+        case TGSI_OPCODE_TRUNC:
+                tmp = nvfx_src(temp(fpc));
+                insn = arith(0, MOV, none.reg, mask, src[0], none, none);
+                insn.cc_update = 1;
+                nvfx_fp_emit(fpc, insn);
+
+                nvfx_fp_emit(fpc, arith(0, FLR, tmp.reg, mask, abs(src[0]), none, none));
+                nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, tmp, none, none));
+
+                insn = arith(sat, MOV, dst, mask, neg(tmp), none, none);
+                insn.cc_test = NVFX_COND_LT;
+                nvfx_fp_emit(fpc, insn);
+                break;
+        case TGSI_OPCODE_TXB:
+                nvfx_fp_emit(fpc, tex(sat, TXB, unit, dst, mask, src[0], none, none));
+                break;
+        case TGSI_OPCODE_TXL:
+                if(nvfx->is_nv4x)
+                        nvfx_fp_emit(fpc, tex(sat, TXL_NV40, unit, dst, mask, src[0], none, none));
+                else /* unsupported on nv30, use TEX and hope they like it */
+                        nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none));
+                break;
+        case TGSI_OPCODE_TXP:
+                nvfx_fp_emit(fpc, tex(sat, TXP, unit, dst, mask, src[0], none, none));
+                break;
+	case TGSI_OPCODE_XPD:
+		tmp = nvfx_src(temp(fpc));
+		nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none));
+		nvfx_fp_emit(fpc, arith(sat, MAD, dst, (mask & ~NVFX_FP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp)));
 		break;
-	case TGSI_OPCODE_TXP:
-		tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none);
+
+	case TGSI_OPCODE_IF:
+		// MOVRC0 R31 (TR0.xyzw), R<src>:
+		// IF (NE.xxxx) ELSE <else> END <end>
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		nv40_fp_if(fpc, src[0]);
 		break;
-	case TGSI_OPCODE_XPD:
-		tmp = temp(fpc);
-		arith(fpc, 0, MUL, tmp, mask,
-		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
-		arith(fpc, sat, MAD, dst, (mask & ~NVFX_FP_MASK_W),
-		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
-		      neg(tmp));
+
+	case TGSI_OPCODE_ELSE:
+	{
+		uint32_t *hw;
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		assert(util_dynarray_contains(&fpc->if_stack, unsigned));
+		hw = &fpc->fp->insn[util_dynarray_top(&fpc->if_stack, unsigned)];
+		hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
 		break;
-	default:
-		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
-		return FALSE;
 	}
 
-	release_temps(fpc);
-	return TRUE;
-}
+	case TGSI_OPCODE_ENDIF:
+	{
+		uint32_t *hw;
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		assert(util_dynarray_contains(&fpc->if_stack, unsigned));
+		hw = &fpc->fp->insn[util_dynarray_pop(&fpc->if_stack, unsigned)];
+		if(!hw[2])
+			hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
+		hw[3] = fpc->fp->insn_len;
+		break;
+	}
 
-static boolean
-nvfx_fragprog_parse_decl_attrib(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
-				const struct tgsi_full_declaration *fdec)
-{
-	int hw;
+	case TGSI_OPCODE_BRA:
+		/* This can in limited cases be implemented with an IF with the else and endif labels pointing to the target */
+		/* no state tracker uses this, so don't implement this for now */
+		assert(0);
+		nv40_fp_bra(fpc, finst->Label.Label);
+		break;
 
-	switch (fdec->Semantic.Name) {
-	case TGSI_SEMANTIC_POSITION:
-		hw = NVFX_FP_OP_INPUT_SRC_POSITION;
+	case TGSI_OPCODE_BGNSUB:
+	case TGSI_OPCODE_ENDSUB:
+		/* nothing to do here */
 		break;
-	case TGSI_SEMANTIC_COLOR:
-		if (fdec->Semantic.Index == 0) {
-			hw = NVFX_FP_OP_INPUT_SRC_COL0;
-		} else
-		if (fdec->Semantic.Index == 1) {
-			hw = NVFX_FP_OP_INPUT_SRC_COL1;
-		} else {
-			NOUVEAU_ERR("bad colour semantic index\n");
-			return FALSE;
-		}
+
+	case TGSI_OPCODE_CAL:
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		nv40_fp_cal(fpc, finst->Label.Label);
 		break;
-	case TGSI_SEMANTIC_FOG:
-		hw = NVFX_FP_OP_INPUT_SRC_FOGC;
+
+	case TGSI_OPCODE_RET:
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		nv40_fp_ret(fpc);
 		break;
-	case TGSI_SEMANTIC_GENERIC:
-		if (fdec->Semantic.Index <= 7) {
-			hw = NVFX_FP_OP_INPUT_SRC_TC(fdec->Semantic.
-						     Index);
-		} else {
-			NOUVEAU_ERR("bad generic semantic index\n");
-			return FALSE;
+
+	case TGSI_OPCODE_BGNLOOP:
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		/* TODO: we should support using two nested REPs to allow a > 255 iteration count */
+		nv40_fp_rep(fpc, 255, finst->Label.Label);
+		break;
+
+	case TGSI_OPCODE_ENDLOOP:
+		break;
+
+	case TGSI_OPCODE_BRK:
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		nv40_fp_brk(fpc);
+		break;
+
+	case TGSI_OPCODE_CONT:
+	{
+		static int warned = 0;
+		if(!warned) {
+			NOUVEAU_ERR("Sorry, the continue keyword is not implemented: ignoring it.\n");
+			warned = 1;
 		}
 		break;
-	default:
-		NOUVEAU_ERR("bad input semantic\n");
+	}
+
+        default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
 		return FALSE;
 	}
 
-	fpc->attrib_map[fdec->Range.First] = hw;
+out:
+	release_temps(fpc);
 	return TRUE;
+nv3x_cflow:
+	{
+		static int warned = 0;
+		if(!warned) {
+			NOUVEAU_ERR(
+					"Sorry, control flow instructions are not supported in hardware on nv3x: ignoring them\n"
+					"If rendering is incorrect, try to disable GLSL support in the application.\n");
+			warned = 1;
+		}
+	}
+	goto out;
 }
 
 static boolean
@@ -680,8 +913,8 @@ nvfx_fragprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
 		return FALSE;
 	}
 
-	fpc->r_result[idx] = nvfx_sr(NVFXSR_OUTPUT, hw);
-	fpc->r_temps |= (1 << hw);
+	fpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
+	fpc->r_temps |= (1ULL << hw);
 	return TRUE;
 }
 
@@ -690,8 +923,22 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc)
 {
 	struct tgsi_parse_context p;
 	int high_temp = -1, i;
+	struct util_semantic_set set;
+	float const0v[4] = {0, 0, 0, 0};
+	struct nvfx_reg const0;
+
+	fpc->fp->num_slots = util_semantic_set_from_program_file(&set, fpc->pfp->pipe.tokens, TGSI_FILE_INPUT);
+	if(fpc->fp->num_slots > 8)
+		return FALSE;
+	util_semantic_layout_from_set(fpc->fp->slot_to_generic, &set, 0, 8);
+	util_semantic_table_from_layout(fpc->generic_to_slot, fpc->fp->slot_to_generic, 0, 8);
 
-	tgsi_parse_init(&p, fpc->fp->pipe.tokens);
+	memset(fpc->fp->slot_to_fp_input, 0xff, sizeof(fpc->fp->slot_to_fp_input));
+
+	const0 = constant(fpc, -1, const0v);
+	assert(const0.index == 0);
+
+	tgsi_parse_init(&p, fpc->pfp->pipe.tokens);
 	while (!tgsi_parse_end_of_tokens(&p)) {
 		const union tgsi_full_token *tok = &p.FullToken;
 
@@ -702,10 +949,6 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc)
 			const struct tgsi_full_declaration *fdec;
 			fdec = &p.FullToken.FullDeclaration;
 			switch (fdec->Declaration.File) {
-			case TGSI_FILE_INPUT:
-				if (!nvfx_fragprog_parse_decl_attrib(nvfx, fpc, fdec))
-					goto out_err;
-				break;
 			case TGSI_FILE_OUTPUT:
 				if (!nvfx_fragprog_parse_decl_output(nvfx, fpc, fdec))
 					goto out_err;
@@ -744,40 +987,66 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc)
 	tgsi_parse_free(&p);
 
 	if (++high_temp) {
-		fpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_sreg));
+		fpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_reg));
 		for (i = 0; i < high_temp; i++)
 			fpc->r_temp[i] = temp(fpc);
-		fpc->r_temps_discard = 0;
+		fpc->r_temps_discard = 0ULL;
 	}
 
 	return TRUE;
 
 out_err:
-	if (fpc->r_temp)
+	if (fpc->r_temp) {
 		FREE(fpc->r_temp);
+		fpc->r_temp = NULL;
+	}
 	tgsi_parse_free(&p);
 	return FALSE;
 }
 
-static void
+DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", FALSE)
+
+static struct nvfx_fragment_program*
 nvfx_fragprog_translate(struct nvfx_context *nvfx,
-			struct nvfx_fragment_program *fp)
+			struct nvfx_pipe_fragment_program *pfp,
+			boolean emulate_sprite_flipping)
 {
 	struct tgsi_parse_context parse;
 	struct nvfx_fpc *fpc = NULL;
+	struct util_dynarray insns;
+	struct nvfx_fragment_program* fp = NULL;
+        const int min_size = 4096;
 
-	fpc = CALLOC(1, sizeof(struct nvfx_fpc));
+	fp = CALLOC_STRUCT(nvfx_fragment_program);
+	if(!fp)
+		goto out_err;
+
+	fpc = CALLOC_STRUCT(nvfx_fpc);
 	if (!fpc)
-		return;
+		goto out_err;
+
+	fpc->max_temps = nvfx->is_nv4x ? 48 : 32;
+	fpc->pfp = pfp;
 	fpc->fp = fp;
 	fpc->num_regs = 2;
 
-	if (!nvfx_fragprog_prepare(nvfx, fpc)) {
-		FREE(fpc);
-		return;
-	}
+	if (!nvfx_fragprog_prepare(nvfx, fpc))
+		goto out_err;
 
-	tgsi_parse_init(&parse, fp->pipe.tokens);
+	tgsi_parse_init(&parse, pfp->pipe.tokens);
+	util_dynarray_init(&insns);
+
+	if(emulate_sprite_flipping)
+	{
+		struct nvfx_reg reg = temp(fpc);
+		struct nvfx_src sprite_input = nvfx_src(nvfx_reg(NVFXSR_RELOCATED, fp->num_slots));
+		float v[4] = {1, -1, 0, 0};
+		struct nvfx_src imm = nvfx_src(constant(fpc, -1, v));
+
+		fpc->sprite_coord_temp = reg.index;
+		fpc->r_temps_discard = 0ULL;
+		nvfx_fp_emit(fpc, arith(0, MAD, reg, NVFX_FP_MASK_ALL, sprite_input, swz(imm, X, Y, X, X), swz(imm, Z, X, Z, Z)));
+	}
 
 	while (!tgsi_parse_end_of_tokens(&parse)) {
 		tgsi_parse_token(&parse);
@@ -787,6 +1056,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 		{
 			const struct tgsi_full_instruction *finst;
 
+			util_dynarray_append(&insns, unsigned, fp->insn_len);
 			finst = &parse.FullToken.FullInstruction;
 			if (!nvfx_fragprog_parse_instruction(nvfx, fpc, finst))
 				goto out_err;
@@ -796,6 +1066,14 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 			break;
 		}
 	}
+	util_dynarray_append(&insns, unsigned, fp->insn_len);
+
+	for(unsigned i = 0; i < fpc->label_relocs.size; i += sizeof(struct nvfx_relocation))
+	{
+		struct nvfx_relocation* label_reloc = (struct nvfx_relocation*)((char*)fpc->label_relocs.data + i);
+		fp->insn[label_reloc->location] |= ((unsigned*)insns.data)[label_reloc->target];
+	}
+	util_dynarray_fini(&insns);
 
 	if(!nvfx->is_nv4x)
 		fp->fp_control |= (fpc->num_regs-1)/2;
@@ -804,9 +1082,9 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 
 	/* Terminate final instruction */
 	if(fp->insn)
-                fp->insn[fpc->inst_offset] |= 0x00000001;
+		fp->insn[fpc->inst_offset] |= 0x00000001;
 
-	/* Append NOP + END instruction, may or may not be necessary. */
+	/* Append NOP + END instruction for branches to the end of the program */
 	fpc->inst_offset = fp->insn_len;
 	grow_insns(fpc, 4);
 	fp->insn[fpc->inst_offset + 0] = 0x00000001;
@@ -814,12 +1092,48 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 	fp->insn[fpc->inst_offset + 2] = 0x00000000;
 	fp->insn[fpc->inst_offset + 3] = 0x00000000;
 
-	fp->translated = TRUE;
-out_err:
+	if(debug_get_option_nvfx_dump_fp())
+	{
+		debug_printf("\n");
+		tgsi_dump(pfp->pipe.tokens, 0);
+
+		debug_printf("\n%s fragment program:\n", nvfx->is_nv4x ? "nv4x" : "nv3x");
+		for (unsigned i = 0; i < fp->insn_len; i += 4)
+			debug_printf("%3u: %08x %08x %08x %08x\n", i >> 2, fp->insn[i], fp->insn[i + 1], fp->insn[i + 2], fp->insn[i + 3]);
+		debug_printf("\n");
+	}
+
+        fp->prog_size = (fp->insn_len * 4 + 63) & ~63;
+
+        if(fp->prog_size >= min_size)
+                fp->progs_per_bo = 1;
+        else
+                fp->progs_per_bo = min_size / fp->prog_size;
+        fp->bo_prog_idx = fp->progs_per_bo - 1;
+
+out:
 	tgsi_parse_free(&parse);
-	if (fpc->r_temp)
-		FREE(fpc->r_temp);
-	FREE(fpc);
+	if(fpc)
+	{
+		if (fpc->r_temp)
+			FREE(fpc->r_temp);
+		util_dynarray_fini(&fpc->if_stack);
+		util_dynarray_fini(&fpc->label_relocs);
+		//util_dynarray_fini(&fpc->loop_stack);
+		FREE(fpc);
+	}
+	return fp;
+
+out_err:
+	_debug_printf("Error: failed to compile this fragment program:\n");
+	tgsi_dump(pfp->pipe.tokens, 0);
+
+	if(fp)
+	{
+		FREE(fp);
+		fp = NULL;
+	}
+	goto out;
 }
 
 static inline void
@@ -836,53 +1150,189 @@ nvfx_fp_memcpy(void* dst, const void* src, size_t len)
 #endif
 }
 
+/* The hardware only supports immediate constants inside the fragment program,
+ * and at least on nv30 doesn't support an indirect linkage table.
+ *
+ * Hence, we need to patch the fragment program itself both to update constants
+ * and update linkage.
+ *
+ * Using a single fragment program would entail unacceptable stalls if the GPU is
+ * already rendering with that fragment program.
+ * Thus, we instead use a "rotating queue" of buffer objects, each of which is
+ * packed with multiple versions of the same program.
+ *
+ * Whenever we need to patch something, we move to the next program and
+ * patch it. If all buffer objects are in use by the GPU, we allocate another one,
+ * expanding the queue.
+ *
+ * As an additional optimization, we record when all the programs have the
+ * current input slot configuration, and at that point we stop patching inputs.
+ * This happens, for instance, if a given fragment program is always used with
+ * the same vertex program (i.e. always with GLSL), or if the layouts match
+ * enough (non-GLSL).
+ *
+ * Note that instead of using multiple programs, we could push commands
+ * on the FIFO to patch a single program: it's not fully clear which option is
+ * faster, but my guess is that the current way is faster.
+ *
+ * We also track the previous slot assignments for each version and don't
+ * patch if they are the same (this could perhaps be removed).
+ */
+
 void
 nvfx_fragprog_validate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
-	struct nvfx_fragment_program *fp = nvfx->fragprog;
-	int update = 0;
-
-	if (!fp->translated)
+	struct nvfx_pipe_fragment_program *pfp = nvfx->fragprog;
+	struct nvfx_vertex_program* vp;
+	/* Gallium always puts the point coord in GENERIC[0]
+	 * TODO: this is wrong, Gallium needs to be fixed
+	 */
+	unsigned sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * (nvfx->rasterizer->pipe.sprite_coord_enable | 1);
+
+	boolean emulate_sprite_flipping = sprite_coord_enable && nvfx->rasterizer->pipe.sprite_coord_mode;
+	unsigned key = emulate_sprite_flipping;
+	struct nvfx_fragment_program* fp;
+
+	fp = pfp->fps[key];
+	if (!fp)
 	{
-		const int min_size = 4096;
+		fp = nvfx_fragprog_translate(nvfx, pfp, emulate_sprite_flipping);
 
-		nvfx_fragprog_translate(nvfx, fp);
-		if (!fp->translated) {
-			static unsigned dummy[8] = {1, 0, 0, 0, 1, 0, 0, 0};
-			static int warned = 0;
-			if(!warned)
+		if(!fp)
+		{
+			if(!nvfx->dummy_fs)
 			{
-				fprintf(stderr, "nvfx: failed to translate fragment program!\n");
-				warned = 1;
+				struct ureg_program *ureg = ureg_create( TGSI_PROCESSOR_FRAGMENT );
+				if (ureg)
+				{
+					ureg_END( ureg );
+					nvfx->dummy_fs = ureg_create_shader_and_destroy( ureg, &nvfx->pipe );
+				}
+
+				if(!nvfx->dummy_fs)
+				{
+					_debug_printf("Error: unable to create a dummy fragment shader: aborting.");
+					abort();
+				}
 			}
 
-			/* use dummy program: we cannot fail here */
-			fp->translated = TRUE;
-			fp->insn = malloc(sizeof(dummy));
-			memcpy(fp->insn, dummy, sizeof(dummy));
-			fp->insn_len = sizeof(dummy) / sizeof(dummy[0]);
+			fp = nvfx_fragprog_translate(nvfx, nvfx->dummy_fs, FALSE);
+			emulate_sprite_flipping = FALSE;
+
+			if(!fp)
+			{
+				_debug_printf("Error: unable to compile even a dummy fragment shader: aborting.");
+				abort();
+			}
 		}
-		update = TRUE;
 
-		fp->prog_size = (fp->insn_len * 4 + 63) & ~63;
+		pfp->fps[key] = fp;
+	}
+
+	vp = nvfx->render_mode == HW ? nvfx->vertprog : nvfx->swtnl.vertprog;
 
-		if(fp->prog_size >= min_size)
-			fp->progs_per_bo = 1;
+	if (fp->last_vp_id != vp->id || fp->last_sprite_coord_enable != sprite_coord_enable) {
+		int sprite_real_input = -1;
+		int sprite_reloc_input;
+		unsigned i;
+		fp->last_vp_id = vp->id;
+		fp->last_sprite_coord_enable = sprite_coord_enable;
+
+		if(sprite_coord_enable)
+		{
+			sprite_real_input = vp->sprite_fp_input;
+			if(sprite_real_input < 0)
+			{
+				unsigned used_texcoords = 0;
+				for(unsigned i = 0; i < fp->num_slots; ++i) {
+					unsigned generic = fp->slot_to_generic[i];
+					if(!((1 << generic) & sprite_coord_enable))
+					{
+						unsigned char slot_mask = vp->generic_to_fp_input[generic];
+						if(slot_mask >= 0xf0)
+							used_texcoords |= 1 << ((slot_mask & 0xf) - NVFX_FP_OP_INPUT_SRC_TC0);
+					}
+				}
+
+				sprite_real_input = NVFX_FP_OP_INPUT_SRC_TC(__builtin_ctz(~used_texcoords));
+			}
+
+			fp->point_sprite_control |= (1 << (sprite_real_input - NVFX_FP_OP_INPUT_SRC_TC0 + 8));
+		}
 		else
-			fp->progs_per_bo = min_size / fp->prog_size;
-		fp->bo_prog_idx = fp->progs_per_bo - 1;
-	}
+			fp->point_sprite_control = 0;
 
-	/* we must update constants even on "just" fragprog changes, because
-	   we don't check whether the current constant buffer matches the latest
-	   one bound to this fragment program */
-	if (nvfx->dirty & (NVFX_NEW_FRAGCONST | NVFX_NEW_FRAGPROG))
-		update = TRUE;
+		if(emulate_sprite_flipping)
+		   sprite_reloc_input = 0;
+		else
+		   sprite_reloc_input = sprite_real_input;
 
-	if(update) {
+		for(i = 0; i < fp->num_slots; ++i) {
+			unsigned generic = fp->slot_to_generic[i];
+			if((1 << generic) & sprite_coord_enable)
+			{
+				if(fp->slot_to_fp_input[i] != sprite_reloc_input)
+					goto update_slots;
+			}
+			else
+			{
+				unsigned char slot_mask = vp->generic_to_fp_input[generic];
+				if((slot_mask >> 4) & (slot_mask ^ fp->slot_to_fp_input[i]))
+					goto update_slots;
+			}
+		}
+
+		if(emulate_sprite_flipping)
+		{
+			if(fp->slot_to_fp_input[fp->num_slots] != sprite_real_input)
+				goto update_slots;
+		}
+
+		if(0)
+		{
+update_slots:
+			/* optimization: we start updating from the slot we found the first difference in */
+			for(; i < fp->num_slots; ++i)
+			{
+				unsigned generic = fp->slot_to_generic[i];
+				if((1 << generic) & sprite_coord_enable)
+					fp->slot_to_fp_input[i] = sprite_reloc_input;
+				else
+					fp->slot_to_fp_input[i] = vp->generic_to_fp_input[generic] & 0xf;
+			}
+
+			fp->slot_to_fp_input[fp->num_slots] = sprite_real_input;
+
+			if(nvfx->is_nv4x)
+			{
+				fp->or = 0;
+				for(i = 0; i <= fp->num_slots; ++i) {
+					unsigned fp_input = fp->slot_to_fp_input[i];
+					if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(8))
+						fp->or |= (1 << 12);
+					else if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(9))
+						fp->or |= (1 << 13);
+					else if(fp_input >= NVFX_FP_OP_INPUT_SRC_TC(0) && fp_input <= NVFX_FP_OP_INPUT_SRC_TC(7))
+						fp->or |= (1 << (fp_input - NVFX_FP_OP_INPUT_SRC_TC0 + 14));
+				}
+			}
+
+			fp->progs_left_with_obsolete_slot_assignments = fp->progs;
+			goto update;
+		}
+	}
+
+	/* We must update constants even on "just" fragprog changes, because
+	  * we don't check whether the current constant buffer matches the latest
+	  * one bound to this fragment program.
+	  * Doing such a check would likely be a pessimization.
+	  */
+	if ((nvfx->hw_fragprog != fp) || (nvfx->dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST))) {
 		int offset;
+		uint32_t* fpmap;
 
+update:
 		++fp->bo_prog_idx;
 		if(fp->bo_prog_idx >= fp->progs_per_bo)
 		{
@@ -892,10 +1342,12 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 			}
 			else
 			{
-				struct nvfx_fragment_program_bo* fpbo = os_malloc_aligned(sizeof(struct nvfx_fragment_program) + fp->prog_size * fp->progs_per_bo, 16);
-				char *map, *buf;
-				int i;
+				struct nvfx_fragment_program_bo* fpbo = os_malloc_aligned(sizeof(struct nvfx_fragment_program) + (fp->prog_size + 8) * fp->progs_per_bo, 16);
+				uint8_t* map;
+				uint8_t* buf;
 
+				fpbo->slots = (unsigned char*)&fpbo->insn[(fp->prog_size) * fp->progs_per_bo];
+				memset(fpbo->slots, 0, 8 * fp->progs_per_bo);
 				if(fp->fpbo)
 				{
 					fpbo->next = fp->fpbo->next;
@@ -905,12 +1357,14 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 					fpbo->next = fpbo;
 				fp->fpbo = fpbo;
 				fpbo->bo = 0;
+				fp->progs += fp->progs_per_bo;
+				fp->progs_left_with_obsolete_slot_assignments += fp->progs_per_bo;
 				nouveau_bo_new(nvfx->screen->base.device, NOUVEAU_BO_VRAM | NOUVEAU_BO_MAP, 64, fp->prog_size * fp->progs_per_bo, &fpbo->bo);
 				nouveau_bo_map(fpbo->bo, NOUVEAU_BO_NOSYNC);
 
 				map = fpbo->bo->map;
-				buf = fpbo->insn;
-				for(i = 0; i < fp->progs_per_bo; ++i)
+				buf = (uint8_t*)fpbo->insn;
+				for(unsigned i = 0; i < fp->progs_per_bo; ++i)
 				{
 					memcpy(buf, fp->insn, fp->insn_len * 4);
 					nvfx_fp_memcpy(map, fp->insn, fp->insn_len * 4);
@@ -922,13 +1376,11 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 		}
 
 		offset = fp->bo_prog_idx * fp->prog_size;
+		fpmap = (uint32_t*)((char*)fp->fpbo->bo->map + offset);
 
 		if(nvfx->constbuf[PIPE_SHADER_FRAGMENT]) {
 			struct pipe_resource* constbuf = nvfx->constbuf[PIPE_SHADER_FRAGMENT];
-			// TODO: avoid using transfers, just directly the buffer
-			struct pipe_transfer* transfer;
-			// TODO: does this check make any sense, or should we do this unconditionally?
-			uint32_t* map = pipe_buffer_map(&nvfx->pipe, constbuf, PIPE_TRANSFER_READ, &transfer);
+			uint32_t* map = (uint32_t*)nvfx_buffer(constbuf)->data;
 			uint32_t* fpmap = (uint32_t*)((char*)fp->fpbo->bo->map + offset);
 			uint32_t* buf = (uint32_t*)((char*)fp->fpbo->insn + offset);
 			int i;
@@ -942,12 +1394,61 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 					nvfx_fp_memcpy(&fpmap[off], &map[idx], 4 * sizeof(uint32_t));
 				}
 			}
-			pipe_buffer_unmap(&nvfx->pipe, constbuf, transfer);
 		}
-	}
 
-	if(update || (nvfx->dirty & NVFX_NEW_FRAGPROG)) {
-		int offset = fp->bo_prog_idx * fp->prog_size;
+		/* we only do this if we aren't sure that all program versions have the
+		 * current slot assignments, otherwise we just update constants for speed
+		 */
+		if(fp->progs_left_with_obsolete_slot_assignments) {
+			unsigned char* fpbo_slots = &fp->fpbo->slots[fp->bo_prog_idx * 8];
+			/* also relocate sprite coord slot, if any */
+			for(unsigned i = 0; i <= fp->num_slots; ++i) {
+				unsigned value = fp->slot_to_fp_input[i];;
+				if(value != fpbo_slots[i]) {
+					unsigned* p;
+					unsigned* begin = (unsigned*)fp->slot_relocations[i].data;
+					unsigned* end = (unsigned*)((char*)fp->slot_relocations[i].data + fp->slot_relocations[i].size);
+					//printf("fp %p reloc slot %u/%u: %u -> %u\n", fp, i, fp->num_slots, fpbo_slots[i], value);
+					if(value == 0)
+					{
+						/* was relocated to an input, switch type to temporary */
+						for(p = begin; p != end; ++p) {
+							unsigned off = *p;
+							unsigned dw = fp->insn[off];
+							dw &=~ NVFX_FP_REG_TYPE_MASK;
+							//printf("reloc_tmp at %x\n", off);
+							nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw));
+						}
+					} else {
+						if(!fpbo_slots[i])
+						{
+							/* was relocated to a temporary, switch type to input */
+							for(p= begin; p != end; ++p) {
+								unsigned off = *p;
+								unsigned dw = fp->insn[off];
+								//printf("reloc_in at %x\n", off);
+								dw |= NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT;
+								nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw));
+							}
+						}
+
+						/* set the correct input index */
+						for(p = begin; p != end; ++p) {
+							unsigned off = *p & ~3;
+							unsigned dw = fp->insn[off];
+							//printf("reloc&~3 at %x\n", off);
+							dw = (dw & ~NVFX_FP_OP_INPUT_SRC_MASK) | (value << NVFX_FP_OP_INPUT_SRC_SHIFT);
+							nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw));
+						}
+					}
+					fpbo_slots[i] = value;
+				}
+			}
+			--fp->progs_left_with_obsolete_slot_assignments;
+		}
+
+		nvfx->hw_fragprog = fp;
+
 		MARK_RING(chan, 8, 1);
 		OUT_RING(chan, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1));
 		OUT_RELOC(chan, fp->fpbo->bo, offset, NOUVEAU_BO_VRAM |
@@ -963,13 +1464,26 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 			OUT_RING(chan, fp->samplers);
 		}
 	}
+
+	{
+		unsigned pointsprite_control = fp->point_sprite_control | nvfx->rasterizer->pipe.point_quad_rasterization;
+		if(pointsprite_control != nvfx->hw_pointsprite_control)
+		{
+			WAIT_RING(chan, 2);
+			OUT_RING(chan, RING_3D(NV34TCL_POINT_SPRITE, 1));
+			OUT_RING(chan, pointsprite_control);
+			nvfx->hw_pointsprite_control = pointsprite_control;
+		}
+	}
+
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAGPROG;
 }
 
 void
 nvfx_fragprog_relocate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
-	struct nvfx_fragment_program *fp = nvfx->fragprog;
+	struct nvfx_fragment_program *fp = nvfx->hw_fragprog;
 	struct nouveau_bo* bo = fp->fpbo->bo;
 	int offset = fp->bo_prog_idx * fp->prog_size;
 	unsigned fp_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD; // TODO: GART?
@@ -979,12 +1493,14 @@ nvfx_fragprog_relocate(struct nvfx_context *nvfx)
 	OUT_RELOC(chan, bo, offset, fp_flags | NOUVEAU_BO_LOW |
 		      NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0,
 		      NV34TCL_FP_ACTIVE_PROGRAM_DMA1);
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAGPROG;
 }
 
 void
 nvfx_fragprog_destroy(struct nvfx_context *nvfx,
 		      struct nvfx_fragment_program *fp)
 {
+	unsigned i;
 	struct nvfx_fragment_program_bo* fpbo = fp->fpbo;
 	if(fpbo)
 	{
@@ -999,7 +1515,60 @@ nvfx_fragprog_destroy(struct nvfx_context *nvfx,
 		while(fpbo != fp->fpbo);
 	}
 
+	for(i = 0; i < Elements(fp->slot_relocations); ++i)
+		util_dynarray_fini(&fp->slot_relocations[i]);
+
 	if (fp->insn_len)
 		FREE(fp->insn);
 }
 
+static void *
+nvfx_fp_state_create(struct pipe_context *pipe,
+                     const struct pipe_shader_state *cso)
+{
+        /* Returns the new fragment shader CSO, or NULL if it cannot be allocated. */
+        struct nvfx_pipe_fragment_program *pfp = CALLOC(1, sizeof(struct nvfx_pipe_fragment_program));
+        if (!pfp)
+                return NULL;
+        /* Duplicate the caller's tokens, then scan the copy into pfp->info. */
+        pfp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+        tgsi_scan_shader(pfp->pipe.tokens, &pfp->info);
+        return (void *)pfp;
+}
+
+static void
+nvfx_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+        struct nvfx_context *nvfx = nvfx_context(pipe);
+        /* Store the shader as current and raise NVFX_NEW_FRAGPROG in the dirty mask. */
+        nvfx->fragprog = hwcso;
+        nvfx->dirty |= NVFX_NEW_FRAGPROG;
+}
+
+static void
+nvfx_fp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_pipe_fragment_program *shader = hwcso;
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	unsigned variant;
+
+	/* destroy and free every non-NULL entry of shader->fps */
+	for(variant = 0; variant < Elements(shader->fps); ++variant)
+	{
+		if(!shader->fps[variant])
+			continue;
+		nvfx_fragprog_destroy(nvfx, shader->fps[variant]);
+		FREE(shader->fps[variant]);
+	}
+
+        FREE((void*)shader->pipe.tokens);
+        FREE(shader);
+}
+
+void
+nvfx_init_fragprog_functions(struct nvfx_context *nvfx)
+{
+        nvfx->pipe.create_fs_state = nvfx_fp_state_create;
+        nvfx->pipe.bind_fs_state = nvfx_fp_state_bind;
+        nvfx->pipe.delete_fs_state = nvfx_fp_state_delete;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_fragtex.c b/src/gallium/drivers/nvfx/nvfx_fragtex.c
index 0b4a434fec..6503c7afcb 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragtex.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragtex.c
@@ -1,5 +1,177 @@
 #include "nvfx_context.h"
 #include "nvfx_resource.h"
+#include "nvfx_tex.h"
+
+static void *
+nvfx_sampler_state_create(struct pipe_context *pipe,
+			  const struct pipe_sampler_state *cso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_sampler_state *ps = MALLOC(sizeof(struct nvfx_sampler_state));
+
+	if (!ps) /* allocation failed: return NULL instead of writing through it */
+		return NULL;
+	/* on nv30, we use this as an internal flag */
+	ps->fmt = cso->normalized_coords ? 0 : NV40TCL_TEX_FORMAT_RECT;
+	ps->en = 0;
+	ps->filt = nvfx_tex_filter(cso) | 0x2000; /*voodoo*/
+	ps->wrap = (nvfx_tex_wrap_mode(cso->wrap_s) << NV34TCL_TX_WRAP_S_SHIFT) |
+		    (nvfx_tex_wrap_mode(cso->wrap_t) << NV34TCL_TX_WRAP_T_SHIFT) |
+		    (nvfx_tex_wrap_mode(cso->wrap_r) << NV34TCL_TX_WRAP_R_SHIFT);
+	ps->compare = FALSE;
+	/* for R_TO_TEXTURE comparison, OR the compare function into the wrap word */
+	if(cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
+	{
+		ps->wrap |= nvfx_tex_wrap_compare_mode(cso->compare_func);
+		ps->compare = TRUE;
+	}
+	ps->bcol = nvfx_tex_border_color(cso->border_color);
+
+	if(nvfx->is_nv4x)
+		nv40_sampler_state_init(pipe, ps, cso);
+	else
+		nv30_sampler_state_init(pipe, ps, cso);
+
+	return (void *)ps;
+}
+
+static void
+nvfx_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso); /* pipe is unused; the CSO is released with a single FREE */
+}
+
+static void
+nvfx_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	const unsigned prev_nr = nvfx->nr_samplers;
+	unsigned slot;
+
+	/* Walk every slot that is either passed in now or was bound before:
+	 * slots below nr take the caller's state, slots in [nr, prev_nr)
+	 * are reset to NULL, and each visited slot is marked dirty.
+	 */
+	for (slot = 0; slot < nr || slot < prev_nr; ++slot) {
+		nvfx->tex_sampler[slot] = (slot < nr) ? sampler[slot] : NULL;
+		nvfx->dirty_samplers |= (1 << slot);
+	}
+
+	nvfx->nr_samplers = nr;
+	nvfx->dirty |= NVFX_NEW_SAMPLER;
+}
+
+static struct pipe_sampler_view *
+nvfx_create_sampler_view(struct pipe_context *pipe,
+			 struct pipe_resource *pt,
+			 const struct pipe_sampler_view *templ)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_sampler_view *sv = CALLOC_STRUCT(nvfx_sampler_view);
+	struct nvfx_texture_format *tf = &nvfx_texture_formats[templ->format];
+	unsigned txf;
+
+	if (!sv)
+		return NULL;
+
+	sv->base = *templ;
+	sv->base.reference.count = 1;
+	sv->base.texture = NULL;
+	pipe_resource_reference(&sv->base.texture, pt);
+	sv->base.context = pipe;
+	/* start the format word with NO_BORDER, then OR in the target's DIMS/CUBIC bits */
+	txf = NV34TCL_TX_FORMAT_NO_BORDER;
+
+	switch (pt->target) {
+	case PIPE_TEXTURE_CUBE:
+		txf |= NV34TCL_TX_FORMAT_CUBIC;
+		/* fall-through */
+	case PIPE_TEXTURE_2D:
+	case PIPE_TEXTURE_RECT:
+		txf |= NV34TCL_TX_FORMAT_DIMS_2D;
+		break;
+	case PIPE_TEXTURE_3D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_3D;
+		break;
+	case PIPE_TEXTURE_1D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_1D;
+		break;
+	default:
+		assert(0);
+	}
+	sv->u.init_fmt = txf;
+	/* swizzle word: tf->src[] fills the S0 fields and tf->comp[] the S1 fields, indexed by the view's swizzles */
+	sv->swizzle = 0
+			| (tf->src[sv->base.swizzle_r] << NV34TCL_TX_SWIZZLE_S0_Z_SHIFT)
+			| (tf->src[sv->base.swizzle_g] << NV34TCL_TX_SWIZZLE_S0_Y_SHIFT)
+			| (tf->src[sv->base.swizzle_b] << NV34TCL_TX_SWIZZLE_S0_X_SHIFT)
+			| (tf->src[sv->base.swizzle_a] << NV34TCL_TX_SWIZZLE_S0_W_SHIFT)
+			| (tf->comp[sv->base.swizzle_r] << NV34TCL_TX_SWIZZLE_S1_Z_SHIFT)
+			| (tf->comp[sv->base.swizzle_g] << NV34TCL_TX_SWIZZLE_S1_Y_SHIFT)
+			| (tf->comp[sv->base.swizzle_b] << NV34TCL_TX_SWIZZLE_S1_X_SHIFT)
+			| (tf->comp[sv->base.swizzle_a] << NV34TCL_TX_SWIZZLE_S1_W_SHIFT);
+
+	sv->filt = tf->sign;
+	sv->wrap = tf->wrap;
+	sv->wrap_mask = ~0;
+	/* cube maps use offset 0 and the level-0 size; other targets start at first_level */
+	if (pt->target == PIPE_TEXTURE_CUBE)
+	{
+		sv->offset = 0;
+		sv->npot_size = (pt->width0 << NV34TCL_TX_NPOT_SIZE_W_SHIFT) | pt->height0;
+	}
+	else
+	{
+		sv->offset = nvfx_subresource_offset(pt, 0, sv->base.first_level, 0);
+		sv->npot_size = (u_minify(pt->width0, sv->base.first_level) << NV34TCL_TX_NPOT_SIZE_W_SHIFT) | u_minify(pt->height0, sv->base.first_level);
+
+		/* apparently, we need to ignore the t coordinate for 1D textures to fix piglit tex1d-2dborder */
+		if(pt->target == PIPE_TEXTURE_1D)
+		{
+			sv->wrap_mask &=~ NV34TCL_TX_WRAP_T_MASK;
+			sv->wrap |= NV34TCL_TX_WRAP_T_REPEAT;
+		}
+	}
+	/* finish with the chip-specific initialisation */
+	if(nvfx->is_nv4x)
+		nv40_sampler_view_init(pipe, sv);
+	else
+		nv30_sampler_view_init(pipe, sv);
+
+	return &sv->base;
+}
+
+static void
+nvfx_sampler_view_destroy(struct pipe_context *pipe,
+			  struct pipe_sampler_view *view)
+{
+	pipe_resource_reference(&view->texture, NULL); /* reset the view's texture pointer to NULL before freeing */
+	FREE(view);
+}
+
+static void
+nvfx_set_fragment_sampler_views(struct pipe_context *pipe,
+				unsigned nr,
+				struct pipe_sampler_view **views)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	const unsigned prev_nr = nvfx->nr_textures;
+	unsigned slot;
+
+	/* Cover both the incoming views and any previously bound tail:
+	 * slots below nr are pointed at the caller's view, while slots in
+	 * [nr, prev_nr) are reset to NULL; every visited slot is marked dirty.
+	 */
+	for (slot = 0; slot < nr || slot < prev_nr; ++slot) {
+		struct pipe_sampler_view *incoming = (slot < nr) ? views[slot] : NULL;
+
+		pipe_sampler_view_reference(&nvfx->fragment_sampler_views[slot], incoming);
+		nvfx->dirty_samplers |= (1 << slot);
+	}
+
+	nvfx->nr_textures = nr;
+	nvfx->dirty |= NVFX_NEW_SAMPLER;
+}
 
 void
 nvfx_fragtex_validate(struct nvfx_context *nvfx)
@@ -16,6 +188,10 @@ nvfx_fragtex_validate(struct nvfx_context *nvfx)
 		samplers &= ~(1 << unit);
 
 		if(nvfx->fragment_sampler_views[unit] && nvfx->tex_sampler[unit]) {
+			util_dirty_surfaces_use_for_sampling(&nvfx->pipe,
+					&((struct nvfx_miptree*)nvfx->fragment_sampler_views[unit]->texture)->dirty_surfaces,
+					nvfx_surface_flush);
+
 			if(!nvfx->is_nv4x)
 				nv30_fragtex_set(nvfx, unit);
 			else
@@ -29,6 +205,7 @@ nvfx_fragtex_validate(struct nvfx_context *nvfx)
 		}
 	}
 	nvfx->dirty_samplers = 0;
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAGTEX;
 }
 
 void
@@ -55,4 +232,128 @@ nvfx_fragtex_relocate(struct nvfx_context *nvfx)
 		OUT_RELOC(chan, bo, nvfx->hw_txf[unit], tex_flags | NOUVEAU_BO_OR | NOUVEAU_BO_DUMMY,
 				NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
 	}
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAGTEX;
+}
+
+void
+nvfx_init_sampling_functions(struct nvfx_context *nvfx)
+{
+	nvfx->pipe.create_sampler_state = nvfx_sampler_state_create;
+	nvfx->pipe.bind_fragment_sampler_states = nvfx_sampler_state_bind;
+	nvfx->pipe.delete_sampler_state = nvfx_sampler_state_delete;
+	nvfx->pipe.set_fragment_sampler_views = nvfx_set_fragment_sampler_views;
+	nvfx->pipe.create_sampler_view = nvfx_create_sampler_view;
+	nvfx->pipe.sampler_view_destroy = nvfx_sampler_view_destroy;
+}
+
+#define NV34TCL_TX_FORMAT_FORMAT_DXT1_RECT NV34TCL_TX_FORMAT_FORMAT_DXT1
+#define NV34TCL_TX_FORMAT_FORMAT_DXT3_RECT NV34TCL_TX_FORMAT_FORMAT_DXT3
+#define NV34TCL_TX_FORMAT_FORMAT_DXT5_RECT NV34TCL_TX_FORMAT_FORMAT_DXT5
+
+#define NV40TCL_TEX_FORMAT_FORMAT_HILO16 NV40TCL_TEX_FORMAT_FORMAT_A16L16
+
+#define NV34TCL_TX_FORMAT_FORMAT_RGBA16F 0x00004a00
+#define NV34TCL_TX_FORMAT_FORMAT_RGBA16F_RECT NV34TCL_TX_FORMAT_FORMAT_RGBA16F
+#define NV34TCL_TX_FORMAT_FORMAT_RGBA32F 0x00004b00
+#define NV34TCL_TX_FORMAT_FORMAT_RGBA32F_RECT NV34TCL_TX_FORMAT_FORMAT_RGBA32F
+#define NV34TCL_TX_FORMAT_FORMAT_R32F 0x00004c00
+#define NV34TCL_TX_FORMAT_FORMAT_R32F_RECT NV34TCL_TX_FORMAT_FORMAT_R32F
+
+// TODO: guess!
+#define NV40TCL_TEX_FORMAT_FORMAT_R32F 0x00001c00
+
+#define SRGB 0x00700000
+
+#define __(m,tf,tfc,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sign,wrap) \
+[PIPE_FORMAT_##m] = { \
+  {NV34TCL_TX_FORMAT_FORMAT_##tf, \
+  NV34TCL_TX_FORMAT_FORMAT_##tfc, \
+  NV34TCL_TX_FORMAT_FORMAT_##tf##_RECT, \
+  NV34TCL_TX_FORMAT_FORMAT_##tfc##_RECT, \
+  NV40TCL_TEX_FORMAT_FORMAT_##tf, \
+  NV40TCL_TEX_FORMAT_FORMAT_##tfc}, \
+  sign, wrap, \
+  {ts0z, ts0y, ts0x, ts0w, 0, 1}, {ts1z, ts1y, ts1x, ts1w, 0, 0} \
 }
+
+#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sign, wrap) \
+	__(m,tf,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sign, wrap)
+
+/* Depth formats work by reading the most significant 8/16 bits of the depth value.
+ * We are losing precision, but nVidia loses even more by using A8R8G8B8 instead of HILO16
+ * There is no 32-bit integer texture support, so other things are infeasible.
+ *
+ * TODO: is it possible to read 16 bits for Z16? A16 doesn't seem to work, either due to normalization or endianness issues
+ */
+
+#define T 2
+
+#define X 3
+#define Y 2
+#define Z 1
+#define W 0
+
+#define SNORM ((NV34TCL_TX_FILTER_SIGNED_RED) | (NV34TCL_TX_FILTER_SIGNED_GREEN) | (NV34TCL_TX_FILTER_SIGNED_BLUE) | (NV34TCL_TX_FILTER_SIGNED_ALPHA))
+#define UNORM 0
+
+struct nvfx_texture_format
+nvfx_texture_formats[PIPE_FORMAT_COUNT] = {
+	[0 ... PIPE_FORMAT_COUNT - 1] = {{-1, -1, -1, -1, -1, -1}},
+	_(B8G8R8X8_UNORM,	A8R8G8B8,	T, T, T, 1, X, Y, Z, W, UNORM, 0),
+	_(B8G8R8X8_SRGB,	A8R8G8B8,	T, T, T, 1, X, Y, Z, W, UNORM, SRGB),
+	_(B8G8R8A8_UNORM,	A8R8G8B8,	T, T, T, T, X, Y, Z, W, UNORM, 0),
+	_(B8G8R8A8_SRGB,	A8R8G8B8,	T, T, T, T, X, Y, Z, W, UNORM, SRGB),
+
+	_(R8G8B8A8_UNORM,	A8R8G8B8,	T, T, T, T, Z, Y, X, W, UNORM, 0),
+	_(R8G8B8A8_SRGB,	A8R8G8B8,	T, T, T, T, Z, Y, X, W, UNORM, SRGB),
+	_(R8G8B8X8_UNORM,	A8R8G8B8,	T, T, T, 1, Z, Y, X, W, UNORM, 0),
+
+	_(A8R8G8B8_UNORM,	A8R8G8B8,	T, T, T, T, W, Z, Y, X, UNORM, 0),
+	_(A8R8G8B8_SRGB,	A8R8G8B8,	T, T, T, T, W, Z, Y, X, UNORM, SRGB),
+	_(A8B8G8R8_UNORM,	A8R8G8B8,	T, T, T, T, W, X, Y, Z, UNORM, 0),
+	_(A8B8G8R8_SRGB,	A8R8G8B8,	T, T, T, T, W, X, Y, Z, UNORM, SRGB),
+	_(X8R8G8B8_UNORM,	A8R8G8B8,	T, T, T, 1, W, Z, Y, X, UNORM, 0),
+	_(X8R8G8B8_SRGB,	A8R8G8B8,	T, T, T, 1, W, Z, Y, X, UNORM, SRGB),
+
+	_(B5G5R5A1_UNORM,	A1R5G5B5, 	T, T, T, T, X, Y, Z, W, UNORM, 0),
+	_(B5G5R5X1_UNORM,	A1R5G5B5, 	T, T, T, 1, X, Y, Z, W, UNORM, 0),
+
+	_(B4G4R4A4_UNORM,	A4R4G4B4, 	T, T, T, T, X, Y, Z, W, UNORM, 0),
+	_(B4G4R4X4_UNORM,	A4R4G4B4, 	T, T, T, 1, X, Y, Z, W, UNORM, 0),
+
+	_(B5G6R5_UNORM,		R5G6B5, 	T, T, T, 1, X, Y, Z, W, UNORM, 0),
+
+	_(R8_UNORM,		L8,		T, 0, 0, 1, X, X, X, X, UNORM, 0),
+	_(R8_SNORM,		L8,		T, 0, 0, 1, X, X, X, X, SNORM, 0),
+	_(L8_UNORM,		L8,		T, T, T, 1, X, X, X, X, UNORM, 0),
+	_(L8_SRGB,		L8,		T, T, T, 1, X, X, X, X, UNORM, SRGB),
+	_(A8_UNORM,		L8, 		0, 0, 0, T, X, X, X, X, UNORM, 0),
+	_(I8_UNORM,		L8, 		T, T, T, T, X, X, X, X, UNORM, 0),
+
+	_(R8G8_UNORM,		A8L8, 		T, T, T, T, X, X, X, W, UNORM, 0),
+	_(R8G8_SNORM,		A8L8, 		T, T, T, T, X, X, X, W, SNORM, 0),
+	_(L8A8_UNORM,		A8L8, 		T, T, T, T, X, X, X, W, UNORM, 0),
+	_(L8A8_SRGB,		A8L8,		T, T, T, T, X, X, X, W, UNORM, SRGB),
+
+	_(DXT1_RGB,		DXT1,		T, T, T, 1, X, Y, Z, W, UNORM, 0),
+	_(DXT1_SRGB,		DXT1,		T, T, T, 1, X, Y, Z, W, UNORM, SRGB),
+	_(DXT1_RGBA,		DXT1,		T, T, T, T, X, Y, Z, W, UNORM, 0),
+	_(DXT1_SRGBA,		DXT1,		T, T, T, T, X, Y, Z, W, UNORM, SRGB),
+	_(DXT3_RGBA,		DXT3,		T, T, T, T, X, Y, Z, W, UNORM, 0),
+	_(DXT3_SRGBA,		DXT3,		T, T, T, T, X, Y, Z, W, UNORM, SRGB),
+	_(DXT5_RGBA,		DXT5,		T, T, T, T, X, Y, Z, W, UNORM, 0),
+	_(DXT5_SRGBA,		DXT5,		T, T, T, T, X, Y, Z, W, UNORM, SRGB),
+
+	__(Z16_UNORM,		A8L8, Z16,	T, T, T, 1, W, W, W, W, UNORM, 0),
+	__(S8_USCALED_Z24_UNORM,HILO16,Z24,	T, T, T, 1, W, W, W, W, UNORM, 0),
+	__(X8Z24_UNORM,		HILO16,Z24,	T, T, T, 1, W, W, W, W, UNORM, 0),
+
+	_(R16_UNORM,		A16,		T, 0, 0, 1, X, X, X, X, UNORM, 0),
+	_(R16_SNORM,		A16,		T, 0, 0, 1, X, X, X, X, SNORM, 0),
+	_(R16G16_UNORM,		HILO16,		T, T, 0, 1, X, Y, X, X, UNORM, 0),
+	_(R16G16_SNORM,		HILO16,		T, T, 0, 1, X, Y, X, X, SNORM, 0),
+
+	_(R16G16B16A16_FLOAT,		RGBA16F,	T, T, T, T, X, Y, Z, W, UNORM, 0),
+	_(R32G32B32A32_FLOAT,		RGBA32F,	T, T, T, T, X, Y, Z, W, UNORM, 0),
+	_(R32_FLOAT,		R32F,	T, 0, 0, 1, X, X, X, X, UNORM, 0)
+};
diff --git a/src/gallium/drivers/nvfx/nvfx_miptree.c b/src/gallium/drivers/nvfx/nvfx_miptree.c
index b5639bb464..0916aaa828 100644
--- a/src/gallium/drivers/nvfx/nvfx_miptree.c
+++ b/src/gallium/drivers/nvfx/nvfx_miptree.c
@@ -2,309 +2,220 @@
 #include "pipe/p_defines.h"
 #include "util/u_inlines.h"
 #include "util/u_format.h"
+#include "util/u_memory.h"
 #include "util/u_math.h"
-
-#include "nvfx_context.h"
+#include "util/u_staging.h"
+#include "state_tracker/drm_driver.h"
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_screen.h"
+#include "nvfx_screen.h"
 #include "nvfx_resource.h"
-#include "nvfx_transfer.h"
-#include "nv04_surface_2d.h"
-
-/* Currently using separate implementations for buffers and textures,
- * even though gallium has a unified abstraction of these objects.
- * Eventually these should be combined, and mechanisms like transfers
- * be adapted to work for both buffer and texture uploads.
- */
 
 static void
-nvfx_miptree_layout(struct nvfx_miptree *mt)
+nvfx_miptree_choose_format(struct nvfx_miptree *mt)
 {
 	struct pipe_resource *pt = &mt->base.base;
-	uint width = pt->width0;
-	uint offset = 0;
-	int nr_faces, l, f;
-	uint wide_pitch = pt->bind & (PIPE_BIND_SAMPLER_VIEW |
-				      PIPE_BIND_DEPTH_STENCIL |
-				      PIPE_BIND_RENDER_TARGET |
-				      PIPE_BIND_DISPLAY_TARGET |
-				      PIPE_BIND_SCANOUT);
-
-	if (pt->target == PIPE_TEXTURE_CUBE) {
-		nr_faces = 6;
-	} else
-	if (pt->target == PIPE_TEXTURE_3D) {
-		nr_faces = pt->depth0;
-	} else {
-		nr_faces = 1;
+	unsigned uniform_pitch = 0;
+	static int no_swizzle = -1;
+	if(no_swizzle < 0)
+		no_swizzle = debug_get_bool_option("NV40_NO_SWIZZLE", FALSE); /* this will break things on nv30 */
+
+	if (!util_is_power_of_two(pt->width0) ||
+	    !util_is_power_of_two(pt->height0) ||
+	    !util_is_power_of_two(pt->depth0) ||
+	    (!nvfx_screen(pt->screen)->is_nv4x && pt->target == PIPE_TEXTURE_RECT)
+	    )
+		uniform_pitch = 1;
+
+	if (
+		(pt->bind & (PIPE_BIND_SCANOUT | PIPE_BIND_DISPLAY_TARGET))
+		|| (pt->usage & PIPE_USAGE_DYNAMIC) || (pt->usage & PIPE_USAGE_STAGING)
+		|| util_format_is_compressed(pt->format)
+		|| no_swizzle
+	)
+		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+
+	/* non compressed formats with uniform pitch must be linear, and vice versa */
+	if(!util_format_is_s3tc(pt->format)
+		&& (uniform_pitch || mt->base.base.flags & NVFX_RESOURCE_FLAG_LINEAR))
+	{
+		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+		uniform_pitch = 1;
 	}
 
-	for (l = 0; l <= pt->last_level; l++) {
-		if (wide_pitch && (pt->flags & NVFX_RESOURCE_FLAG_LINEAR))
-			mt->level[l].pitch = align(util_format_get_stride(pt->format, pt->width0), 64);
-		else
-			mt->level[l].pitch = util_format_get_stride(pt->format, width);
+	if(uniform_pitch)
+	{
+		mt->linear_pitch = util_format_get_stride(pt->format, pt->width0);
 
-		mt->level[l].image_offset =
-			CALLOC(nr_faces, sizeof(unsigned));
+		// TODO: this is only a constraint for rendering and not sampling, apparently
+		// we may also want this unconditionally
+		if(pt->bind & (PIPE_BIND_SAMPLER_VIEW |
+			PIPE_BIND_DEPTH_STENCIL |
+			PIPE_BIND_RENDER_TARGET |
+			PIPE_BIND_DISPLAY_TARGET |
+			PIPE_BIND_SCANOUT))
+			mt->linear_pitch = align(mt->linear_pitch, 64);
+	}
+	else
+		mt->linear_pitch = 0;
+}
+
+static unsigned
+nvfx_miptree_layout(struct nvfx_miptree *mt)
+{
+	struct pipe_resource* pt = &mt->base.base;
+        uint offset = 0;
 
-		width  = u_minify(width, 1);
+	if(!nvfx_screen(pt->screen)->is_nv4x)
+	{
+		assert(pt->target == PIPE_TEXTURE_RECT
+			|| (util_is_power_of_two(pt->width0) && util_is_power_of_two(pt->height0)));
 	}
 
-	for (f = 0; f < nr_faces; f++) {
-		for (l = 0; l < pt->last_level; l++) {
-			mt->level[l].image_offset[f] = offset;
+	for (unsigned l = 0; l <= pt->last_level; l++)
+	{
+		unsigned size;
+		mt->level_offset[l] = offset;
 
-			if (!(pt->flags & NVFX_RESOURCE_FLAG_LINEAR) &&
-			    u_minify(pt->width0, l + 1) > 1 && u_minify(pt->height0, l + 1) > 1)
-				offset += align(mt->level[l].pitch * u_minify(pt->height0, l), 64);
-			else
-				offset += mt->level[l].pitch * u_minify(pt->height0, l);
-		}
+		if(mt->linear_pitch)
+			size = mt->linear_pitch;
+		else
+			size = util_format_get_stride(pt->format, u_minify(pt->width0, l));
+		size = util_format_get_2d_size(pt->format, size, u_minify(pt->height0, l));
 
-		mt->level[l].image_offset[f] = offset;
-		offset += mt->level[l].pitch * u_minify(pt->height0, l);
+		if(pt->target == PIPE_TEXTURE_3D)
+			size *= u_minify(pt->depth0, l);
+
+		offset += size;
 	}
 
-	mt->total_size = offset;
+	offset = align(offset, 128);
+	mt->face_size = offset;
+	if(mt->base.base.target == PIPE_TEXTURE_CUBE)
+		offset += 5 * mt->face_size;
+	return offset;
 }
 
-static boolean
-nvfx_miptree_get_handle(struct pipe_screen *pscreen,
-			struct pipe_resource *ptexture,
-			struct winsys_handle *whandle)
+static void
+nvfx_miptree_surface_final_destroy(struct pipe_surface* ps)
 {
-	struct nvfx_miptree* mt = (struct nvfx_miptree*)ptexture;
-
-	if (!mt || !mt->base.bo)
-		return FALSE;
-
-	return nouveau_screen_bo_get_handle(pscreen,
-					    mt->base.bo,
-					    mt->level[0].pitch,
-					    whandle);
+	struct nvfx_surface* ns = (struct nvfx_surface*)ps;
+	pipe_resource_reference(&ps->texture, 0);
+	pipe_resource_reference((struct pipe_resource**)&ns->temp, 0);
+	FREE(ps);
 }
 
-
-static void
+void
 nvfx_miptree_destroy(struct pipe_screen *screen, struct pipe_resource *pt)
 {
 	struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
-	int l;
-
+	util_surfaces_destroy(&mt->surfaces, pt, nvfx_miptree_surface_final_destroy);
 	nouveau_screen_bo_release(screen, mt->base.bo);
-
-	for (l = 0; l <= pt->last_level; l++) {
-		if (mt->level[l].image_offset)
-			FREE(mt->level[l].image_offset);
-	}
-
 	FREE(mt);
 }
 
-
-
-
-struct u_resource_vtbl nvfx_miptree_vtbl = 
+static struct nvfx_miptree*
+nvfx_miptree_create_skeleton(struct pipe_screen *pscreen, const struct pipe_resource *pt)
 {
-   nvfx_miptree_get_handle,	      /* get_handle */
-   nvfx_miptree_destroy,	      /* resource_destroy */
-   NULL,			      /* is_resource_referenced */
-   nvfx_miptree_transfer_new,	      /* get_transfer */
-   nvfx_miptree_transfer_del,     /* transfer_destroy */
-   nvfx_miptree_transfer_map,	      /* transfer_map */
-   u_default_transfer_flush_region,   /* transfer_flush_region */
-   nvfx_miptree_transfer_unmap,	      /* transfer_unmap */
-   u_default_transfer_inline_write    /* transfer_inline_write */
-};
+        struct nvfx_miptree *mt;
 
+        if(pt->width0 > 4096 || pt->height0 > 4096)
+                return NULL;
 
+        mt = CALLOC_STRUCT(nvfx_miptree);
+        if (!mt)
+                return NULL;
 
-struct pipe_resource *
-nvfx_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *pt)
-{
-	struct nvfx_miptree *mt;
-	static int no_swizzle = -1;
-	if(no_swizzle < 0)
-		no_swizzle = debug_get_bool_option("NOUVEAU_NO_SWIZZLE", FALSE);
-
-	mt = CALLOC_STRUCT(nvfx_miptree);
-	if (!mt)
-		return NULL;
-
-	mt->base.base = *pt;
-	mt->base.vtbl = &nvfx_miptree_vtbl;
-	pipe_reference_init(&mt->base.base.reference, 1);
-	mt->base.base.screen = pscreen;
+        mt->base.base = *pt;
+        util_dirty_surfaces_init(&mt->dirty_surfaces);
 
-	/* Swizzled textures must be POT */
-	if (pt->width0 & (pt->width0 - 1) ||
-	    pt->height0 & (pt->height0 - 1))
-		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
-	else
-	if (pt->bind & (PIPE_BIND_SCANOUT |
-			PIPE_BIND_DISPLAY_TARGET |
-			PIPE_BIND_DEPTH_STENCIL))
-		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
-	else
-	if (pt->usage == PIPE_USAGE_DYNAMIC)
-		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
-	else {
-		switch (pt->format) {
-		case PIPE_FORMAT_B5G6R5_UNORM:
-		case PIPE_FORMAT_L8A8_UNORM:
-		case PIPE_FORMAT_A8_UNORM:
-		case PIPE_FORMAT_L8_UNORM:
-		case PIPE_FORMAT_I8_UNORM:
-			/* TODO: we can actually swizzle these formats on nv40, we
-				are just preserving the pre-unification behavior.
-				The whole 2D code is going to be rewritten anyway. */
-			if(nvfx_screen(pscreen)->is_nv4x) {
-				mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
-				break;
-			}
-		/* TODO: Figure out which formats can be swizzled */
-		case PIPE_FORMAT_B8G8R8A8_UNORM:
-		case PIPE_FORMAT_B8G8R8X8_UNORM:
-		case PIPE_FORMAT_R16_SNORM:
-		{
-			if (no_swizzle)
-				mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
-			break;
-		}
-		default:
-			mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
-		}
-	}
+        pipe_reference_init(&mt->base.base.reference, 1);
+        mt->base.base.screen = pscreen;
 
-	/* apparently we can't render to swizzled surfaces smaller than 64 bytes, so make them linear.
-	 * If the user did not ask for a render target, they can still render to it, but it will cost them an extra copy.
-	 * This also happens for small mipmaps of large textures. */
-	if (pt->bind & PIPE_BIND_RENDER_TARGET &&
-	    util_format_get_stride(pt->format, pt->width0) < 64)
-		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+        // set this to the actual capabilities, we use it to decide whether to use the 3D engine for copies
+        // TODO: is this the correct way to use Gallium?
+        mt->base.base.bind = pt->bind | PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_DEPTH_STENCIL;
 
-	nvfx_miptree_layout(mt);
+        // on our current driver (and the driver too), format support does not depend on geometry, so don't bother computing it
+        // TODO: may want to revisit this
+        if(!pscreen->is_format_supported(pscreen, pt->format, pt->target, 0, PIPE_BIND_RENDER_TARGET, 0))
+                mt->base.base.bind &=~ PIPE_BIND_RENDER_TARGET;
+        if(!pscreen->is_format_supported(pscreen, pt->format, pt->target, 0, PIPE_BIND_SAMPLER_VIEW, 0))
+                mt->base.base.bind &=~ PIPE_BIND_SAMPLER_VIEW;
+        if(!pscreen->is_format_supported(pscreen, pt->format, pt->target, 0, PIPE_BIND_DEPTH_STENCIL, 0))
+                mt->base.base.bind &=~ PIPE_BIND_DEPTH_STENCIL;
 
-	mt->base.bo = nouveau_screen_bo_new(pscreen, 256,
-            pt->usage, pt->bind, mt->total_size);
-	if (!mt->base.bo) {
-		FREE(mt);
-		return NULL;
-	}
-	return &mt->base.base;
+        return mt;
 }
 
 
-
-
 struct pipe_resource *
-nvfx_miptree_from_handle(struct pipe_screen *pscreen,
-			 const struct pipe_resource *template,
-			 struct winsys_handle *whandle)
+nvfx_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *pt)
 {
-	struct nvfx_miptree *mt;
-	unsigned stride;
+	struct nvfx_miptree* mt = nvfx_miptree_create_skeleton(pscreen, pt);
+	if (!mt) return NULL; /* skeleton creation failed (oversized texture or out of memory) */
+	nvfx_miptree_choose_format(mt);
 
-	/* Only supports 2D, non-mipmapped textures for the moment */
-	if (template->target != PIPE_TEXTURE_2D ||
-	    template->last_level != 0 ||
-	    template->depth0 != 1)
-		return NULL;
+	unsigned size = nvfx_miptree_layout(mt); /* total bytes to allocate, as computed by the layout pass */
 
-	mt = CALLOC_STRUCT(nvfx_miptree);
-	if (!mt)
-		return NULL;
+	mt->base.bo = nouveau_screen_bo_new(pscreen, 256, pt->usage, pt->bind, size);
 
-	mt->base.bo = nouveau_screen_bo_from_handle(pscreen, whandle, &stride);
-	if (mt->base.bo == NULL) {
+	if (!mt->base.bo) {
 		FREE(mt);
 		return NULL;
 	}
-
-	mt->base.base = *template;
-	mt->base.vtbl = &nvfx_miptree_vtbl;
-	pipe_reference_init(&mt->base.base.reference, 1);
-	mt->base.base.screen = pscreen;
-	mt->level[0].pitch = stride;
-	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned));
-
-	/* Assume whoever created this buffer expects it to be linear for now */
-	mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
-
-	/* XXX: Need to adjust bo refcount??
-	 */
-	/* nouveau_bo_ref(bo, &mt->base.bo); */
 	return &mt->base.base;
 }
 
+// TODO: redo this, just calling miptree_layout
+struct pipe_resource *
+nvfx_miptree_from_handle(struct pipe_screen *pscreen, const struct pipe_resource *template, struct winsys_handle *whandle)
+{
+        struct nvfx_miptree* mt = nvfx_miptree_create_skeleton(pscreen, template);
+        unsigned stride; /* out-parameter of nouveau_screen_bo_from_handle; not read afterwards */
+        if(!mt) return NULL; /* skeleton creation failed (oversized texture or out of memory) */
+        if(whandle->stride) { /* a stride supplied with the handle forces a linear layout */
+		mt->linear_pitch = whandle->stride;
+		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+        } else { nvfx_miptree_choose_format(mt); }
 
+        nvfx_miptree_layout(mt);
 
+        mt->base.bo = nouveau_screen_bo_from_handle(pscreen, whandle, &stride);
+        if (mt->base.bo == NULL) {
+                FREE(mt);
+                return NULL;
+        }
+        return &mt->base.base;
+}
 
-
-/* Surface helpers, not strictly required to implement the resource vtbl:
- */
 struct pipe_surface *
 nvfx_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_resource *pt,
 			 unsigned face, unsigned level, unsigned zslice,
 			 unsigned flags)
 {
-	struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
-	struct nv04_surface *ns;
-
-	ns = CALLOC_STRUCT(nv04_surface);
-	if (!ns)
-		return NULL;
-	pipe_resource_reference(&ns->base.texture, pt);
-	ns->base.format = pt->format;
-	ns->base.width = u_minify(pt->width0, level);
-	ns->base.height = u_minify(pt->height0, level);
-	ns->base.usage = flags;
-	pipe_reference_init(&ns->base.reference, 1);
-	ns->base.face = face;
-	ns->base.level = level;
-	ns->base.zslice = zslice;
-	ns->pitch = mt->level[level].pitch;
-
-	if (pt->target == PIPE_TEXTURE_CUBE) {
-		ns->base.offset = mt->level[level].image_offset[face];
-	} else
-	if (pt->target == PIPE_TEXTURE_3D) {
-		ns->base.offset = mt->level[level].image_offset[zslice];
-	} else {
-		ns->base.offset = mt->level[level].image_offset[0];
-	}
-
-	/* create a linear temporary that we can render into if
-	 * necessary.
-	 *
-	 * Note that ns->pitch is always a multiple of 64 for linear
-	 * surfaces and swizzled surfaces are POT, so ns->pitch & 63
-	 * is equivalent to (ns->pitch < 64 && swizzled)
-	 */
-
-	if ((ns->pitch & 63) && 
-	    (ns->base.usage & PIPE_BIND_RENDER_TARGET))
-	{
-		struct nv04_surface_2d* eng2d  =
-			((struct nvfx_screen*)pscreen)->eng2d;
-
-		ns = nv04_surface_wrap_for_render(pscreen, eng2d, ns);
+	struct nvfx_miptree* mt = (struct nvfx_miptree*)pt;
+	struct nvfx_surface *ns;
+
+	ns = (struct nvfx_surface*)util_surfaces_get(&mt->surfaces, sizeof(struct nvfx_surface), pscreen, pt, face, level, zslice, flags);
+	if(ns->base.base.offset == ~0) {
+		util_dirty_surface_init(&ns->base);
+		ns->pitch = nvfx_subresource_pitch(pt, level);
+		ns->base.base.offset = nvfx_subresource_offset(pt, face, level, zslice);
 	}
 
-	return &ns->base;
+	return &ns->base.base;
 }
 
 void
 nvfx_miptree_surface_del(struct pipe_surface *ps)
 {
-	struct nv04_surface* ns = (struct nv04_surface*)ps;
-	if(ns->backing)
+	struct nvfx_surface* ns = (struct nvfx_surface*)ps;
+
+	if(!ns->temp)
 	{
-		struct nvfx_screen* screen = (struct nvfx_screen*)ps->texture->screen;
-		if(1 /*ns->backing->base.usage & PIPE_BIND_BLIT_DESTINATION*/)
-			screen->eng2d->copy(screen->eng2d, &ns->backing->base, 0, 0, ps, 0, 0, ns->base.width, ns->base.height);
-		nvfx_miptree_surface_del(&ns->backing->base);
+		util_surfaces_detach(&((struct nvfx_miptree*)ps->texture)->surfaces, ps);
+		pipe_resource_reference(&ps->texture, 0);
+		FREE(ps);
 	}
-
-	pipe_resource_reference(&ps->texture, NULL);
-	FREE(ps);
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_push.c b/src/gallium/drivers/nvfx/nvfx_push.c
new file mode 100644
index 0000000000..ffe7e98357
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_push.c
@@ -0,0 +1,414 @@
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_split_prim.h"
+#include "translate/translate.h"
+
+#include "nvfx_context.h"
+#include "nvfx_resource.h"
+
+struct push_context { /* state handed (as priv) to the util_split_prim callbacks below */
+	struct nouveau_channel* chan; /* channel the callbacks write their commands to */
+
+	void *idxbuf; /* index data read by emit_elt* and emit_vertices_lookup*; NULL when unused */
+	int32_t idxbias; /* index bias applied by the emit_elt* callbacks */
+
+	float edgeflag;
+	int edgeflag_attr; /* attribute index; nvfx_push_vbo treats values >= 16 as "no edgeflag attribute" */
+
+	unsigned vertex_length; /* multiplied by the vertex count to size a VERTEX_DATA packet */
+	unsigned max_vertices_per_packet; /* upper bound on vertices per VERTEX_DATA packet */
+
+	struct translate* translate; /* its run*() callbacks write vertex data at chan->cur */
+};
+
+/* util_split_prim edge callback: write EDGEFLAG_ENABLE as 1 or 0 */
+static void emit_edgeflag(void *priv, boolean enabled)
+{
+	struct nouveau_channel *chan = ((struct push_context *)priv)->chan;
+	const unsigned flag = enabled ? 1 : 0;
+
+	OUT_RING(chan, RING_3D(NV34TCL_EDGEFLAG_ENABLE, 1));
+	OUT_RING(chan, flag);
+}
+
+/* split-prim emit callback: push vertices fetched through 8-bit indices */
+static void emit_vertices_lookup8(void *priv, unsigned start, unsigned count)
+{
+	struct push_context *pc = priv;
+	struct translate *xlate = pc->translate;
+	uint8_t *idx = (uint8_t *)pc->idxbuf + start;
+	unsigned batch;
+
+	/* one VERTEX_DATA packet per batch of at most max_vertices_per_packet */
+	for (; count; count -= batch, idx += batch) {
+		unsigned packet_len;
+
+		batch = MIN2(count, pc->max_vertices_per_packet);
+		packet_len = batch * pc->vertex_length;
+		OUT_RING(pc->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, packet_len));
+		xlate->run_elts8(xlate, idx, batch, 0, pc->chan->cur);
+		pc->chan->cur += packet_len;
+	}
+}
+
+/* split-prim emit callback: push vertices fetched through 16-bit indices */
+static void emit_vertices_lookup16(void *priv, unsigned start, unsigned count)
+{
+	struct push_context *pc = priv;
+	struct translate *xlate = pc->translate;
+	uint16_t *idx = (uint16_t *)pc->idxbuf + start;
+	unsigned batch;
+
+	/* one VERTEX_DATA packet per batch of at most max_vertices_per_packet */
+	for (; count; count -= batch, idx += batch) {
+		unsigned packet_len;
+
+		batch = MIN2(count, pc->max_vertices_per_packet);
+		packet_len = batch * pc->vertex_length;
+		OUT_RING(pc->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, packet_len));
+		xlate->run_elts16(xlate, idx, batch, 0, pc->chan->cur);
+		pc->chan->cur += packet_len;
+	}
+}
+
+/* split-prim emit callback: push vertices fetched through 32-bit indices */
+static void emit_vertices_lookup32(void *priv, unsigned start, unsigned count)
+{
+	struct push_context *pc = priv;
+	struct translate *xlate = pc->translate;
+	uint32_t *idx = (uint32_t *)pc->idxbuf + start;
+	unsigned batch;
+
+	/* one VERTEX_DATA packet per batch of at most max_vertices_per_packet */
+	for (; count; count -= batch, idx += batch) {
+		unsigned packet_len;
+
+		batch = MIN2(count, pc->max_vertices_per_packet);
+		packet_len = batch * pc->vertex_length;
+		OUT_RING(pc->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, packet_len));
+		xlate->run_elts(xlate, idx, batch, 0, pc->chan->cur);
+		pc->chan->cur += packet_len;
+	}
+}
+
+/* split-prim emit callback: push `count` consecutive vertices from `start` */
+static void emit_vertices(void *priv, unsigned start, unsigned count)
+{
+	struct push_context *pc = priv;
+	struct translate *xlate = pc->translate;
+	unsigned batch;
+
+	/* one VERTEX_DATA packet per batch of at most max_vertices_per_packet */
+	for (; count; count -= batch, start += batch) {
+		unsigned packet_len;
+
+		batch = MIN2(count, pc->max_vertices_per_packet);
+		packet_len = batch * pc->vertex_length;
+		OUT_RING(pc->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, packet_len));
+		xlate->run(xlate, start, batch, 0, pc->chan->cur);
+		pc->chan->cur += packet_len;
+	}
+}
+
+/* Emit `vc` vertices from `start` to method `reg`, one word per range:
+ * ((count - 1) << 24) | first; the vc % 256 remainder goes out first. */
+static void emit_ranges(void* priv, unsigned start, unsigned vc, unsigned reg)
+{
+	struct push_context* ctx = priv;
+	struct nouveau_channel *chan = ctx->chan;
+	unsigned nr = (vc & 0xff);
+	if (nr) {
+		OUT_RING(chan, RING_3D(reg, 1));
+		OUT_RING  (chan, ((nr - 1) << 24) | start);
+		start += nr;
+	}
+
+	nr = vc >> 8;
+	while (nr) {
+		unsigned push = nr > 2047 ? 2047 : nr; /* range words in this packet */
+		nr -= push;
+		OUT_RING(chan, RING_3D_NI(reg, push));
+		while (push--) {
+			/* unsigned constant: 255 << 24 does not fit in a signed int */
+			OUT_RING(chan, (0xffu << 24) | start);
+			start += 0x100;
+		}
+	}
+}
+
+/* split-prim emit callback: vertex ranges sent through VB_INDEX_BATCH */
+static void emit_ib_ranges(void* priv, unsigned start, unsigned vc)
+{
+	emit_ranges(priv, start, vc, NV34TCL_VB_INDEX_BATCH);
+}
+
+/* split-prim emit callback: vertex ranges sent through VB_VERTEX_BATCH */
+static void emit_vb_ranges(void* priv, unsigned start, unsigned vc)
+{
+	emit_ranges(priv, start, vc, NV34TCL_VB_VERTEX_BATCH);
+}
+
+/* split-prim emit callback: push `vc` 8-bit indices (each plus idxbias) inline */
+static INLINE void emit_elt8(void* priv, unsigned start, unsigned vc)
+{
+	struct push_context* ctx = priv;
+	struct nouveau_channel *chan = ctx->chan;
+	uint8_t *elts = (uint8_t *)ctx->idxbuf + start;
+	int idxbias = ctx->idxbias;
+
+	if (vc & 1) { /* odd count: send one index on its own so the rest pair up */
+		OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1));
+		OUT_RING  (chan, elts[0] + idxbias); /* the bias applies to this index too */
+		elts++; vc--;
+	}
+
+	while (vc) {
+		unsigned i;
+		unsigned push = MIN2(vc, 2047 * 2); /* two indices per data word */
+
+		OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1));
+		for (i = 0; i < push; i+=2)
+			OUT_RING(chan, ((elts[i+1] + idxbias) << 16) | (elts[i] + idxbias));
+
+		vc -= push;
+		elts += push;
+	}
+}
+
+/* split-prim emit callback: push `vc` 16-bit indices (each plus idxbias) inline */
+static INLINE void emit_elt16(void* priv, unsigned start, unsigned vc)
+{
+	struct push_context* ctx = priv;
+	struct nouveau_channel *chan = ctx->chan;
+	uint16_t *elts = (uint16_t *)ctx->idxbuf + start;
+	int idxbias = ctx->idxbias;
+
+	if (vc & 1) { /* odd count: send one index on its own so the rest pair up */
+		OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1));
+		OUT_RING  (chan, elts[0] + idxbias); /* the bias applies to this index too */
+		elts++; vc--;
+	}
+
+	while (vc) {
+		unsigned i;
+		unsigned push = MIN2(vc, 2047 * 2); /* two indices per data word */
+
+		OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1));
+		for (i = 0; i < push; i+=2)
+			OUT_RING(chan, ((elts[i+1] + idxbias) << 16) | (elts[i] + idxbias));
+
+		vc -= push;
+		elts += push;
+	}
+}
+
+/* split-prim emit callback: push `vc` 32-bit indices inline, at most 2047 per
+ * packet; a non-zero idxbias is added to every index on the way out */
+static INLINE void emit_elt32(void* priv, unsigned start, unsigned vc)
+{
+	struct push_context* pc = priv;
+	struct nouveau_channel *chan = pc->chan;
+	const int bias = pc->idxbias;
+	uint32_t *src = (uint32_t *)pc->idxbuf + start;
+	unsigned batch;
+
+	for (; vc; vc -= batch, src += batch) {
+		batch = MIN2(vc, 2047);
+
+		OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U32, batch));
+		assert(AVAIL_RING(chan) >= batch);
+		if (!bias) {
+			/* no bias: hand the whole batch to OUT_RINGp */
+			OUT_RINGp(chan, src, batch);
+			continue;
+		}
+
+		for (unsigned i = 0; i < batch; ++i)
+			OUT_RING(chan, src[i] + bias);
+	}
+}
+
+void
+nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
+{ /* draw `info` by writing vertex/index commands into the channel, once per instance */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct push_context ctx;
+	struct util_split_prim s;
+	unsigned instances_left = info->instance_count;
+	int vtx_value; /* per-vertex ring cost; encoding is described in the batching loop below */
+	unsigned hw_mode = nvgl_primitive(info->mode);
+	int i;
+	struct
+	{
+		uint8_t* map; /* current source element of this per-instance attribute */
+		unsigned step; /* counts instances until map advances (compared with instance_divisor) */
+	} per_instance[16];
+	unsigned p_overhead = 64 /* magic fix */
+			+ 4 /* begin/end */
+			+ 4; /* potential edgeflag enable/disable */
+
+	ctx.chan = nvfx->screen->base.channel;
+	ctx.translate = nvfx->vtxelt->translate;
+	ctx.idxbuf = NULL;
+	ctx.vertex_length = nvfx->vtxelt->vertex_length;
+	ctx.max_vertices_per_packet = nvfx->vtxelt->max_vertices_per_packet;
+	ctx.edgeflag = 0.5f;
+	// TODO: figure out if we really want to handle this, and do so in that case
+	ctx.edgeflag_attr = 0xff; // nvfx->vertprog->cfg.edgeflag_in;
+
+	if(!nvfx->use_vertex_buffers)
+	{
+		for(i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; ++i)
+		{
+			struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i];
+			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index];
+			uint8_t* data = nvfx_buffer(vb->buffer)->data + vb->buffer_offset;
+			if(info->indexed)
+				data += info->index_bias * vb->stride;
+			ctx.translate->set_buffer(ctx.translate, i, data, vb->stride, ~0);
+		}
+
+		if(ctx.edgeflag_attr < 16)
+			vtx_value = -(ctx.vertex_length + 3);  /* vertex data and edgeflag header and value */
+		else
+		{
+			p_overhead += 1; /* initial vertex_data header */
+			vtx_value = -ctx.vertex_length;  /* vertex data only, no per-vertex edgeflag words */
+		}
+
+		if (info->indexed) {
+			// XXX: this case is broken and probably needs a new VTX_ATTR push path
+			if (nvfx->idxbuf.index_size == 1)
+				s.emit = emit_vertices_lookup8;
+			else if (nvfx->idxbuf.index_size == 2)
+				s.emit = emit_vertices_lookup16;
+			else
+				s.emit = emit_vertices_lookup32;
+		} else
+			s.emit = emit_vertices;
+	}
+	else
+	{
+		if(!info->indexed || nvfx->use_index_buffer)
+		{
+			s.emit = info->indexed ? emit_ib_ranges : emit_vb_ranges;
+			p_overhead += 3;
+			vtx_value = 0;
+		}
+		else if (nvfx->idxbuf.index_size == 4)
+		{
+			s.emit = emit_elt32;
+			p_overhead += 1;
+			vtx_value = 8;
+		}
+		else
+		{
+			s.emit = (nvfx->idxbuf.index_size == 2) ? emit_elt16 : emit_elt8;
+			p_overhead += 3;
+			vtx_value = 7;
+		}
+	}
+
+	ctx.idxbias = info->index_bias;
+	if(nvfx->use_vertex_buffers)
+		ctx.idxbias -= nvfx->base_vertex;
+
+	/* map index buffer, if present */
+	if (info->indexed && !nvfx->use_index_buffer)
+		ctx.idxbuf = nvfx_buffer(nvfx->idxbuf.buffer)->data + nvfx->idxbuf.offset;
+
+	s.priv = &ctx;
+	s.edge = emit_edgeflag;
+
+	for (i = 0; i < nvfx->vtxelt->num_per_instance; ++i)
+	{
+		struct nvfx_per_instance_element *ve = &nvfx->vtxelt->per_instance[i];
+		struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->base.vertex_buffer_index];
+		float v[4];
+		per_instance[i].step = info->start_instance % ve->instance_divisor; /* NOTE(review): map below is not advanced for start_instance / divisor; presumably divisor != 0 here - confirm */
+		per_instance[i].map = nvfx_buffer(vb->buffer)->data + vb->buffer_offset + ve->base.src_offset;
+
+		nvfx->vtxelt->per_instance[i].base.fetch_rgba_float(v, per_instance[i].map, 0, 0);
+
+		WAIT_RING(chan, 5);
+		nvfx_emit_vtx_attr(chan, nvfx->vtxelt->per_instance[i].base.idx, v, nvfx->vtxelt->per_instance[i].base.ncomp);
+	}
+
+	/* per-instance loop */
+	while (instances_left--) {
+		int max_verts;
+		boolean done;
+
+		util_split_prim_init(&s, info->mode, info->start, info->count);
+		nvfx_state_emit(nvfx);
+		for(;;) {
+			max_verts  = AVAIL_RING(chan);
+			max_verts -= p_overhead;
+
+			/* if vtx_value < 0, each vertex is -vtx_value words long
+			 * otherwise, each vertex is 2^(vtx_value) / 255 words long (this is an approximation)
+			 */
+			if(vtx_value < 0)
+			{
+				max_verts /= -vtx_value;
+				max_verts -= (max_verts >> 10); /* vertex data headers */
+			}
+			else
+			{
+				if(max_verts >= (1 << 23)) /* avoid overflow here */
+					max_verts = (1 << 23);
+				max_verts = (max_verts * 255) >> vtx_value;
+			}
+
+			//printf("avail %u max_verts %u\n", AVAIL_RING(chan), max_verts);
+
+			if(max_verts >= 16)
+			{
+				/* XXX: any command a lot of times seems to (mostly) fix corruption that would otherwise happen */
+				/* this seems to cause issues on nv3x, and also be unneeded there */
+				if(nvfx->is_nv4x)
+				{
+					int i; /* shadows the outer i */
+					for(i = 0; i < 32; ++i)
+					{
+						OUT_RING(chan, RING_3D(0x1dac, 1));
+						OUT_RING(chan, 0);
+					}
+				}
+
+				OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+				OUT_RING(chan, hw_mode);
+				done = util_split_prim_next(&s, max_verts);
+				OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+				OUT_RING(chan, 0);
+
+				if(done)
+					break;
+			}
+
+			FIRE_RING(chan); /* reached when the primitive is not finished or fewer than 16 vertices fit */
+			nvfx_state_emit(nvfx);
+		}
+
+		/* set data for the next instance, if any changed */
+		for (i = 0; i < nvfx->vtxelt->num_per_instance; ++i)
+		{
+			struct nvfx_per_instance_element *ve = &nvfx->vtxelt->per_instance[i];
+			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->base.vertex_buffer_index];
+
+			if(++per_instance[i].step == ve->instance_divisor)
+			{
+				float v[4];
+				per_instance[i].map += vb->stride;
+				per_instance[i].step = 0;
+
+				nvfx->vtxelt->per_instance[i].base.fetch_rgba_float(v, per_instance[i].map, 0, 0);
+				WAIT_RING(chan, 5);
+				nvfx_emit_vtx_attr(chan, nvfx->vtxelt->per_instance[i].base.idx, v, nvfx->vtxelt->per_instance[i].base.ncomp);
+			}
+		}
+	}
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_resource.c b/src/gallium/drivers/nvfx/nvfx_resource.c
index 10cdeed2a3..39ae893f1b 100644
--- a/src/gallium/drivers/nvfx/nvfx_resource.c
+++ b/src/gallium/drivers/nvfx/nvfx_resource.c
@@ -1,23 +1,15 @@
 
 #include "pipe/p_context.h"
+#include "util/u_staging.h"
 #include "nvfx_resource.h"
 #include "nouveau/nouveau_screen.h"
 
-
-/* This doesn't look quite right - this query is supposed to ask
- * whether the particular context has references to the resource in
- * any unflushed rendering command buffer, and hence requires a
- * pipe->flush() for serializing some modification to that resource.
- *
- * This seems to be answering the question of whether the resource is
- * currently on hardware.
- */
 static unsigned int
 nvfx_resource_is_referenced(struct pipe_context *pipe,
-			    struct pipe_resource *resource,
+			    struct pipe_resource *pr,
 			    unsigned face, unsigned level)
 {
-	return nouveau_reference_flags(nvfx_resource(resource)->bo);
+	return !!nouveau_reference_flags(nvfx_resource(pr)->bo);
 }
 
 static struct pipe_resource *
@@ -30,6 +22,15 @@ nvfx_resource_create(struct pipe_screen *screen,
 		return nvfx_miptree_create(screen, template);
 }
 
+/* pipe_screen::resource_destroy hook: dispatch on target to the buffer or miptree destructor */
+static void nvfx_resource_destroy(struct pipe_screen *screen, struct pipe_resource *pr)
+{
+	if (pr->target == PIPE_BUFFER)
+		nvfx_buffer_destroy(screen, pr);
+	else
+		nvfx_miptree_destroy(screen, pr);
+}
+
 static struct pipe_resource *
 nvfx_resource_from_handle(struct pipe_screen * screen,
 			  const struct pipe_resource *template,
@@ -41,15 +42,22 @@ nvfx_resource_from_handle(struct pipe_screen * screen,
 		return nvfx_miptree_from_handle(screen, template, whandle);
 }
 
+/* pipe_screen::resource_get_handle hook: forward the bo and the level-0 pitch to
+ * nouveau_screen_bo_get_handle; FALSE when there is no resource or no bo */
+static boolean
+nvfx_resource_get_handle(struct pipe_screen *pscreen, struct pipe_resource *pr,
+                        struct winsys_handle *whandle)
+{
+	struct nouveau_bo *bo = pr ? ((struct nvfx_resource*)pr)->bo : NULL;
+
+	if (bo)
+		return nouveau_screen_bo_get_handle(pscreen, bo, nvfx_subresource_pitch(pr, 0), whandle);
+	return FALSE;
+}
+
 void
 nvfx_init_resource_functions(struct pipe_context *pipe)
 {
-	pipe->get_transfer = u_get_transfer_vtbl;
-	pipe->transfer_map = u_transfer_map_vtbl;
-	pipe->transfer_flush_region = u_transfer_flush_region_vtbl;
-	pipe->transfer_unmap = u_transfer_unmap_vtbl;
-	pipe->transfer_destroy = u_transfer_destroy_vtbl;
-	pipe->transfer_inline_write = u_transfer_inline_write_vtbl;
 	pipe->is_resource_referenced = nvfx_resource_is_referenced;
 }
 
@@ -58,10 +66,10 @@ nvfx_screen_init_resource_functions(struct pipe_screen *pscreen)
 {
 	pscreen->resource_create = nvfx_resource_create;
 	pscreen->resource_from_handle = nvfx_resource_from_handle;
-	pscreen->resource_get_handle = u_resource_get_handle_vtbl;
-	pscreen->resource_destroy = u_resource_destroy_vtbl;
+	pscreen->resource_get_handle = nvfx_resource_get_handle;
+	pscreen->resource_destroy = nvfx_resource_destroy;
 	pscreen->user_buffer_create = nvfx_user_buffer_create;
-   
+
 	pscreen->get_tex_surface = nvfx_miptree_surface_new;
 	pscreen->tex_surface_destroy = nvfx_miptree_surface_del;
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_resource.h b/src/gallium/drivers/nvfx/nvfx_resource.h
index a68c14cf3f..583be4de2a 100644
--- a/src/gallium/drivers/nvfx/nvfx_resource.h
+++ b/src/gallium/drivers/nvfx/nvfx_resource.h
@@ -1,44 +1,82 @@
-
 #ifndef NVFX_RESOURCE_H
 #define NVFX_RESOURCE_H
 
 #include "util/u_transfer.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_double_list.h"
+#include "util/u_surfaces.h"
+#include "util/u_dirty_surfaces.h"
+#include <nouveau/nouveau_bo.h>
 
 struct pipe_resource;
-struct nouveau_bo;
-
+struct nv04_region;
 
-/* This gets further specialized into either buffer or texture
- * structures.  In the future we'll want to remove much of that
- * distinction, but for now try to keep as close to the existing code
- * as possible and use the vtbl struct to choose between the two
- * underlying implementations.
- */
 struct nvfx_resource {
 	struct pipe_resource base;
-	struct u_resource_vtbl *vtbl;
 	struct nouveau_bo *bo;
 };
 
+static INLINE
+struct nvfx_resource *nvfx_resource(struct pipe_resource *resource)
+{
+	return (struct nvfx_resource *)resource;
+}
+
+#define NVFX_RESOURCE_FLAG_LINEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
+#define NVFX_RESOURCE_FLAG_USER (PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
+
+/* is resource mapped into the GPU's address space (i.e. VRAM or GART) ? */
+static INLINE boolean
+nvfx_resource_mapped_by_gpu(struct pipe_resource *resource)
+{
+   return nvfx_resource(resource)->bo->handle != 0; /* compare explicitly: boolean may be narrower than the handle */
+}
+
+/* is resource in VRAM? */
+static inline int
+nvfx_resource_on_gpu(struct pipe_resource* pr)
+{
+#if 0
+	// a compiler error here means you need to apply libdrm-nouveau-add-domain.patch to libdrm
+	// TODO: return FALSE if not VRAM and on a PCI-E system
+	return ((struct nvfx_resource*)pr)->bo->domain & (NOUVEAU_BO_VRAM | NOUVEAU_BO_GART);
+#else
+	return TRUE;
+#endif
+}
+
 #define NVFX_MAX_TEXTURE_LEVELS  16
 
+/* We have the following invariants for render temporaries
+ *
+ * 1. Render temporaries are always linear
+ * 2. Render temporaries are always up to date
+ * 3. Currently, render temporaries are destroyed when the resource is used for sampling, but kept for any other use
+ *
+ * Also, we do NOT flush temporaries on any pipe->flush().
+ * This is fine, as long as scanout targets and shared resources never need temps.
+ *
+ * TODO: we may want to also support swizzled temporaries to improve performance in some cases.
+ */
+
 struct nvfx_miptree {
-	struct nvfx_resource base;
-	uint total_size;
+        struct nvfx_resource base;
 
-	struct {
-		uint pitch;
-		uint *image_offset;
-	} level[NVFX_MAX_TEXTURE_LEVELS];
+        unsigned linear_pitch; /* for linear textures, 0 for swizzled and compressed textures with level-dependent minimal pitch */
+        unsigned face_size; /* 128-byte aligned face/total size */
+        unsigned level_offset[NVFX_MAX_TEXTURE_LEVELS];
 
-	unsigned image_nr;
+        struct util_surfaces surfaces;
+        struct util_dirty_surfaces dirty_surfaces;
 };
 
-static INLINE 
-struct nvfx_resource *nvfx_resource(struct pipe_resource *resource)
-{
-	return (struct nvfx_resource *)resource;
-}
+struct nvfx_surface {
+	struct util_dirty_surface base;
+	unsigned pitch;
+
+	struct nvfx_miptree* temp;
+};
 
 static INLINE struct nouveau_bo *
 nvfx_surface_buffer(struct pipe_surface *surf)
@@ -48,6 +86,12 @@ nvfx_surface_buffer(struct pipe_surface *surf)
 	return mt->bo;
 }
 
+static INLINE struct util_dirty_surfaces*
+nvfx_surface_get_dirty_surfaces(struct pipe_surface* surf)
+{
+	struct nvfx_miptree *mt = (struct nvfx_miptree *)surf->texture;
+	return &mt->dirty_surfaces;
+}
 
 void
 nvfx_init_resource_functions(struct pipe_context *pipe);
@@ -62,30 +106,118 @@ nvfx_screen_init_resource_functions(struct pipe_screen *pscreen);
 struct pipe_resource *
 nvfx_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *pt);
 
+void
+nvfx_miptree_destroy(struct pipe_screen *pscreen,
+                     struct pipe_resource *presource);
+
 struct pipe_resource *
 nvfx_miptree_from_handle(struct pipe_screen *pscreen,
 			 const struct pipe_resource *template,
 			 struct winsys_handle *whandle);
 
+void
+nvfx_miptree_surface_del(struct pipe_surface *ps);
+
+struct pipe_surface *
+nvfx_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_resource *pt,
+			 unsigned face, unsigned level, unsigned zslice,
+			 unsigned flags);
+
+/* only for miptrees, don't use for buffers */
+
+/* Offset of (face, level, zslice) inside the miptree; for swizzled 3D textures this is just the offset of the mipmap level */
+static INLINE unsigned
+nvfx_subresource_offset(struct pipe_resource* pt, unsigned face, unsigned level, unsigned zslice)
+{
+	if(pt->target == PIPE_BUFFER)
+		return 0;
+	else
+	{
+		struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
+
+		unsigned offset = mt->level_offset[level];
+		if (pt->target == PIPE_TEXTURE_CUBE)
+			offset += mt->face_size * face;
+		else if (pt->target == PIPE_TEXTURE_3D && mt->linear_pitch) /* linear 3D: skip zslice whole 2D slices */
+			offset += zslice * util_format_get_2d_size(pt->format, mt->linear_pitch, u_minify(pt->height0, level));
+		return offset;
+	}
+}
+
+/* Pitch of mipmap level `level`: the bo size for buffers, the fixed linear
+ * pitch when the miptree has one, otherwise the format stride of the
+ * minified width. */
+static inline unsigned
+nvfx_subresource_pitch(struct pipe_resource* pt, unsigned level)
+{
+	unsigned linear;
+
+	if (pt->target == PIPE_BUFFER)
+		return ((struct nvfx_resource*)pt)->bo->size;
+
+	linear = ((struct nvfx_miptree *)pt)->linear_pitch;
+	return linear ? linear
+	              : util_format_get_stride(pt->format, u_minify(pt->width0, level));
+}
+
+void
+nvfx_surface_create_temp(struct pipe_context* pipe, struct pipe_surface* surf);
+
+void
+nvfx_surface_flush(struct pipe_context* pipe, struct pipe_surface* surf);
+
+struct nvfx_buffer
+{
+	struct nvfx_resource base;
+	uint8_t* data;
+	unsigned size;
+
+	/* the range of data not yet uploaded to the GPU bo */
+	unsigned dirty_begin;
+	unsigned dirty_end;
+
+	/* whether all transfers were unsynchronized */
+	boolean dirty_unsynchronized;
+
+	/* whether it would have been profitable to upload
+	 * the latest updated data to the GPU immediately */
+	boolean last_update_static;
+
+	/* how many bytes we need to draw before we deem
+	 * the buffer to be static
+	 */
+	long long bytes_to_draw_until_static;
+};
+
+static inline struct nvfx_buffer* nvfx_buffer(struct pipe_resource* pr)
+{
+	return (struct nvfx_buffer*)pr;
+}
+
+/* heuristic: TRUE when uploading the buffer to the GPU looks better than
+ * continuing to push it on the FIFO */
+static inline boolean nvfx_buffer_seems_static(struct nvfx_buffer* buffer)
+{
+	if (buffer->last_update_static)
+		return TRUE;
+	return buffer->bytes_to_draw_until_static < 0;
+}
+
 struct pipe_resource *
 nvfx_buffer_create(struct pipe_screen *pscreen,
 		   const struct pipe_resource *template);
 
+void
+nvfx_buffer_destroy(struct pipe_screen *pscreen,
+                    struct pipe_resource *presource);
+
 struct pipe_resource *
 nvfx_user_buffer_create(struct pipe_screen *screen,
 			void *ptr,
 			unsigned bytes,
 			unsigned usage);
 
-
-
 void
-nvfx_miptree_surface_del(struct pipe_surface *ps);
-
-struct pipe_surface *
-nvfx_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_resource *pt,
-			 unsigned face, unsigned level, unsigned zslice,
-			 unsigned flags);
-
+nvfx_buffer_upload(struct nvfx_buffer* buffer);
 
 #endif
diff --git a/src/gallium/drivers/nvfx/nvfx_screen.c b/src/gallium/drivers/nvfx/nvfx_screen.c
index f2525ccb38..65ca265d45 100644
--- a/src/gallium/drivers/nvfx/nvfx_screen.c
+++ b/src/gallium/drivers/nvfx/nvfx_screen.c
@@ -8,23 +8,12 @@
 #include "nvfx_context.h"
 #include "nvfx_screen.h"
 #include "nvfx_resource.h"
+#include "nvfx_tex.h"
 
 #define NV30TCL_CHIPSET_3X_MASK 0x00000003
 #define NV34TCL_CHIPSET_3X_MASK 0x00000010
 #define NV35TCL_CHIPSET_3X_MASK 0x000001e0
 
-/* FIXME: It seems I should not include directly ../../winsys/drm/nouveau/drm/nouveau_drm_api.h
-* to get the pointer to the context front buffer, so I copied nouveau_winsys here.
-* nv30_screen_surface_format_supported() can then use it to enforce creating fbo
-* with same number of bits everywhere.
-*/
-struct nouveau_winsys {
-	struct pipe_winsys base;
-
-	struct pipe_screen *pscreen;
-
-	struct pipe_surface *front;
-};
 #define NV4X_GRCLASS4097_CHIPSETS 0x00000baf
 #define NV4X_GRCLASS4497_CHIPSETS 0x00005450
 #define NV6X_GRCLASS4497_CHIPSETS 0x00000088
@@ -43,7 +32,7 @@ nvfx_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TWO_SIDED_STENCIL:
 		return 1;
 	case PIPE_CAP_GLSL:
-		return 0;
+		return 1;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
@@ -162,77 +151,74 @@ nvfx_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_cap param)
 }
 
 static boolean
-nvfx_screen_surface_format_supported(struct pipe_screen *pscreen,
+nvfx_screen_is_format_supported(struct pipe_screen *pscreen,
 				     enum pipe_format format,
 				     enum pipe_texture_target target,
 				     unsigned sample_count,
-				     unsigned tex_usage, unsigned geom_flags)
+				     unsigned bind, unsigned geom_flags)
 {
 	struct nvfx_screen *screen = nvfx_screen(pscreen);
-	struct pipe_surface *front = ((struct nouveau_winsys *) pscreen->winsys)->front;
 
 	 if (sample_count > 1)
 		return FALSE;
 
-	if (tex_usage & PIPE_BIND_RENDER_TARGET) {
+	if (bind & PIPE_BIND_RENDER_TARGET) {
 		switch (format) {
 		case PIPE_FORMAT_B8G8R8A8_UNORM:
 		case PIPE_FORMAT_B8G8R8X8_UNORM:
 		case PIPE_FORMAT_B5G6R5_UNORM:
-			return TRUE;
-		default:
 			break;
+		default:
+			return FALSE;
 		}
-	} else
-	if (tex_usage & PIPE_BIND_DEPTH_STENCIL) {
+	}
+
+	if (bind & PIPE_BIND_DEPTH_STENCIL) {
 		switch (format) {
 		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
 		case PIPE_FORMAT_X8Z24_UNORM:
-			return TRUE;
 		case PIPE_FORMAT_Z16_UNORM:
-			/* TODO: this nv30 limitation probably does not exist */
-			if (!screen->is_nv4x && front)
-				return (front->format == PIPE_FORMAT_B5G6R5_UNORM);
-			return TRUE;
-		default:
 			break;
+		default:
+			return FALSE;
 		}
-	} else {
-		switch (format) {
-		if (tex_usage & PIPE_BIND_SAMPLER_VIEW) {
-			switch (format) {
-			case PIPE_FORMAT_DXT1_RGB:
-			case PIPE_FORMAT_DXT1_RGBA:
-			case PIPE_FORMAT_DXT3_RGBA:
-			case PIPE_FORMAT_DXT5_RGBA:
-				return util_format_s3tc_enabled;
-			default:
-				break;
-			}
+	}
+
+	if (bind & PIPE_BIND_SAMPLER_VIEW) {
+		struct nvfx_texture_format* tf = &nvfx_texture_formats[format];
+		if(util_format_is_s3tc(format) && !util_format_s3tc_enabled)
+			return FALSE;
+
+		if(screen->is_nv4x)
+		{
+			if(tf->fmt[4] < 0)
+				return FALSE;
 		}
-		case PIPE_FORMAT_B8G8R8A8_UNORM:
-		case PIPE_FORMAT_B8G8R8X8_UNORM:
-		case PIPE_FORMAT_B5G5R5A1_UNORM:
-		case PIPE_FORMAT_B4G4R4A4_UNORM:
-		case PIPE_FORMAT_B5G6R5_UNORM:
-		case PIPE_FORMAT_L8_UNORM:
-		case PIPE_FORMAT_A8_UNORM:
-		case PIPE_FORMAT_I8_UNORM:
-		case PIPE_FORMAT_L8A8_UNORM:
-		case PIPE_FORMAT_Z16_UNORM:
-		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
-			return TRUE;
-		/* TODO: does nv30 support this? */
-		case PIPE_FORMAT_R16_SNORM:
-			return !!screen->is_nv4x;
-		default:
-			break;
+		else
+		{
+			if(tf->fmt[0] < 0)
+				return FALSE;
 		}
 	}
 
-	return FALSE;
-}
+	// note that we do actually support everything through translate
+	if (bind & PIPE_BIND_VERTEX_BUFFER) {
+		unsigned type = nvfx_vertex_formats[format];
+		if(!type)
+			return FALSE;
+	}
+
+	if (bind & PIPE_BIND_INDEX_BUFFER) {
+		// 8-bit indices supported, but not in hardware index buffer
+		if(format != PIPE_FORMAT_R16_USCALED && format != PIPE_FORMAT_R32_USCALED)
+			return FALSE;
+	}
+
+	if(bind & PIPE_BIND_STREAM_OUTPUT)
+		return FALSE;
 
+	return TRUE;
+}
 
 static void
 nvfx_screen_destroy(struct pipe_screen *pscreen)
@@ -245,7 +231,7 @@ nvfx_screen_destroy(struct pipe_screen *pscreen)
 	nouveau_notifier_free(&screen->query);
 	nouveau_notifier_free(&screen->sync);
 	nouveau_grobj_free(&screen->eng3d);
-	nv04_surface_2d_takedown(&screen->eng2d);
+	nvfx_screen_surface_takedown(pscreen);
 
 	nouveau_screen_fini(&screen->base);
 
@@ -374,6 +360,14 @@ nvfx_screen_get_vertex_buffer_flags(struct nvfx_screen* screen)
 	return vram_hack ? NOUVEAU_BO_VRAM : NOUVEAU_BO_GART;
 }
 
+/* channel flush_notify hook: set relocs_needed to NVFX_RELOCATE_ALL on the screen's current context, if any */
+static void nvfx_channel_flush_notify(struct nouveau_channel* chan)
+{
+	struct nvfx_context* ctx = ((struct nvfx_screen*)chan->user_private)->cur_ctx;
+	if (ctx != NULL)
+		ctx->relocs_needed = NVFX_RELOCATE_ALL;
+}
+
 struct pipe_screen *
 nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 {
@@ -395,12 +389,15 @@ nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		return NULL;
 	}
 	chan = screen->base.channel;
+	screen->cur_ctx = NULL;
+	chan->user_private = screen;
+	chan->flush_notify = nvfx_channel_flush_notify;
 
 	pscreen->winsys = ws;
 	pscreen->destroy = nvfx_screen_destroy;
 	pscreen->get_param = nvfx_screen_get_param;
 	pscreen->get_paramf = nvfx_screen_get_paramf;
-	pscreen->is_format_supported = nvfx_screen_surface_format_supported;
+	pscreen->is_format_supported = nvfx_screen_is_format_supported;
 	pscreen->context_create = nvfx_create;
 
 	switch (dev->chipset & 0xf0) {
@@ -432,6 +429,11 @@ nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	}
 
 	screen->force_swtnl = debug_get_bool_option("NOUVEAU_SWTNL", FALSE);
+	screen->trace_draw = debug_get_bool_option("NVFX_TRACE_DRAW", FALSE);
+
+	screen->buffer_allocation_cost = debug_get_num_option("NVFX_BUFFER_ALLOCATION_COST", 16384);
+	screen->inline_cost_per_hardware_cost = atof(debug_get_option("NVFX_INLINE_COST_PER_HARDWARE_COST", "1.0"));
+	screen->static_reuse_threshold = atof(debug_get_option("NVFX_STATIC_REUSE_THRESHOLD", "2.0"));
 
 	screen->vertex_buffer_reloc_flags = nvfx_screen_get_vertex_buffer_flags(screen);
 
@@ -451,8 +453,7 @@ nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	}
 
 	/* 2D engine setup */
-	screen->eng2d = nv04_surface_2d_init(&screen->base);
-	screen->eng2d->buf = nvfx_surface_buffer;
+	nvfx_screen_surface_init(pscreen);
 
 	/* Notifier for sync purposes */
 	ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync);
diff --git a/src/gallium/drivers/nvfx/nvfx_screen.h b/src/gallium/drivers/nvfx/nvfx_screen.h
index 5e1c3945ae..1b79235ae0 100644
--- a/src/gallium/drivers/nvfx/nvfx_screen.h
+++ b/src/gallium/drivers/nvfx/nvfx_screen.h
@@ -1,11 +1,11 @@
 #ifndef __NVFX_SCREEN_H__
 #define __NVFX_SCREEN_H__
 
+#include "pipe/p_compiler.h"
 #include "util/u_double_list.h"
 #include "nouveau/nouveau_screen.h"
-#include "nv04_surface_2d.h"
 
-struct nvfx_context;
+struct pipe_screen;
 
 struct nvfx_screen {
 	struct nouveau_screen base;
@@ -16,11 +16,11 @@ struct nvfx_screen {
 
 	unsigned is_nv4x; /* either 0 or ~0 */
 	boolean force_swtnl;
+	boolean trace_draw;
 	unsigned vertex_buffer_reloc_flags;
 	unsigned index_buffer_reloc_flags;
 
 	/* HW graphics objects */
-	struct nv04_surface_2d *eng2d;
 	struct nouveau_grobj *eng3d;
 	struct nouveau_notifier *sync;
 
@@ -32,6 +32,20 @@ struct nvfx_screen {
 	/* Vtxprog resources */
 	struct nouveau_resource *vp_exec_heap;
 	struct nouveau_resource *vp_data_heap;
+
+	struct nv04_2d_context* eng2d;
+
+	/* Once the amount of bytes drawn from the buffer reaches the updated size times this value,
+	 * we will assume that the buffer will be drawn a huge number of times before the
+	 * next modification
+	 */
+	float static_reuse_threshold;
+
+	/* Cost of allocating a buffer in terms of the cost of copying a byte to a hardware buffer */
+	unsigned buffer_allocation_cost;
+
+	/* inline_cost/hardware_cost conversion ratio */
+	float inline_cost_per_hardware_cost;
 };
 
 static INLINE struct nvfx_screen *
@@ -40,4 +54,7 @@ nvfx_screen(struct pipe_screen *screen)
 	return (struct nvfx_screen *)screen;
 }
 
+int nvfx_screen_surface_init(struct pipe_screen *pscreen);
+void nvfx_screen_surface_takedown(struct pipe_screen *pscreen);
+
 #endif
diff --git a/src/gallium/drivers/nvfx/nvfx_shader.h b/src/gallium/drivers/nvfx/nvfx_shader.h
index 50830b3916..35006eec3d 100644
--- a/src/gallium/drivers/nvfx/nvfx_shader.h
+++ b/src/gallium/drivers/nvfx/nvfx_shader.h
@@ -1,6 +1,12 @@
 #ifndef __NVFX_SHADER_H__
 #define __NVFX_SHADER_H__
 
+#include <stdint.h>
+
+#include "pipe/p_compiler.h"
+
+#define NVFX_SWZ_IDENTITY ((3 << 6) | (2 << 4) | (1 << 2) | (0 << 0))
+
 /* this will resolve to either the NV30 or the NV40 version
  * depending on the current hardware */
 /* unusual, but very fast and compact method */
@@ -71,11 +77,58 @@
 /*
  * Each fragment program opcode appears to be comprised of 4 32-bit values.
  *
- *   0 - Opcode, output reg/mask, ATTRIB source
- *   1 - Source 0
- *   2 - Source 1
- *   3 - Source 2
+ * 0: OPDEST
+ * 	0: program end
+ * 	1-6: destination register
+ * 	7: destination register is fp16?? (use for outputs)
+ * 	8: set condition code
+ * 	9: writemask x
+ *  	10: writemask y
+ *  	11: writemask z
+ *  	12: writemask w
+ *  	13-16: source attribute register number (e.g. COL0)
+ *  	17-20: texture unit number
+ *  	21: expand value on texture operation (x -> 2x - 1)
+ *  	22-23: precision 0 = fp32, 1 = fp16, 2 = s1.10 fixed, 3 = s0.8 fixed (nv40-only)
+ * 	24-29: opcode
+ * 	30: no destination
+ * 	31: saturate
+ * 1 - SRC0
+ * 	0-17: see common source fields
+ * 	18: execute if condition code less
+ * 	19: execute if condition code equal
+ * 	20: execute if condition code greater
+ * 	21-22: condition code swizzle x source component
+ * 	23-24: condition code swizzle y source component
+ * 	25-26: condition code swizzle z source component
+ * 	27-28: condition code swizzle w source component
+ * 	29: source 0 absolute
+ * 	30: always 0 in renouveau tests
+ * 	31: always 0 in renouveau tests
+ * 2 - SRC1
+ * 	0-17: see common source fields
+ * 	18: source 1 absolute
+ * 	19-20: input precision 0 = fp32, 1 = fp16, 2 = s1.10 fixed, 3 = ???
+ * 	21-27: always 0 in renouveau tests
+ * 	28-30: scale (0 = 1x, 1 = 2x, 2 = 4x, 3 = 8x, 4 = ???, 5 = 1/2, 6 = 1/4, 7 = 1/8)
+ * 	31: opcode is branch
+ * 3 - SRC2
+ * 	0-17: see common source fields
+ * 	18: source 2 absolute
+ * 	19-29: address register displacement
+ * 	30: use index register
+ * 	31: disable perspective-correct interpolation?
  *
+ * Common source fields (bits 0-17 of SRC0, SRC1 and SRC2)
+ * 	0-1: source register type (0 = temp, 1 = input, 2 = immediate, 3 = ???)
+ * 	2-7: source temp register index
+ * 	8: source register is fp16??
+ * 	9-10: source swizzle x source component
+ * 	11-12: source swizzle y source component
+ * 	13-14: source swizzle z source component
+ * 	15-16: source swizzle w source component
+ *	17: negate
+ *
  * There appears to be no special difference between result regs and temp regs.
  *     result.color == R0.xyzw
  *     result.depth == R1.z
@@ -210,6 +263,7 @@
 
 /* NV40 only fragment program opcodes */
 #define NVFX_FP_OP_OPCODE_TXL_NV40 0x2F
+
 /* The use of these instructions appears to be indicated by bit 31 of DWORD 2.*/
 #define NV40_FP_OP_BRA_OPCODE_BRK                                    0x0
 #define NV40_FP_OP_BRA_OPCODE_CAL                                    0x1
@@ -218,10 +272,11 @@
 #define NV40_FP_OP_BRA_OPCODE_REP                                    0x4
 #define NV40_FP_OP_BRA_OPCODE_RET                                    0x5
 
+#define NV40_FP_OP_OUT_NONE         (1 << 30)
 #define NVFX_FP_OP_OUT_SAT          (1 << 31)
 
 /* high order bits of SRC0 */
-#define NVFX_FP_OP_OUT_ABS          (1 << 29)
+#define NVFX_FP_OP_SRC0_ABS          (1 << 29)
 #define NVFX_FP_OP_COND_SWZ_W_SHIFT        27
 #define NVFX_FP_OP_COND_SWZ_W_MASK        (3 << 27)
 #define NVFX_FP_OP_COND_SWZ_Z_SHIFT        25
@@ -254,6 +309,7 @@
 #define NVFX_FP_OP_DST_SCALE_INV_2X                                            5
 #define NVFX_FP_OP_DST_SCALE_INV_4X                                            6
 #define NVFX_FP_OP_DST_SCALE_INV_8X                                            7
+#define NVFX_FP_OP_SRC1_ABS          (1 << 18)
 
 /* SRC1 LOOP */
 #define NV40_FP_OP_LOOP_INCR_SHIFT                                            19
@@ -263,13 +319,13 @@
 #define NV40_FP_OP_LOOP_COUNT_SHIFT                                            2
 #define NV40_FP_OP_LOOP_COUNT_MASK                                   (0xFF << 2)
 
-/* SRC1 IF */
-#define NV40_FP_OP_ELSE_ID_SHIFT                                               2
-#define NV40_FP_OP_ELSE_ID_MASK                                      (0xFF << 2)
+/* SRC1 IF: absolute offset in dwords */
+#define NV40_FP_OP_ELSE_OFFSET_SHIFT                                           0
+#define NV40_FP_OP_ELSE_OFFSET_MASK                             (0x7FFFFFFF << 0)
 
 /* SRC1 CAL */
-#define NV40_FP_OP_IADDR_SHIFT                                                 2
-#define NV40_FP_OP_IADDR_MASK                                        (0xFF << 2)
+#define NV40_FP_OP_SUB_OFFSET_SHIFT                                                 0
+#define NV40_FP_OP_SUB_OFFSET_MASK                                   (0x7FFFFFFF << 0)
 
 /* SRC1 REP
  *   I have no idea why there are 3 count values here..  but they
@@ -283,9 +339,9 @@
 #define NV40_FP_OP_REP_COUNT3_SHIFT                                           19
 #define NV40_FP_OP_REP_COUNT3_MASK                                  (0xFF << 19)
 
-/* SRC2 REP/IF */
-#define NV40_FP_OP_END_ID_SHIFT                                                2
-#define NV40_FP_OP_END_ID_MASK                                       (0xFF << 2)
+/* SRC2 REP/IF: absolute offset in dwords */
+#define NV40_FP_OP_END_OFFSET_SHIFT                                            0
+#define NV40_FP_OP_END_OFFSET_MASK                              (0x7FFFFFFF << 0)
 
 /* high order bits of SRC2 */
 #define NVFX_FP_OP_INDEX_INPUT          (1 << 30)
@@ -323,6 +379,7 @@
 #define NVFXSR_INPUT	2
 #define NVFXSR_TEMP	3
 #define NVFXSR_CONST	4
+#define NVFXSR_RELOCATED	5
 
 #define NVFX_COND_FL  0
 #define NVFX_COND_LT  1
@@ -352,51 +409,88 @@
 #define NVFX_SWZ_Z 2
 #define NVFX_SWZ_W 3
 
-#define swz(s,x,y,z,w) nvfx_sr_swz((s), NVFX_SWZ_##x, NVFX_SWZ_##y, NVFX_SWZ_##z, NVFX_SWZ_##w)
-#define neg(s) nvfx_sr_neg((s))
-#define abs(s) nvfx_sr_abs((s))
-#define scale(s,v) nvfx_sr_scale((s), NVFX_FP_OP_DST_SCALE_##v)
+#define swz(s,x,y,z,w) nvfx_src_swz((s), NVFX_SWZ_##x, NVFX_SWZ_##y, NVFX_SWZ_##z, NVFX_SWZ_##w)
+#define neg(s) nvfx_src_neg((s))
+#define abs(s) nvfx_src_abs((s))
 
-struct nvfx_sreg {
-	int type;
-	int index;
+struct nvfx_reg {
+	uint8_t type;
+	uint32_t index;
+};
 
-	int dst_scale;
+struct nvfx_src {
+	struct nvfx_reg reg;
 
-	int negate;
-	int abs;
-	int swz[4];
+	/* src only */
+	uint8_t negate : 1;
+	uint8_t abs : 1;
+	uint8_t swz[4];
+};
 
-	int cc_update;
-	int cc_update_reg;
-	int cc_test;
-	int cc_test_reg;
-	int cc_swz[4];
+struct nvfx_insn
+{
+	uint8_t op;
+	char scale;
+	int8_t unit;
+	uint8_t mask;
+	uint8_t cc_swz[4];
+
+	uint8_t sat : 1;
+	uint8_t cc_update : 1;
+	uint8_t cc_update_reg : 1;
+	uint8_t cc_test : 3;
+	uint8_t cc_test_reg : 1;
+
+	struct nvfx_reg dst;
+	struct nvfx_src src[3];
 };
 
-static INLINE struct nvfx_sreg
-nvfx_sr(int type, int index)
+static INLINE struct nvfx_insn
+nvfx_insn(boolean sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask, struct nvfx_src s0, struct nvfx_src s1, struct nvfx_src s2)
 {
-	struct nvfx_sreg temp = {
-		.type = type,
-		.index = index,
-		.dst_scale = 0,
-		.abs = 0,
-		.negate = 0,
-		.swz = { 0, 1, 2, 3 },
+	struct nvfx_insn insn = {
+		.op = op,
+		.scale = 0,
+		.unit = unit,
+		.sat = sat,
+		.mask = mask,
 		.cc_update = 0,
 		.cc_update_reg = 0,
 		.cc_test = NVFX_COND_TR,
 		.cc_test_reg = 0,
 		.cc_swz = { 0, 1, 2, 3 },
+		.dst = dst,
+		.src = {s0, s1, s2}
+	};
+	return insn;
+}
+
+static INLINE struct nvfx_reg
+nvfx_reg(int type, int index)
+{
+	struct nvfx_reg temp = {
+		.type = type,
+		.index = index,
 	};
 	return temp;
 }
 
-static INLINE struct nvfx_sreg
-nvfx_sr_swz(struct nvfx_sreg src, int x, int y, int z, int w)
+static INLINE struct nvfx_src
+nvfx_src(struct nvfx_reg reg)
 {
-	struct nvfx_sreg dst = src;
+	struct nvfx_src temp = {
+		.reg = reg,
+		.abs = 0,
+		.negate = 0,
+		.swz = { 0, 1, 2, 3 },
+	};
+	return temp;
+}
+
+static INLINE struct nvfx_src
+nvfx_src_swz(struct nvfx_src src, int x, int y, int z, int w)
+{
+	struct nvfx_src dst = src;
 
 	dst.swz[NVFX_SWZ_X] = src.swz[x];
 	dst.swz[NVFX_SWZ_Y] = src.swz[y];
@@ -405,25 +499,23 @@ nvfx_sr_swz(struct nvfx_sreg src, int x, int y, int z, int w)
 	return dst;
 }
 
-static INLINE struct nvfx_sreg
-nvfx_sr_neg(struct nvfx_sreg src)
+static INLINE struct nvfx_src
+nvfx_src_neg(struct nvfx_src src)
 {
 	src.negate = !src.negate;
 	return src;
 }
 
-static INLINE struct nvfx_sreg
-nvfx_sr_abs(struct nvfx_sreg src)
+static INLINE struct nvfx_src
+nvfx_src_abs(struct nvfx_src src)
 {
 	src.abs = 1;
 	return src;
 }
 
-static INLINE struct nvfx_sreg
-nvfx_sr_scale(struct nvfx_sreg src, int scale)
-{
-	src.dst_scale = scale;
-	return src;
-}
+struct nvfx_relocation {
+        unsigned location;
+        unsigned target;
+};
 
 #endif
diff --git a/src/gallium/drivers/nvfx/nvfx_state.c b/src/gallium/drivers/nvfx/nvfx_state.c
index cd58e439d7..5bd7dc07f0 100644
--- a/src/gallium/drivers/nvfx/nvfx_state.c
+++ b/src/gallium/drivers/nvfx/nvfx_state.c
@@ -1,6 +1,7 @@
 #include "pipe/p_state.h"
 #include "pipe/p_defines.h"
 #include "util/u_inlines.h"
+#include "util/u_framebuffer.h"
 
 #include "draw/draw_context.h"
 
@@ -81,111 +82,6 @@ nvfx_blend_state_delete(struct pipe_context *pipe, void *hwcso)
 }
 
 static void *
-nvfx_sampler_state_create(struct pipe_context *pipe,
-			  const struct pipe_sampler_state *cso)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nvfx_sampler_state *ps;
-
-	ps = MALLOC(sizeof(struct nvfx_sampler_state));
-
-	/* on nv30, we use this as an internal flag */
-	ps->fmt = cso->normalized_coords ? 0 : NV40TCL_TEX_FORMAT_RECT;
-	ps->en = 0;
-	ps->filt = nvfx_tex_filter(cso);
-	ps->wrap = (nvfx_tex_wrap_mode(cso->wrap_s) << NV34TCL_TX_WRAP_S_SHIFT) |
-		    (nvfx_tex_wrap_mode(cso->wrap_t) << NV34TCL_TX_WRAP_T_SHIFT) |
-		    (nvfx_tex_wrap_mode(cso->wrap_r) << NV34TCL_TX_WRAP_R_SHIFT) |
-		    nvfx_tex_wrap_compare_mode(cso);
-	ps->bcol = nvfx_tex_border_color(cso->border_color);
-
-	if(nvfx->is_nv4x)
-		nv40_sampler_state_init(pipe, ps, cso);
-	else
-		nv30_sampler_state_init(pipe, ps, cso);
-
-	return (void *)ps;
-}
-
-static void
-nvfx_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	unsigned unit;
-
-	for (unit = 0; unit < nr; unit++) {
-		nvfx->tex_sampler[unit] = sampler[unit];
-		nvfx->dirty_samplers |= (1 << unit);
-	}
-
-	for (unit = nr; unit < nvfx->nr_samplers; unit++) {
-		nvfx->tex_sampler[unit] = NULL;
-		nvfx->dirty_samplers |= (1 << unit);
-	}
-
-	nvfx->nr_samplers = nr;
-	nvfx->dirty |= NVFX_NEW_SAMPLER;
-}
-
-static void
-nvfx_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
-{
-	FREE(hwcso);
-}
-
-static void
-nvfx_set_fragment_sampler_views(struct pipe_context *pipe,
-				unsigned nr,
-				struct pipe_sampler_view **views)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	unsigned unit;
-
-	for (unit = 0; unit < nr; unit++) {
-		pipe_sampler_view_reference(&nvfx->fragment_sampler_views[unit],
-                                            views[unit]);
-		nvfx->dirty_samplers |= (1 << unit);
-	}
-
-	for (unit = nr; unit < nvfx->nr_textures; unit++) {
-		pipe_sampler_view_reference(&nvfx->fragment_sampler_views[unit],
-                                            NULL);
-		nvfx->dirty_samplers |= (1 << unit);
-	}
-
-	nvfx->nr_textures = nr;
-	nvfx->dirty |= NVFX_NEW_SAMPLER;
-}
-
-
-static struct pipe_sampler_view *
-nvfx_create_sampler_view(struct pipe_context *pipe,
-			 struct pipe_resource *texture,
-			 const struct pipe_sampler_view *templ)
-{
-	struct pipe_sampler_view *view = CALLOC_STRUCT(pipe_sampler_view);
-
-	if (view) {
-		*view = *templ;
-		view->reference.count = 1;
-		view->texture = NULL;
-		pipe_resource_reference(&view->texture, texture);
-		view->context = pipe;
-	}
-
-	return view;
-}
-
-
-static void
-nvfx_sampler_view_destroy(struct pipe_context *pipe,
-			  struct pipe_sampler_view *view)
-{
-	pipe_resource_reference(&view->texture, NULL);
-	FREE(view);
-}
-
-static void *
 nvfx_rasterizer_state_create(struct pipe_context *pipe,
 			     const struct pipe_rasterizer_state *cso)
 {
@@ -195,6 +91,7 @@ nvfx_rasterizer_state_create(struct pipe_context *pipe,
 	/*XXX: ignored:
 	 * 	point_smooth -nohw
 	 * 	multisample
+	 *     sprite_coord_origin
 	 */
 
 	sb_method(sb, NV34TCL_SHADE_MODEL, 1);
@@ -254,19 +151,8 @@ nvfx_rasterizer_state_create(struct pipe_context *pipe,
 		sb_data(sb, fui(cso->offset_units * 2));
 	}
 
-	sb_method(sb, NV34TCL_POINT_SPRITE, 1);
-	if (cso->point_quad_rasterization) {
-		unsigned psctl = (1 << 0), i;
-
-		for (i = 0; i < 8; i++) {
-			if ((cso->sprite_coord_enable >> i) & 1)
-				psctl |= (1 << (8 + i));
-		}
-
-		sb_data(sb, psctl);
-	} else {
-		sb_data(sb, 0);
-	}
+	sb_method(sb, NV34TCL_FLATSHADE_FIRST, 1);
+	sb_data(sb, cso->flatshade_first);
 
 	rsso->pipe = *cso;
 	rsso->sb_len = sb_len(sb, rsso->sb);
@@ -287,11 +173,11 @@ nvfx_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
 			nvfx->draw_dirty |= NVFX_NEW_SCISSOR;
 		}
 
-		if(((struct nvfx_rasterizer_state*)hwcso)->pipe.poly_stipple_enable
-					!= nvfx->rasterizer->pipe.poly_stipple_enable)
+		if(((struct nvfx_rasterizer_state*)hwcso)->pipe.point_quad_rasterization != nvfx->rasterizer->pipe.point_quad_rasterization
+				|| ((struct nvfx_rasterizer_state*)hwcso)->pipe.sprite_coord_enable != nvfx->rasterizer->pipe.sprite_coord_enable
+				|| ((struct nvfx_rasterizer_state*)hwcso)->pipe.sprite_coord_mode != nvfx->rasterizer->pipe.sprite_coord_mode)
 		{
-			nvfx->dirty |= NVFX_NEW_STIPPLE;
-			nvfx->draw_dirty |= NVFX_NEW_STIPPLE;
+			nvfx->dirty |= NVFX_NEW_SPRITE;
 		}
 	}
 
@@ -315,10 +201,8 @@ nvfx_depth_stencil_alpha_state_create(struct pipe_context *pipe,
 	struct nvfx_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso));
 	struct nouveau_statebuf_builder sb = sb_init(zsaso->sb);
 
-	sb_method(sb, NV34TCL_DEPTH_FUNC, 3);
+	sb_method(sb, NV34TCL_DEPTH_FUNC, 1);
 	sb_data  (sb, nvgl_comparison_op(cso->depth.func));
-	sb_data  (sb, cso->depth.writemask ? 1 : 0);
-	sb_data  (sb, cso->depth.enabled ? 1 : 0);
 
 	sb_method(sb, NV34TCL_ALPHA_FUNC_ENABLE, 3);
 	sb_data  (sb, cso->alpha.enabled ? 1 : 0);
@@ -377,76 +261,6 @@ nvfx_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso)
 	FREE(zsaso);
 }
 
-static void *
-nvfx_vp_state_create(struct pipe_context *pipe,
-		     const struct pipe_shader_state *cso)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nvfx_vertex_program *vp;
-
-	vp = CALLOC(1, sizeof(struct nvfx_vertex_program));
-	vp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
-	vp->draw = draw_create_vertex_shader(nvfx->draw, &vp->pipe);
-
-	return (void *)vp;
-}
-
-static void
-nvfx_vp_state_bind(struct pipe_context *pipe, void *hwcso)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-
-	nvfx->vertprog = hwcso;
-	nvfx->dirty |= NVFX_NEW_VERTPROG;
-	nvfx->draw_dirty |= NVFX_NEW_VERTPROG;
-}
-
-static void
-nvfx_vp_state_delete(struct pipe_context *pipe, void *hwcso)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nvfx_vertex_program *vp = hwcso;
-
-	draw_delete_vertex_shader(nvfx->draw, vp->draw);
-	nvfx_vertprog_destroy(nvfx, vp);
-	FREE((void*)vp->pipe.tokens);
-	FREE(vp);
-}
-
-static void *
-nvfx_fp_state_create(struct pipe_context *pipe,
-		     const struct pipe_shader_state *cso)
-{
-	struct nvfx_fragment_program *fp;
-
-	fp = CALLOC(1, sizeof(struct nvfx_fragment_program));
-	fp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
-
-	tgsi_scan_shader(fp->pipe.tokens, &fp->info);
-
-	return (void *)fp;
-}
-
-static void
-nvfx_fp_state_bind(struct pipe_context *pipe, void *hwcso)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-
-	nvfx->fragprog = hwcso;
-	nvfx->dirty |= NVFX_NEW_FRAGPROG;
-}
-
-static void
-nvfx_fp_state_delete(struct pipe_context *pipe, void *hwcso)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nvfx_fragment_program *fp = hwcso;
-
-	nvfx_fragprog_destroy(nvfx, fp);
-	FREE((void*)fp->pipe.tokens);
-	FREE(fp);
-}
-
 static void
 nvfx_set_blend_color(struct pipe_context *pipe,
 		     const struct pipe_blend_color *bcol)
@@ -507,7 +321,10 @@ nvfx_set_framebuffer_state(struct pipe_context *pipe,
 {
 	struct nvfx_context *nvfx = nvfx_context(pipe);
 
-	nvfx->framebuffer = *fb;
+	if(fb)
+		util_copy_framebuffer_state(&nvfx->framebuffer, fb);
+	else
+		util_unreference_framebuffer_state(&nvfx->framebuffer);
 	nvfx->dirty |= NVFX_NEW_FB;
 }
 
@@ -542,65 +359,6 @@ nvfx_set_viewport_state(struct pipe_context *pipe,
 	nvfx->draw_dirty |= NVFX_NEW_VIEWPORT;
 }
 
-static void
-nvfx_set_vertex_buffers(struct pipe_context *pipe, unsigned count,
-			const struct pipe_vertex_buffer *vb)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-
-	memcpy(nvfx->vtxbuf, vb, sizeof(*vb) * count);
-	nvfx->vtxbuf_nr = count;
-
-	nvfx->dirty |= NVFX_NEW_ARRAYS;
-	nvfx->draw_dirty |= NVFX_NEW_ARRAYS;
-}
-
-static void
-nvfx_set_index_buffer(struct pipe_context *pipe,
-		      const struct pipe_index_buffer *ib)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-
-	if (ib)
-		memcpy(&nvfx->idxbuf, ib, sizeof(nvfx->idxbuf));
-	else
-		memset(&nvfx->idxbuf, 0, sizeof(nvfx->idxbuf));
-
-	/* TODO make this more like a state */
-}
-
-static void *
-nvfx_vtxelts_state_create(struct pipe_context *pipe,
-			  unsigned num_elements,
-			  const struct pipe_vertex_element *elements)
-{
-	struct nvfx_vtxelt_state *cso = CALLOC_STRUCT(nvfx_vtxelt_state);
-
-	assert(num_elements < 16); /* not doing fallbacks yet */
-	cso->num_elements = num_elements;
-	memcpy(cso->pipe, elements, num_elements * sizeof(*elements));
-
-/*	nvfx_vtxelt_construct(cso);*/
-
-	return (void *)cso;
-}
-
-static void
-nvfx_vtxelts_state_delete(struct pipe_context *pipe, void *hwcso)
-{
-	FREE(hwcso);
-}
-
-static void
-nvfx_vtxelts_state_bind(struct pipe_context *pipe, void *hwcso)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-
-	nvfx->vtxelt = hwcso;
-	nvfx->dirty |= NVFX_NEW_ARRAYS;
-	/*nvfx->draw_dirty |= NVFX_NEW_ARRAYS;*/
-}
-
 void
 nvfx_init_state_functions(struct nvfx_context *nvfx)
 {
@@ -608,13 +366,6 @@ nvfx_init_state_functions(struct nvfx_context *nvfx)
 	nvfx->pipe.bind_blend_state = nvfx_blend_state_bind;
 	nvfx->pipe.delete_blend_state = nvfx_blend_state_delete;
 
-	nvfx->pipe.create_sampler_state = nvfx_sampler_state_create;
-	nvfx->pipe.bind_fragment_sampler_states = nvfx_sampler_state_bind;
-	nvfx->pipe.delete_sampler_state = nvfx_sampler_state_delete;
-	nvfx->pipe.set_fragment_sampler_views = nvfx_set_fragment_sampler_views;
-        nvfx->pipe.create_sampler_view = nvfx_create_sampler_view;
-        nvfx->pipe.sampler_view_destroy = nvfx_sampler_view_destroy;
-
 	nvfx->pipe.create_rasterizer_state = nvfx_rasterizer_state_create;
 	nvfx->pipe.bind_rasterizer_state = nvfx_rasterizer_state_bind;
 	nvfx->pipe.delete_rasterizer_state = nvfx_rasterizer_state_delete;
@@ -626,14 +377,6 @@ nvfx_init_state_functions(struct nvfx_context *nvfx)
 	nvfx->pipe.delete_depth_stencil_alpha_state =
 		nvfx_depth_stencil_alpha_state_delete;
 
-	nvfx->pipe.create_vs_state = nvfx_vp_state_create;
-	nvfx->pipe.bind_vs_state = nvfx_vp_state_bind;
-	nvfx->pipe.delete_vs_state = nvfx_vp_state_delete;
-
-	nvfx->pipe.create_fs_state = nvfx_fp_state_create;
-	nvfx->pipe.bind_fs_state = nvfx_fp_state_bind;
-	nvfx->pipe.delete_fs_state = nvfx_fp_state_delete;
-
 	nvfx->pipe.set_blend_color = nvfx_set_blend_color;
         nvfx->pipe.set_stencil_ref = nvfx_set_stencil_ref;
 	nvfx->pipe.set_clip_state = nvfx_set_clip_state;
@@ -643,11 +386,4 @@ nvfx_init_state_functions(struct nvfx_context *nvfx)
 	nvfx->pipe.set_polygon_stipple = nvfx_set_polygon_stipple;
 	nvfx->pipe.set_scissor_state = nvfx_set_scissor_state;
 	nvfx->pipe.set_viewport_state = nvfx_set_viewport_state;
-
-	nvfx->pipe.create_vertex_elements_state = nvfx_vtxelts_state_create;
-	nvfx->pipe.delete_vertex_elements_state = nvfx_vtxelts_state_delete;
-	nvfx->pipe.bind_vertex_elements_state = nvfx_vtxelts_state_bind;
-
-	nvfx->pipe.set_vertex_buffers = nvfx_set_vertex_buffers;
-	nvfx->pipe.set_index_buffer = nvfx_set_index_buffer;
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h
index 9ceb2577ec..e9c1f2c26d 100644
--- a/src/gallium/drivers/nvfx/nvfx_state.h
+++ b/src/gallium/drivers/nvfx/nvfx_state.h
@@ -4,11 +4,11 @@
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_scan.h"
 #include "nouveau/nouveau_statebuf.h"
+#include "util/u_dynarray.h"
+#include "util/u_linkage.h"
 
 struct nvfx_vertex_program_exec {
 	uint32_t data[4];
-	boolean has_branch_offset;
-	int const_index;
 };
 
 struct nvfx_vertex_program_data {
@@ -18,18 +18,20 @@ struct nvfx_vertex_program_data {
 
 struct nvfx_vertex_program {
 	struct pipe_shader_state pipe;
+	unsigned long long id;
 
 	struct draw_vertex_shader *draw;
 
 	boolean translated;
 
-	struct pipe_clip_state ucp;
-
 	struct nvfx_vertex_program_exec *insns;
 	unsigned nr_insns;
 	struct nvfx_vertex_program_data *consts;
 	unsigned nr_consts;
 
+	char generic_to_fp_input[256];
+	int sprite_fp_input;
+
 	struct nouveau_resource *exec;
 	unsigned exec_start;
 	struct nouveau_resource *data;
@@ -38,7 +40,10 @@ struct nvfx_vertex_program {
 
 	uint32_t ir;
 	uint32_t or;
-	uint32_t clip_ctrl;
+	int clip_nr;
+
+	struct util_dynarray branch_relocs;
+	struct util_dynarray const_relocs;
 };
 
 struct nvfx_fragment_program_data {
@@ -49,15 +54,14 @@ struct nvfx_fragment_program_data {
 struct nvfx_fragment_program_bo {
 	struct nvfx_fragment_program_bo* next;
 	struct nouveau_bo* bo;
+	unsigned char* slots;
 	char insn[] __attribute__((aligned(16)));
 };
 
 struct nvfx_fragment_program {
-	struct pipe_shader_state pipe;
-	struct tgsi_shader_info info;
-
-	boolean translated;
 	unsigned samplers;
+	unsigned point_sprite_control;
+	unsigned or;
 
 	uint32_t *insn;
 	int       insn_len;
@@ -65,13 +69,36 @@ struct nvfx_fragment_program {
 	struct nvfx_fragment_program_data *consts;
 	unsigned nr_consts;
 
+	/* the slot at num_slots is for the sprite coordinate, if any */
+	unsigned num_slots; /* how many input semantics? */
+	unsigned char slot_to_generic[10]; /* semantics */
+	unsigned char slot_to_fp_input[11]; /* current assignment of slots for each used semantic */
+	struct util_dynarray slot_relocations[11];
+
+	/* This is reset to progs on any relocation update, and decreases every time we
+	 * move to a new prog due to a constant update
+	 * When this is the same as progs, applying relocations is no longer necessary.
+	 */
+	unsigned progs_left_with_obsolete_slot_assignments;
+
+	unsigned long long last_vp_id;
+	unsigned last_sprite_coord_enable;
+
 	uint32_t fp_control;
 
 	unsigned bo_prog_idx;
 	unsigned prog_size;
 	unsigned progs_per_bo;
+	unsigned progs;
+
 	struct nvfx_fragment_program_bo* fpbo;
 };
 
+struct nvfx_pipe_fragment_program {
+        struct pipe_shader_state pipe;
+        struct tgsi_shader_info info;
+
+        struct nvfx_fragment_program* fps[2];
+};
 
 #endif
diff --git a/src/gallium/drivers/nvfx/nvfx_state_emit.c b/src/gallium/drivers/nvfx/nvfx_state_emit.c
index f91ae19ecd..390bca8cdb 100644
--- a/src/gallium/drivers/nvfx/nvfx_state_emit.c
+++ b/src/gallium/drivers/nvfx/nvfx_state_emit.c
@@ -1,15 +1,54 @@
 #include "nvfx_context.h"
 #include "nvfx_state.h"
+#include "nvfx_resource.h"
 #include "draw/draw_context.h"
 
 static boolean
 nvfx_state_validate_common(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
-	unsigned dirty = nvfx->dirty;
+	unsigned dirty;
+	unsigned still_dirty = 0;
+	int all_swizzled = -1;
+	boolean flush_tex_cache = FALSE;
+	unsigned render_temps;
 
 	if(nvfx != nvfx->screen->cur_ctx)
-		dirty = ~0;
+	{
+		nvfx->dirty = ~0;
+		nvfx->hw_vtxelt_nr = 16;
+		nvfx->hw_pointsprite_control = -1;
+		nvfx->hw_vp_output = -1;
+		nvfx->screen->cur_ctx = nvfx;
+		nvfx->relocs_needed = NVFX_RELOCATE_ALL;
+	}
+
+	/* These can trigger the use of the 3D engine to copy temporaries.
+	 * That will recurse here and thus dirty all 3D state, so we need to do this before anything else, and in a loop.
+	 * This converges to having clean temps, then binding both fragtexes and framebuffers.
+	 */
+	while(nvfx->dirty & (NVFX_NEW_FB | NVFX_NEW_SAMPLER))
+	{
+		if(nvfx->dirty & NVFX_NEW_SAMPLER)
+		{
+			nvfx->dirty &=~ NVFX_NEW_SAMPLER;
+			nvfx_fragtex_validate(nvfx);
+
+			// TODO: only set this if really necessary
+			flush_tex_cache = TRUE;
+		}
+
+		if(nvfx->dirty & NVFX_NEW_FB)
+		{
+			nvfx->dirty &=~ NVFX_NEW_FB;
+			all_swizzled = nvfx_framebuffer_prepare(nvfx);
+
+			// TODO: make sure this doesn't happen, i.e. fbs have matching formats
+			assert(all_swizzled >= 0);
+		}
+	}
+
+	dirty = nvfx->dirty;
 
 	if(nvfx->render_mode == HW)
 	{
@@ -19,11 +58,19 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 				return FALSE;
 		}
 
-		if(dirty & (NVFX_NEW_ARRAYS))
+		if(dirty & NVFX_NEW_ARRAYS)
 		{
 			if(!nvfx_vbo_validate(nvfx))
 				return FALSE;
 		}
+
+		if(dirty & NVFX_NEW_INDEX)
+		{
+			if(nvfx->use_index_buffer)
+				nvfx_idxbuf_validate(nvfx);
+			else
+				still_dirty = NVFX_NEW_INDEX;
+		}
 	}
 	else
 	{
@@ -31,13 +78,10 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 		if(dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP))
 			nvfx_vertprog_validate(nvfx);
 
-		if(dirty & (NVFX_NEW_ARRAYS | NVFX_NEW_FRAGPROG))
+		if(dirty & (NVFX_NEW_ARRAYS | NVFX_NEW_INDEX | NVFX_NEW_FRAGPROG))
 			nvfx_vtxfmt_validate(nvfx);
 	}
 
-	if(dirty & NVFX_NEW_FB)
-		nvfx_state_framebuffer_validate(nvfx);
-
 	if(dirty & NVFX_NEW_RAST)
 		sb_emit(chan, nvfx->rasterizer->sb, nvfx->rasterizer->sb_len);
 
@@ -47,11 +91,97 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 	if(dirty & NVFX_NEW_STIPPLE)
 		nvfx_state_stipple_validate(nvfx);
 
-	if(dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST))
+       if(nvfx->dirty & NVFX_NEW_UCP)
+	{
+		unsigned enables[7] =
+		{
+				0,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE5,
+		};
+
+		if(!nvfx->use_vp_clipping)
+		{
+			WAIT_RING(chan, 2);
+			OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1));
+			OUT_RING(chan, 0);
+
+			WAIT_RING(chan, 6 * 4 + 1);
+			OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANE_A(0), nvfx->clip.nr * 4));
+			OUT_RINGp(chan, &nvfx->clip.ucp[0][0], nvfx->clip.nr * 4);
+		}
+
+		WAIT_RING(chan, 2);
+		OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1));
+		OUT_RING(chan, enables[nvfx->clip.nr]);
+	}
+
+	if(nvfx->use_vp_clipping && (nvfx->dirty & (NVFX_NEW_UCP | NVFX_NEW_VERTPROG)))
+	{
+		unsigned i;
+		struct nvfx_vertex_program* vp = nvfx->vertprog;
+		if(nvfx->clip.nr != vp->clip_nr)
+		{
+			unsigned idx;
+			WAIT_RING(chan, 14);
+
+			/* remove last instruction bit */
+			if(vp->clip_nr >= 0)
+			{
+				idx = vp->nr_insns - 7 + vp->clip_nr;
+				OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_FROM_ID, 1));
+				OUT_RING(chan,  vp->exec->start + idx);
+				OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_INST(0), 4));
+				OUT_RINGp (chan, vp->insns[idx].data, 4);
+			}
+
+			 /* set last instruction bit */
+			idx = vp->nr_insns - 7 + nvfx->clip.nr;
+			OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_FROM_ID, 1));
+			OUT_RING(chan,  vp->exec->start + idx);
+			OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_INST(0), 4));
+			OUT_RINGp(chan, vp->insns[idx].data, 3);
+			OUT_RING(chan, vp->insns[idx].data[3] | 1);
+			vp->clip_nr = nvfx->clip.nr;
+		}
+
+		// TODO: only do this for the ones changed
+		WAIT_RING(chan, 6 * 6);
+		for(i = 0; i < nvfx->clip.nr; ++i)
+		{
+			OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_CONST_ID, 5));
+			OUT_RING(chan, vp->data->start + i);
+			OUT_RINGp (chan, nvfx->clip.ucp[i], 4);
+		}
+	}
+
+	if(dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST | NVFX_NEW_VERTPROG | NVFX_NEW_SPRITE))
+	{
 		nvfx_fragprog_validate(nvfx);
+		if(dirty & NVFX_NEW_FRAGPROG)
+			flush_tex_cache = TRUE; // TODO: do we need this?
+	}
 
-	if(dirty & NVFX_NEW_SAMPLER)
-		nvfx_fragtex_validate(nvfx);
+	if(nvfx->is_nv4x)
+	{
+		unsigned vp_output = nvfx->vertprog->or | nvfx->hw_fragprog->or;
+		vp_output |= (1 << (nvfx->clip.nr + 6)) - (1 << 6);
+
+		if(vp_output != nvfx->hw_vp_output)
+		{
+			WAIT_RING(chan, 2);
+			OUT_RING(chan, RING_3D(NV40TCL_VP_RESULT_EN, 1));
+			OUT_RING(chan, vp_output);
+			nvfx->hw_vp_output = vp_output;
+		}
+	}
+
+	if(all_swizzled >= 0)
+		nvfx_framebuffer_validate(nvfx, all_swizzled);
 
 	if(dirty & NVFX_NEW_BLEND)
 		sb_emit(chan, nvfx->blend->sb, nvfx->blend->sb_len);
@@ -65,31 +195,62 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 	if(dirty & NVFX_NEW_SR)
 		nvfx_state_sr_validate(nvfx);
 
-/* Having this depend on FB looks wrong, but it seems
-   necessary to make this work on nv3x
+/* All these dependencies are wrong, but otherwise
+   etracer, neverball, foobillard, glest totally misrender
    TODO: find the right fix
 */
-	if(dirty & (NVFX_NEW_VIEWPORT | NVFX_NEW_FB))
+	if(dirty & (NVFX_NEW_VIEWPORT | NVFX_NEW_RAST | NVFX_NEW_ZSA) || (all_swizzled >= 0))
+	{
 		nvfx_state_viewport_validate(nvfx);
+	}
+
+	if(dirty & NVFX_NEW_ZSA || (all_swizzled >= 0))
+	{
+		WAIT_RING(chan, 3);
+		OUT_RING(chan, RING_3D(NV34TCL_DEPTH_WRITE_ENABLE, 2));
+		OUT_RING(chan, nvfx->framebuffer.zsbuf && nvfx->zsa->pipe.depth.writemask);
+	        OUT_RING(chan, nvfx->framebuffer.zsbuf && nvfx->zsa->pipe.depth.enabled);
+	}
 
-	/* TODO: could nv30 need this or something similar too? */
-	if((dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_SAMPLER)) && nvfx->is_nv4x) {
-		WAIT_RING(chan, 4);
-		OUT_RING(chan, RING_3D(NV40TCL_TEX_CACHE_CTL, 1));
-		OUT_RING(chan, 2);
-		OUT_RING(chan, RING_3D(NV40TCL_TEX_CACHE_CTL, 1));
-		OUT_RING(chan, 1);
+	if(flush_tex_cache)
+	{
+		// TODO: what about nv30?
+		if(nvfx->is_nv4x)
+		{
+			WAIT_RING(chan, 4);
+			OUT_RING(chan, RING_3D(NV40TCL_TEX_CACHE_CTL, 1));
+			OUT_RING(chan, 2);
+			OUT_RING(chan, RING_3D(NV40TCL_TEX_CACHE_CTL, 1));
+			OUT_RING(chan, 1);
+		}
 	}
-	nvfx->dirty = 0;
+
+	nvfx->dirty = dirty & still_dirty;
+
+	render_temps = nvfx->state.render_temps;
+	if(render_temps)
+	{
+		for(int i = 0; i < nvfx->framebuffer.nr_cbufs; ++i)
+		{
+			if(render_temps & (1 << i))
+				util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(nvfx->framebuffer.cbufs[i]),
+						(struct util_dirty_surface*)nvfx->framebuffer.cbufs[i]);
+		}
+
+		if(render_temps & 0x80)
+			util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(nvfx->framebuffer.zsbuf),
+					(struct util_dirty_surface*)nvfx->framebuffer.zsbuf);
+	}
+
 	return TRUE;
 }
 
-void
-nvfx_state_emit(struct nvfx_context *nvfx)
+inline void
+nvfx_state_relocate(struct nvfx_context *nvfx, unsigned relocs)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
 	/* we need to ensure there is enough space to output relocations in one go */
-	unsigned max_relocs = 0
+	const unsigned max_relocs = 0
 	      + 16 /* vertex buffers, incl. dma flag */
 	      + 2 /* index buffer plus format+dma flag */
 	      + 2 * 5 /* 4 cbufs + zsbuf, plus dma objects */
@@ -97,18 +258,19 @@ nvfx_state_emit(struct nvfx_context *nvfx)
 	      + 2 * 4 /* vertex textures plus format+dma flag */
 	      + 1 /* fragprog incl dma flag */
 	      ;
+
 	MARK_RING(chan, max_relocs * 2, max_relocs * 2);
-	nvfx_state_relocate(nvfx);
-}
 
-void
-nvfx_state_relocate(struct nvfx_context *nvfx)
-{
-	nvfx_framebuffer_relocate(nvfx);
-	nvfx_fragtex_relocate(nvfx);
-	nvfx_fragprog_relocate(nvfx);
-	if (nvfx->render_mode == HW)
+	if(relocs & NVFX_RELOCATE_FRAMEBUFFER)
+		nvfx_framebuffer_relocate(nvfx);
+	if(relocs & NVFX_RELOCATE_FRAGTEX)
+		nvfx_fragtex_relocate(nvfx);
+	if(relocs & NVFX_RELOCATE_FRAGPROG)
+		nvfx_fragprog_relocate(nvfx);
+	if(relocs & NVFX_RELOCATE_VTXBUF)
 		nvfx_vbo_relocate(nvfx);
+	if(relocs & NVFX_RELOCATE_IDXBUF)
+		nvfx_idxbuf_relocate(nvfx);
 }
 
 boolean
@@ -173,6 +335,9 @@ nvfx_state_validate_swtnl(struct nvfx_context *nvfx)
 		draw_set_vertex_elements(draw, nvfx->vtxelt->num_elements, nvfx->vtxelt->pipe);
 	}
 
+	if (nvfx->draw_dirty & NVFX_NEW_INDEX)
+		draw_set_index_buffer(draw, &nvfx->idxbuf);
+
 	nvfx_state_validate_common(nvfx);
 
 	nvfx->draw_dirty = 0;
diff --git a/src/gallium/drivers/nvfx/nvfx_state_fb.c b/src/gallium/drivers/nvfx/nvfx_state_fb.c
index 360e569f77..3b869d43a1 100644
--- a/src/gallium/drivers/nvfx/nvfx_state_fb.c
+++ b/src/gallium/drivers/nvfx/nvfx_state_fb.c
@@ -1,21 +1,55 @@
 #include "nvfx_context.h"
 #include "nvfx_resource.h"
-#include "nouveau/nouveau_util.h"
+#include "util/u_format.h"
 
+static inline boolean
+nvfx_surface_linear_renderable(struct pipe_surface* surf)
+{
+	return (surf->texture->flags & NVFX_RESOURCE_FLAG_LINEAR)
+		&& !(surf->offset & 63)
+		&& !(((struct nvfx_surface*)surf)->pitch & 63);
+}
 
+static inline boolean
+nvfx_surface_swizzled_renderable(struct pipe_framebuffer_state* fb, struct pipe_surface* surf)
+{
+	/* TODO: return FALSE if we have a format not supporting swizzled rendering (e.g. r8); currently those are not supported at all */
+	return !((struct nvfx_miptree*)surf->texture)->linear_pitch
+		&& (surf->texture->target != PIPE_TEXTURE_3D || u_minify(surf->texture->depth0, surf->level) <= 1)
+		&& !(surf->offset & 127)
+		&& (surf->width == fb->width)
+		&& (surf->height == fb->height)
+		&& !((struct nvfx_surface*)surf)->temp;
+}
 
-void
-nvfx_state_framebuffer_validate(struct nvfx_context *nvfx)
+static boolean
+nvfx_surface_get_render_target(struct pipe_surface* surf, int all_swizzled, struct nvfx_render_target* target)
+{
+	struct nvfx_surface* ns = (struct nvfx_surface*)surf;
+	if(!ns->temp)
+	{
+		target->bo = ((struct nvfx_miptree*)surf->texture)->base.bo;
+		target->offset = surf->offset;
+		target->pitch = align(ns->pitch, 64);
+		assert(target->pitch);
+		return FALSE;
+	}
+	else
+	{
+		target->offset = 0;
+		target->pitch = ns->temp->linear_pitch;
+		target->bo = ns->temp->base.bo;
+		assert(target->pitch);
+		return TRUE;
+	}
+}
+
+int
+nvfx_framebuffer_prepare(struct nvfx_context *nvfx)
 {
 	struct pipe_framebuffer_state *fb = &nvfx->framebuffer;
-	struct nouveau_channel *chan = nvfx->screen->base.channel;
-	uint32_t rt_enable = 0, rt_format = 0;
-	int i, colour_format = 0, zeta_format = 0;
-	int depth_only = 0;
-	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
-	unsigned w = fb->width;
-	unsigned h = fb->height;
-	int colour_bits = 32, zeta_bits = 32;
+	int i, color_format = 0;
+	int all_swizzled = 1; /* cleared as soon as any bound surface cannot be rendered swizzled */
 
 	if(!nvfx->is_nv4x)
 		assert(fb->nr_cbufs <= 2);
@@ -23,113 +57,135 @@ nvfx_state_framebuffer_validate(struct nvfx_context *nvfx)
 		assert(fb->nr_cbufs <= 4);
 
 	for (i = 0; i < fb->nr_cbufs; i++) {
-		if (colour_format)
-			assert(colour_format == fb->cbufs[i]->format);
-		else
-			colour_format = fb->cbufs[i]->format;
-
-		rt_enable |= (NV34TCL_RT_ENABLE_COLOR0 << i);
-		nvfx->hw_rt[i].bo = nvfx_surface_buffer(fb->cbufs[i]);
-		nvfx->hw_rt[i].offset = fb->cbufs[i]->offset;
-		nvfx->hw_rt[i].pitch = ((struct nv04_surface *)fb->cbufs[i])->pitch;
+		if (color_format) {
+			if(color_format != fb->cbufs[i]->format)
+				return -1; /* color buffers with differing formats */
+		} else
+			color_format = fb->cbufs[i]->format;
+
+		if(!nvfx_surface_swizzled_renderable(fb, fb->cbufs[i]))
+			all_swizzled = 0;
 	}
-	for(; i < 4; ++i)
-		nvfx->hw_rt[i].bo = 0;
 
+	if (fb->zsbuf) {
+		/* TODO: return FALSE if we have a format not supporting a depth buffer (e.g. r8); currently those are not supported at all */
+		if(!nvfx_surface_swizzled_renderable(fb, fb->zsbuf))
+			all_swizzled = 0;
+
+		if(all_swizzled && color_format && util_format_get_blocksize(color_format) != util_format_get_blocksize(fb->zsbuf->format))
+			all_swizzled = 0; /* color and depth block sizes differ */
+	}
+
+	for (i = 0; i < fb->nr_cbufs; i++) {
+		if(!((struct nvfx_surface*)fb->cbufs[i])->temp && !all_swizzled && !nvfx_surface_linear_renderable(fb->cbufs[i]))
+			nvfx_surface_create_temp(&nvfx->pipe, fb->cbufs[i]);
+	}
+
+	if(fb->zsbuf) {
+		if(!((struct nvfx_surface*)fb->zsbuf)->temp && !all_swizzled && !nvfx_surface_linear_renderable(fb->zsbuf))
+			nvfx_surface_create_temp(&nvfx->pipe, fb->zsbuf);
+	}
+
+	return all_swizzled; /* 1: swizzled layout usable for every bound surface, 0: otherwise */
+}
+
+void
+nvfx_framebuffer_validate(struct nvfx_context *nvfx, unsigned prepare_result)
+{
+	struct pipe_framebuffer_state *fb = &nvfx->framebuffer;
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	uint32_t rt_enable, rt_format;
+	int i;
+	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
+	unsigned w = fb->width;
+	unsigned h = fb->height;
+
+	rt_enable = (NV34TCL_RT_ENABLE_COLOR0 << fb->nr_cbufs) - 1;
 	if (rt_enable & (NV34TCL_RT_ENABLE_COLOR1 |
 			 NV40TCL_RT_ENABLE_COLOR2 | NV40TCL_RT_ENABLE_COLOR3))
 		rt_enable |= NV34TCL_RT_ENABLE_MRT;
 
+	nvfx->state.render_temps = 0;
+
+	for (i = 0; i < fb->nr_cbufs; i++)
+		nvfx->state.render_temps |= nvfx_surface_get_render_target(fb->cbufs[i], prepare_result, &nvfx->hw_rt[i]) << i;
+
+	for(; i < 4; ++i)
+		nvfx->hw_rt[i].bo = 0;
+
 	if (fb->zsbuf) {
-		zeta_format = fb->zsbuf->format;
-		nvfx->hw_zeta.bo = nvfx_surface_buffer(fb->zsbuf);
-		nvfx->hw_zeta.offset = fb->zsbuf->offset;
-		nvfx->hw_zeta.pitch = ((struct nv04_surface *)fb->zsbuf)->pitch;
-	}
-	else
-		nvfx->hw_zeta.bo = 0;
-
-	if (rt_enable & (NV34TCL_RT_ENABLE_COLOR0 | NV34TCL_RT_ENABLE_COLOR1 |
-		NV40TCL_RT_ENABLE_COLOR2 | NV40TCL_RT_ENABLE_COLOR3)) {
-		/* Render to at least a colour buffer */
-		if (!(fb->cbufs[0]->texture->flags & NVFX_RESOURCE_FLAG_LINEAR)) {
-			assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1)));
-			for (i = 1; i < fb->nr_cbufs; i++)
-				assert(!(fb->cbufs[i]->texture->flags & NVFX_RESOURCE_FLAG_LINEAR));
-
-			rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED |
-				(log2i(fb->cbufs[0]->width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) |
-				(log2i(fb->cbufs[0]->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT);
-		}
-		else
-			rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
-	} else if (fb->zsbuf) {
-		depth_only = 1;
-
-		/* Render to depth buffer only */
-		if (!(fb->zsbuf->texture->usage & NVFX_RESOURCE_FLAG_LINEAR)) {
-			assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1)));
-
-			rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED |
-				(log2i(fb->zsbuf->width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) |
-				(log2i(fb->zsbuf->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT);
-		}
-		else
-			rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
-	} else {
-		return;
+		nvfx->state.render_temps |= nvfx_surface_get_render_target(fb->zsbuf, prepare_result, &nvfx->hw_zeta) << 7;
+
+		assert(util_format_get_stride(fb->zsbuf->format, fb->width) <= nvfx->hw_zeta.pitch);
+		assert(nvfx->hw_zeta.offset + nvfx->hw_zeta.pitch * fb->height <= nvfx->hw_zeta.bo->size);
 	}
 
-	switch (colour_format) {
-	case PIPE_FORMAT_B8G8R8X8_UNORM:
-		rt_format |= NV34TCL_RT_FORMAT_COLOR_X8R8G8B8;
-		break;
-	case PIPE_FORMAT_B8G8R8A8_UNORM:
-	case 0:
-		rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8;
-		break;
-	case PIPE_FORMAT_B5G6R5_UNORM:
+	if (prepare_result) {
+		assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1)));
+
+		rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED |
+			(util_logbase2(fb->width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) |
+			(util_logbase2(fb->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT);
+	} else
+		rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
+
+	if(fb->nr_cbufs > 0) {
+		switch (fb->cbufs[0]->format) {
+		case PIPE_FORMAT_B8G8R8X8_UNORM:
+			rt_format |= NV34TCL_RT_FORMAT_COLOR_X8R8G8B8;
+			break;
+		case PIPE_FORMAT_B8G8R8A8_UNORM:
+		case 0:
+			rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8;
+			break;
+		case PIPE_FORMAT_B5G6R5_UNORM:
+			rt_format |= NV34TCL_RT_FORMAT_COLOR_R5G6B5;
+			break;
+		default:
+			assert(0);
+		}
+	} else if(fb->zsbuf && util_format_get_blocksize(fb->zsbuf->format) == 2)
 		rt_format |= NV34TCL_RT_FORMAT_COLOR_R5G6B5;
-		colour_bits = 16;
-		break;
-	default:
-		assert(0);
-	}
+	else
+		rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8;
 
-	switch (zeta_format) {
-	case PIPE_FORMAT_Z16_UNORM:
+	if(fb->zsbuf) {
+		switch (fb->zsbuf->format) {
+		case PIPE_FORMAT_Z16_UNORM:
+			rt_format |= NV34TCL_RT_FORMAT_ZETA_Z16;
+			break;
+		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+		case PIPE_FORMAT_X8Z24_UNORM:
+		case 0:
+			rt_format |= NV34TCL_RT_FORMAT_ZETA_Z24S8;
+			break;
+		default:
+			assert(0);
+		}
+	} else if(fb->nr_cbufs && util_format_get_blocksize(fb->cbufs[0]->format) == 2)
 		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z16;
-		zeta_bits = 16;
-		break;
-	case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
-	case PIPE_FORMAT_X8Z24_UNORM:
-	case 0:
+	else
 		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z24S8;
-		break;
-	default:
-		assert(0);
-	}
 
-	if ((!nvfx->is_nv4x) && colour_bits > zeta_bits) {
-		/* TODO: does this limitation really exist?
-		   TODO: can it be worked around somehow? */
-		assert(0);
-	}
+	if ((rt_enable & NV34TCL_RT_ENABLE_COLOR0) || fb->zsbuf) {
+		struct nvfx_render_target *rt0 = &nvfx->hw_rt[0];
+		uint32_t pitch;
 
-	if ((rt_enable & NV34TCL_RT_ENABLE_COLOR0)
-		|| ((!nvfx->is_nv4x) && depth_only)) {
-		struct nvfx_render_target *rt0 = (depth_only ? &nvfx->hw_zeta : &nvfx->hw_rt[0]);
-		uint32_t pitch = rt0->pitch;
+		if(!(rt_enable & NV34TCL_RT_ENABLE_COLOR0))
+			rt0 = &nvfx->hw_zeta;
+
+		pitch = rt0->pitch;
 
 		if(!nvfx->is_nv4x)
 		{
-			if (nvfx->hw_zeta.bo) {
+			if (nvfx->hw_zeta.bo)
 				pitch |= (nvfx->hw_zeta.pitch << 16);
-			} else {
+			else
 				pitch |= (pitch << 16);
-			}
 		}
 
+		//printf("rendering to bo %p [%i] at offset %i with pitch %i\n", rt0->bo, rt0->bo->handle, rt0->offset, pitch);
+
 		OUT_RING(chan, RING_3D(NV34TCL_DMA_COLOR0, 1));
 		OUT_RELOC(chan, rt0->bo, 0,
 			      rt_flags | NOUVEAU_BO_OR,
@@ -182,7 +238,7 @@ nvfx_state_framebuffer_validate(struct nvfx_context *nvfx)
 		}
 	}
 
-	if (zeta_format) {
+	if (fb->zsbuf) {
 		OUT_RING(chan, RING_3D(NV34TCL_DMA_ZETA, 1));
 		OUT_RELOC(chan, nvfx->hw_zeta.bo, 0,
 			      rt_flags | NOUVEAU_BO_OR,
@@ -196,6 +252,10 @@ nvfx_state_framebuffer_validate(struct nvfx_context *nvfx)
 			OUT_RING(chan, nvfx->hw_zeta.pitch);
 		}
 	}
+	else if(nvfx->is_nv4x) {
+		OUT_RING(chan, RING_3D(NV40TCL_ZETA_PITCH, 1));
+		OUT_RING(chan, 64);
+	}
 
 	OUT_RING(chan, RING_3D(NV34TCL_RT_ENABLE, 1));
 	OUT_RING(chan, rt_enable);
@@ -218,6 +278,7 @@ nvfx_state_framebuffer_validate(struct nvfx_context *nvfx)
 		OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_TX_ORIGIN, 1));
 		OUT_RING(chan, 0);
 	}
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAMEBUFFER;
 }
 
 void
@@ -247,4 +308,5 @@ nvfx_framebuffer_relocate(struct nvfx_context *nvfx)
 	DO(NV40, 3);
 
 	DO_(nvfx->hw_zeta, NV34, ZETA);
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAMEBUFFER;
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_state_stipple.c b/src/gallium/drivers/nvfx/nvfx_state_stipple.c
index 4da968f093..b76e9dd382 100644
--- a/src/gallium/drivers/nvfx/nvfx_state_stipple.c
+++ b/src/gallium/drivers/nvfx/nvfx_state_stipple.c
@@ -4,23 +4,8 @@ void
 nvfx_state_stipple_validate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel *chan = nvfx->screen->base.channel;
-	struct pipe_rasterizer_state *rast = &nvfx->rasterizer->pipe;
 
-	if ((rast->poly_stipple_enable == 0 && nvfx->state.stipple_enabled == 0))
-		return;
-
-	if (rast->poly_stipple_enable) {
-		unsigned i;
-
-		WAIT_RING(chan, 35);
-		OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_ENABLE, 1));
-		OUT_RING(chan, 1);
-		OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32));
-		for (i = 0; i < 32; i++)
-			OUT_RING(chan, nvfx->stipple[i]);
-	} else {
-		WAIT_RING(chan, 2);
-		OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_ENABLE, 1));
-		OUT_RING(chan, 0);
-	}
+	WAIT_RING(chan, 33);
+	OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32));
+	OUT_RINGp(chan, nvfx->stipple, 32);
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_surface.c b/src/gallium/drivers/nvfx/nvfx_surface.c
index a605d2b754..a5931b6e15 100644
--- a/src/gallium/drivers/nvfx/nvfx_surface.c
+++ b/src/gallium/drivers/nvfx/nvfx_surface.c
@@ -26,33 +26,421 @@
  *
  **************************************************************************/
 
+#include "pipe/p_context.h"
+#include "pipe/p_format.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_pack_color.h"
+#include "util/u_blitter.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_screen.h"
 #include "nvfx_context.h"
+#include "nvfx_screen.h"
 #include "nvfx_resource.h"
-#include "pipe/p_defines.h"
-#include "util/u_inlines.h"
-#include "util/u_pack_color.h"
+#include "nv04_2d.h"
+
+#include <nouveau/nouveau_bo.h>
+
+static INLINE void
+nvfx_region_set_format(struct nv04_region* rgn, enum pipe_format format)
+{
+	unsigned bits = util_format_get_blocksizebits(format);
+	switch(bits)
+	{
+	case 8:
+		rgn->bpps = 0;
+		break;
+	case 16:
+		rgn->bpps = 1;
+		break;
+	case 32:
+		rgn->bpps = 2;
+		break;
+	default:
+		{
+			int shift;
+			assert(util_is_power_of_two(bits));
+			shift = util_logbase2(bits) - 3;
+			assert(shift >= 2);
+			rgn->bpps = 2;
+			shift -= 2;
+
+			rgn->x = util_format_get_nblocksx(format, rgn->x) << shift;
+			rgn->y = util_format_get_nblocksy(format, rgn->y);
+		}
+	}
+}
+
+static INLINE void
+nvfx_region_fixup_swizzled(struct nv04_region* rgn, unsigned zslice, unsigned width, unsigned height, unsigned depth)
+{
+	// TODO: move this code to surface creation?
+	if((depth <= 1) && (height <= 1 || width <= 2))
+		rgn->pitch = width << rgn->bpps;
+	else if(depth > 1 && height <= 2 && width <= 2)
+	{
+		rgn->pitch = width << rgn->bpps;
+		rgn->offset += (zslice * width * height) << rgn->bpps;
+	}
+	else
+	{
+		rgn->pitch = 0;
+		rgn->z = zslice;
+		rgn->w = width;
+		rgn->h = height;
+		rgn->d = depth;
+	}
+}
+
+static INLINE void
+nvfx_region_init_for_surface(struct nv04_region* rgn, struct nvfx_surface* surf, unsigned x, unsigned y, bool for_write)
+{
+	rgn->x = x;
+	rgn->y = y;
+	rgn->z = 0;
+	nvfx_region_set_format(rgn, surf->base.base.format);
+
+	if(surf->temp)
+	{
+		rgn->bo = surf->temp->base.bo;
+		rgn->offset = 0;
+		rgn->pitch = surf->temp->linear_pitch;
+
+		if(for_write)
+			util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(&surf->base.base), &surf->base);
+	} else {
+		rgn->bo = ((struct nvfx_resource*)surf->base.base.texture)->bo;
+		rgn->offset = surf->base.base.offset;
+		rgn->pitch = surf->pitch;
+
+	        if(!(surf->base.base.texture->flags & NVFX_RESOURCE_FLAG_LINEAR))
+		        nvfx_region_fixup_swizzled(rgn, surf->base.base.zslice, surf->base.base.width, surf->base.base.height, u_minify(surf->base.base.texture->depth0, surf->base.base.level));
+	}
+}
+
+static INLINE void
+nvfx_region_init_for_subresource(struct nv04_region* rgn, struct pipe_resource* pt, struct pipe_subresource sub, unsigned x, unsigned y, unsigned z, bool for_write)
+{
+	if(pt->target != PIPE_BUFFER)
+	{
+		struct nvfx_surface* ns = (struct nvfx_surface*)util_surfaces_peek(&((struct nvfx_miptree*)pt)->surfaces, pt, sub.face, sub.level, z);
+		if(ns && util_dirty_surface_is_dirty(&ns->base))
+		{
+			nvfx_region_init_for_surface(rgn, ns, x, y, for_write);
+			return;
+		}
+	}
+
+	rgn->bo = ((struct nvfx_resource*)pt)->bo;
+	rgn->offset = nvfx_subresource_offset(pt, sub.face, sub.level, z);
+	rgn->pitch = nvfx_subresource_pitch(pt, sub.level);
+	rgn->x = x;
+	rgn->y = y;
+	rgn->z = 0;
+
+	nvfx_region_set_format(rgn, pt->format);
+	if(!(pt->flags & NVFX_RESOURCE_FLAG_LINEAR))
+		nvfx_region_fixup_swizzled(rgn, z, u_minify(pt->width0, sub.level), u_minify(pt->height0, sub.level), u_minify(pt->depth0, sub.level));
+}
+
+// TODO: actually test this for all formats, it's probably wrong for some...
+
+static INLINE int
+nvfx_surface_format(enum pipe_format format)
+{
+	switch(util_format_get_blocksize(format)) {
+	case 1:
+		return NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
+	case 2:
+		//return NV04_CONTEXT_SURFACES_2D_FORMAT_Y16;
+		return NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5;
+	case 4:
+		//if(format == PIPE_FORMAT_B8G8R8X8_UNORM || format == PIPE_FORMAT_B8G8R8A8_UNORM)
+			return NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8;
+		//else
+		//	return NV04_CONTEXT_SURFACES_2D_FORMAT_Y32;
+	default:
+		return -1;
+	}
+}
+
+static INLINE int
+nv04_scaled_image_format(enum pipe_format format)
+{
+	switch(util_format_get_blocksize(format)) {
+	case 1:
+		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_Y8;
+	case 2:
+		//if(format == PIPE_FORMAT_B5G5R5A1_UNORM)
+		//	return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5;
+		//else
+			return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5;
+	case 4:
+		if(format == PIPE_FORMAT_B8G8R8X8_UNORM)
+			return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8;
+		else
+			return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8;
+	default:
+		return -1;
+	}
+}
+
+// XXX: must save index buffer too!
+static struct blitter_context*
+nvfx_get_blitter(struct pipe_context* pipe, int copy)
+{
+	struct nvfx_context* nvfx = nvfx_context(pipe);
+
+	struct blitter_context* blitter = nvfx->blitter;
+	if(!blitter)
+		nvfx->blitter = blitter = util_blitter_create(pipe);
+
+	util_blitter_save_blend(blitter, nvfx->blend);
+	util_blitter_save_depth_stencil_alpha(blitter, nvfx->zsa);
+	util_blitter_save_stencil_ref(blitter, &nvfx->stencil_ref);
+	util_blitter_save_rasterizer(blitter, nvfx->rasterizer);
+	util_blitter_save_fragment_shader(blitter, nvfx->fragprog);
+	util_blitter_save_vertex_shader(blitter, nvfx->vertprog);
+	util_blitter_save_viewport(blitter, &nvfx->viewport);
+	util_blitter_save_framebuffer(blitter, &nvfx->framebuffer);
+	util_blitter_save_clip(blitter, &nvfx->clip);
+	util_blitter_save_vertex_elements(blitter, nvfx->vtxelt);
+	util_blitter_save_vertex_buffers(blitter, nvfx->vtxbuf_nr, nvfx->vtxbuf);
+
+	if(copy)
+	{
+		util_blitter_save_fragment_sampler_states(blitter, nvfx->nr_samplers, (void**)nvfx->tex_sampler);
+		util_blitter_save_fragment_sampler_views(blitter, nvfx->nr_textures, nvfx->fragment_sampler_views);
+	}
+
+	return blitter;
+}
+
+static unsigned
+nvfx_region_clone(struct nv04_2d_context* ctx, struct nv04_region* rgn, unsigned w, unsigned h, boolean for_read)
+{
+	unsigned begin = nv04_region_begin(rgn, w, h);
+	unsigned end = nv04_region_end(rgn, w, h);
+	unsigned size = end - begin;
+	struct nouveau_bo* bo = 0;
+	nouveau_bo_new(rgn->bo->device, NOUVEAU_BO_MAP | NOUVEAU_BO_GART, 256, size, &bo);
+
+	if(for_read || (size > ((w * h) << rgn->bpps)))
+		nv04_memcpy(ctx, bo, 0, rgn->bo, rgn->offset + begin, size);
+
+	rgn->bo = bo;
+	rgn->offset = -begin;
+	return begin;
+}
 
 static void
-nvfx_surface_copy(struct pipe_context *pipe,
-		  struct pipe_resource *dest, struct pipe_subresource subdst,
-		  unsigned destx, unsigned desty, unsigned destz,
-		  struct pipe_resource *src, struct pipe_subresource subsrc,
+nvfx_resource_copy_region(struct pipe_context *pipe,
+		  struct pipe_resource *dstr, struct pipe_subresource subdst,
+		  unsigned dstx, unsigned dsty, unsigned dstz,
+		  struct pipe_resource *srcr, struct pipe_subresource subsrc,
 		  unsigned srcx, unsigned srcy, unsigned srcz,
-		  unsigned width, unsigned height)
+		  unsigned w, unsigned h)
 {
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nv04_surface_2d *eng2d = nvfx->screen->eng2d;
-	struct pipe_surface *ps_dst, *ps_src;
+	static int copy_threshold = -1;
+	struct nv04_2d_context *ctx = nvfx_screen(pipe->screen)->eng2d;
+	struct nv04_region dst, src;
+	int dst_to_gpu;
+	int src_on_gpu;
+	boolean small;
+	int ret;
+
+	if(!w || !h)
+		return;
+
+	if(copy_threshold < 0)
+		copy_threshold = debug_get_num_option("NOUVEAU_COPY_THRESHOLD", 4);
+
+	dst_to_gpu = dstr->usage != PIPE_USAGE_DYNAMIC && dstr->usage != PIPE_USAGE_STAGING;
+	src_on_gpu = nvfx_resource_on_gpu(srcr);
+
+	nvfx_region_init_for_subresource(&dst, dstr, subdst, dstx, dsty, dstz, TRUE);
+	nvfx_region_init_for_subresource(&src, srcr, subsrc, srcx, srcy, srcz, FALSE);
+	w = util_format_get_stride(dstr->format, w) >> dst.bpps;
+	h = util_format_get_nblocksy(dstr->format, h);
 
-	ps_src = nvfx_miptree_surface_new(pipe->screen, src, subsrc.face,
-					  subsrc.level, srcz, 0 /* bind flags */);
-	ps_dst = nvfx_miptree_surface_new(pipe->screen, dest, subdst.face,
-					  subdst.level, destz, 0 /* bindflags */);
+	small = (w * h <= copy_threshold);
+	if((!dst_to_gpu || !src_on_gpu) && small)
+		ret = -1; /* use the CPU */
+	else
+		ret = nv04_region_copy_2d(ctx, &dst, &src, w, h,
+			dstr->target == PIPE_BUFFER ? -1 : nvfx_surface_format(dstr->format),
+			dstr->target == PIPE_BUFFER ? -1 : nv04_scaled_image_format(dstr->format),
+			dst_to_gpu, src_on_gpu);
+	if(!ret)
+	{}
+	else if(ret > 0 && dstr->bind & PIPE_BIND_RENDER_TARGET && srcr->bind & PIPE_BIND_SAMPLER_VIEW)
+	{
+		struct blitter_context* blitter = nvfx_get_blitter(pipe, 1);
+		util_blitter_copy_region(blitter, dstr, subdst, dstx, dsty, dstz, srcr, subsrc, srcx, srcy, srcz, w, h, TRUE);
+	}
+	else
+	{
+		struct nv04_region dstt = dst;
+		struct nv04_region srct = src;
+		unsigned dstbegin = 0;
 
-	eng2d->copy(eng2d, ps_dst, destx, desty, ps_src, srcx, srcy, width, height);
+		if(!small)
+		{
+			if(src_on_gpu)
+				nvfx_region_clone(ctx, &srct, w, h, TRUE);
 
-	nvfx_miptree_surface_del(ps_src);
-	nvfx_miptree_surface_del(ps_dst);
+			if(dst_to_gpu)
+				dstbegin = nvfx_region_clone(ctx, &dstt, w, h, FALSE);
+		}
+
+		nv04_region_copy_cpu(&dstt, &srct, w, h);
+
+		if(srct.bo != src.bo)
+			nouveau_screen_bo_release(pipe->screen, srct.bo);
+
+		if(dstt.bo != dst.bo)
+		{
+			nv04_memcpy(ctx, dst.bo, dst.offset + dstbegin, dstt.bo, 0, dstt.bo->size);
+			nouveau_screen_bo_release(pipe->screen, dstt.bo);
+		}
+	}
+}
+
+static int
+nvfx_surface_fill(struct pipe_context* pipe, struct pipe_surface *dsts,
+		  unsigned dx, unsigned dy, unsigned w, unsigned h, unsigned value)
+{
+	struct nv04_2d_context *ctx = nvfx_screen(pipe->screen)->eng2d;
+	struct nv04_region dst;
+	int ret;
+	/* Always try to use the GPU right now, if possible
+	 * If the user wanted the surface data on the CPU, he would have cleared with memset (hopefully) */
+
+	// we don't care about interior pixel order since we set them all to the same value
+	nvfx_region_init_for_surface(&dst, (struct nvfx_surface*)dsts, dx, dy, TRUE);
+
+	w = util_format_get_stride(dsts->format, w) >> dst.bpps;
+	h = util_format_get_nblocksy(dsts->format, h);
+
+	ret = nv04_region_fill_2d(ctx, &dst, w, h, value);
+	if(ret > 0 && dsts->texture->bind & PIPE_BIND_RENDER_TARGET)
+		return 1;
+	else if(ret)
+	{
+		struct nv04_region dstt = dst;
+		unsigned dstbegin = 0;
+
+		if(nvfx_resource_on_gpu(dsts->texture))
+			dstbegin = nvfx_region_clone(ctx, &dstt, w, h, FALSE);
+
+		nv04_region_fill_cpu(&dstt, w, h, value);
+
+		if(dstt.bo != dst.bo)
+		{
+			nv04_memcpy(ctx, dst.bo, dst.offset + dstbegin, dstt.bo, 0, dstt.bo->size);
+			nouveau_screen_bo_release(pipe->screen, dstt.bo);
+		}
+	}
+
+	return 0;
+}
+
+
+void
+nvfx_screen_surface_takedown(struct pipe_screen *pscreen)
+{
+	nv04_2d_context_takedown(nvfx_screen(pscreen)->eng2d);
+	nvfx_screen(pscreen)->eng2d = 0;
+}
+
+int
+nvfx_screen_surface_init(struct pipe_screen *pscreen)
+{
+	struct nv04_2d_context* ctx = nv04_2d_context_init(nouveau_screen(pscreen)->channel);
+	if(!ctx)
+		return -1;
+	nvfx_screen(pscreen)->eng2d = ctx;
+	return 0;
+}
+
+static void
+nvfx_surface_copy_temp(struct pipe_context* pipe, struct pipe_surface* surf, int to_temp)
+{
+	struct nvfx_surface* ns = (struct nvfx_surface*)surf;
+	struct pipe_subresource tempsr, surfsr;
+	struct nvfx_context* nvfx = nvfx_context(pipe);
+
+	// TODO: we really should do this validation before setting these variables in draw calls
+	unsigned use_vertex_buffers = nvfx->use_vertex_buffers;
+	boolean use_index_buffer = nvfx->use_index_buffer;
+	unsigned base_vertex = nvfx->base_vertex;
+
+	tempsr.face = 0;
+	tempsr.level = 0;
+	surfsr.face = surf->face;
+	surfsr.level = surf->level;
+
+	if(to_temp)
+		nvfx_resource_copy_region(pipe, &ns->temp->base.base, tempsr, 0, 0, 0, surf->texture, surfsr, 0, 0, surf->zslice, surf->width, surf->height);
+	else
+		nvfx_resource_copy_region(pipe, surf->texture, surfsr, 0, 0, surf->zslice, &ns->temp->base.base, tempsr, 0, 0, 0, surf->width, surf->height);
+
+	nvfx->use_vertex_buffers = use_vertex_buffers;
+	nvfx->use_index_buffer = use_index_buffer;
+        nvfx->base_vertex = base_vertex;
+
+	nvfx->dirty |= NVFX_NEW_ARRAYS;
+	nvfx->draw_dirty |= NVFX_NEW_ARRAYS;
+}
+
+void
+nvfx_surface_create_temp(struct pipe_context* pipe, struct pipe_surface* surf)
+{
+	struct nvfx_surface* ns = (struct nvfx_surface*)surf;
+	struct pipe_resource template;
+	memset(&template, 0, sizeof(struct pipe_resource));
+	template.target = PIPE_TEXTURE_2D;
+	template.format = surf->format;
+	template.width0 = surf->width;
+	template.height0 = surf->height;
+	template.depth0 = 1;
+	template.nr_samples = surf->texture->nr_samples;
+	template.flags = NVFX_RESOURCE_FLAG_LINEAR;
+
+	ns->temp = (struct nvfx_miptree*)nvfx_miptree_create(pipe->screen, &template);
+	nvfx_surface_copy_temp(pipe, surf, 1);
+}
+
+void
+nvfx_surface_flush(struct pipe_context* pipe, struct pipe_surface* surf)
+{
+	struct nvfx_context* nvfx = (struct nvfx_context*)pipe;
+	struct nvfx_surface* ns = (struct nvfx_surface*)surf;
+	boolean bound = FALSE;
+
+	/* must be done before the copy, otherwise the copy will use the temp as destination */
+	util_dirty_surface_set_clean(nvfx_surface_get_dirty_surfaces(surf), &ns->base);
+
+	nvfx_surface_copy_temp(pipe, surf, 0);
+
+	if(nvfx->framebuffer.zsbuf == surf)
+		bound = TRUE;
+	else
+	{
+		for(unsigned i = 0; i < nvfx->framebuffer.nr_cbufs; ++i)
+		{
+			if(nvfx->framebuffer.cbufs[i] == surf)
+			{
+				bound = TRUE;
+				break;
+			}
+		}
+	}
+
+	if(!bound)
+		pipe_resource_reference((struct pipe_resource**)&ns->temp, 0);
 }
 
 static void
@@ -62,12 +450,16 @@ nvfx_clear_render_target(struct pipe_context *pipe,
 			 unsigned dstx, unsigned dsty,
 			 unsigned width, unsigned height)
 {
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nv04_surface_2d *eng2d = nvfx->screen->eng2d;
 	union util_color uc;
 	util_pack_color(rgba, dst->format, &uc);
 
-	eng2d->fill(eng2d, dst, dstx, dsty, width, height, uc.ui);
+	if(util_format_get_blocksizebits(dst->format) > 32
+		|| nvfx_surface_fill(pipe, dst, dstx, dsty, width, height, uc.ui))
+	{
+		// TODO: probably should use hardware clear here instead if possible
+		struct blitter_context* blitter = nvfx_get_blitter(pipe, 0);
+		util_blitter_clear_render_target(blitter, dst, rgba, dstx, dsty, width, height);
+	}
 }
 
 static void
@@ -79,18 +471,20 @@ nvfx_clear_depth_stencil(struct pipe_context *pipe,
 			 unsigned dstx, unsigned dsty,
 			 unsigned width, unsigned height)
 {
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nv04_surface_2d *eng2d = nvfx->screen->eng2d;
-
-	eng2d->fill(eng2d, dst, dstx, dsty, width, height,
-		    util_pack_z_stencil(dst->format, depth, stencil));
+	if(util_format_get_blocksizebits(dst->format) > 32
+		|| nvfx_surface_fill(pipe, dst, dstx, dsty, width, height, util_pack_z_stencil(dst->format, depth, stencil)))
+	{
+		// TODO: probably should use hardware clear here instead if possible
+		struct blitter_context* blitter = nvfx_get_blitter(pipe, 0);
+		util_blitter_clear_depth_stencil(blitter, dst, clear_flags, depth, stencil, dstx, dsty, width, height);
+	}
 }
 
 
 void
 nvfx_init_surface_functions(struct nvfx_context *nvfx)
 {
-	nvfx->pipe.resource_copy_region = nvfx_surface_copy;
+	nvfx->pipe.resource_copy_region = nvfx_resource_copy_region;
 	nvfx->pipe.clear_render_target = nvfx_clear_render_target;
 	nvfx->pipe.clear_depth_stencil = nvfx_clear_depth_stencil;
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_tex.h b/src/gallium/drivers/nvfx/nvfx_tex.h
index 69187a79e7..34be936a89 100644
--- a/src/gallium/drivers/nvfx/nvfx_tex.h
+++ b/src/gallium/drivers/nvfx/nvfx_tex.h
@@ -1,6 +1,11 @@
 #ifndef NVFX_TEX_H_
 #define NVFX_TEX_H_
 
+#include "util/u_math.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include <nouveau/nouveau_class.h>
+
 static inline unsigned
 nvfx_tex_wrap_mode(unsigned wrap) {
 	unsigned ret;
@@ -31,7 +36,7 @@ nvfx_tex_wrap_mode(unsigned wrap) {
 		ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP;
 		break;
 	default:
-		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
+		assert(0);
 		ret = NV34TCL_TX_WRAP_S_REPEAT;
 		break;
 	}
@@ -40,31 +45,29 @@ nvfx_tex_wrap_mode(unsigned wrap) {
 }
 
 static inline unsigned
-nvfx_tex_wrap_compare_mode(const struct pipe_sampler_state* cso)
+nvfx_tex_wrap_compare_mode(unsigned func)
 {
-	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-		switch (cso->compare_func) {
-		case PIPE_FUNC_NEVER:
-			return NV34TCL_TX_WRAP_RCOMP_NEVER;
-		case PIPE_FUNC_GREATER:
-			return NV34TCL_TX_WRAP_RCOMP_GREATER;
-		case PIPE_FUNC_EQUAL:
-			return NV34TCL_TX_WRAP_RCOMP_EQUAL;
-		case PIPE_FUNC_GEQUAL:
-			return NV34TCL_TX_WRAP_RCOMP_GEQUAL;
-		case PIPE_FUNC_LESS:
-			return NV34TCL_TX_WRAP_RCOMP_LESS;
-		case PIPE_FUNC_NOTEQUAL:
-			return NV34TCL_TX_WRAP_RCOMP_NOTEQUAL;
-		case PIPE_FUNC_LEQUAL:
-			return NV34TCL_TX_WRAP_RCOMP_LEQUAL;
-		case PIPE_FUNC_ALWAYS:
-			return NV34TCL_TX_WRAP_RCOMP_ALWAYS;
-		default:
-			break;
-		}
+	switch (func) {
+	case PIPE_FUNC_NEVER:
+		return NV34TCL_TX_WRAP_RCOMP_NEVER;
+	case PIPE_FUNC_GREATER:
+		return NV34TCL_TX_WRAP_RCOMP_GREATER;
+	case PIPE_FUNC_EQUAL:
+		return NV34TCL_TX_WRAP_RCOMP_EQUAL;
+	case PIPE_FUNC_GEQUAL:
+		return NV34TCL_TX_WRAP_RCOMP_GEQUAL;
+	case PIPE_FUNC_LESS:
+		return NV34TCL_TX_WRAP_RCOMP_LESS;
+	case PIPE_FUNC_NOTEQUAL:
+		return NV34TCL_TX_WRAP_RCOMP_NOTEQUAL;
+	case PIPE_FUNC_LEQUAL:
+		return NV34TCL_TX_WRAP_RCOMP_LEQUAL;
+	case PIPE_FUNC_ALWAYS:
+		return NV34TCL_TX_WRAP_RCOMP_ALWAYS;
+	default:
+		assert(0);
+		return 0;
 	}
-	return 0;
 }
 
 static inline unsigned nvfx_tex_filter(const struct pipe_sampler_state* cso)
@@ -128,6 +131,45 @@ struct nvfx_sampler_state {
 	uint32_t en;
 	uint32_t filt;
 	uint32_t bcol;
+	uint32_t min_lod;
+	uint32_t max_lod;
+	boolean compare;
+};
+
+struct nvfx_sampler_view {
+	struct pipe_sampler_view base;
+	int offset;
+	uint32_t swizzle;
+	uint32_t npot_size;
+	uint32_t filt;
+	uint32_t wrap_mask;
+	uint32_t wrap;
+	uint32_t lod_offset;
+	uint32_t max_lod_limit;
+	union
+	{
+		struct
+		{
+			uint32_t fmt[4]; /* nv30 has 4 entries, nv40 one */
+			int rect;
+		} nv30;
+		struct
+		{
+			uint32_t fmt[2]; /* nv30 has 4 entries, nv40 one */
+			uint32_t npot_size2; /* nv40 only */
+		} nv40;
+		uint32_t init_fmt;
+	} u;
 };
 
+struct nvfx_texture_format {
+	int fmt[6];
+	unsigned sign;
+	unsigned wrap;
+	unsigned char src[6];
+	unsigned char comp[6];
+};
+
+extern struct nvfx_texture_format nvfx_texture_formats[PIPE_FORMAT_COUNT];
+
 #endif /* NVFX_TEX_H_ */
diff --git a/src/gallium/drivers/nvfx/nvfx_transfer.c b/src/gallium/drivers/nvfx/nvfx_transfer.c
index 9ff0a93d30..7cb47a20f6 100644
--- a/src/gallium/drivers/nvfx/nvfx_transfer.c
+++ b/src/gallium/drivers/nvfx/nvfx_transfer.c
@@ -4,204 +4,218 @@
 #include "util/u_format.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
-#include "nouveau/nouveau_winsys.h"
+#include "util/u_staging.h"
 #include "nvfx_context.h"
 #include "nvfx_screen.h"
 #include "nvfx_state.h"
 #include "nvfx_resource.h"
 #include "nvfx_transfer.h"
 
-struct nvfx_transfer {
-	struct pipe_transfer base;
-	struct pipe_surface *surface;
-	boolean direct;
-};
-
-static void
-nvfx_compatible_transfer_tex(struct pipe_resource *pt, unsigned width, unsigned height,
-			     unsigned bind,
-                             struct pipe_resource *template)
-{
-	memset(template, 0, sizeof(struct pipe_resource));
-	template->target = pt->target;
-	template->format = pt->format;
-	template->width0 = width;
-	template->height0 = height;
-	template->depth0 = 1;
-	template->last_level = 0;
-	template->nr_samples = pt->nr_samples;
-	template->bind = bind;
-	template->usage = PIPE_USAGE_DYNAMIC;
-	template->flags = NVFX_RESOURCE_FLAG_LINEAR;
-}
-
-
-static unsigned nvfx_transfer_bind_flags( unsigned transfer_usage )
+struct nvfx_staging_transfer
 {
-	unsigned bind = 0;
+	struct util_staging_transfer base;
 
-#if 0
-	if (transfer_usage & PIPE_TRANSFER_WRITE)
-		bind |= PIPE_BIND_BLIT_SOURCE;
-
-	if (transfer_usage & PIPE_TRANSFER_READ)
-		bind |= PIPE_BIND_BLIT_DESTINATION;
-#endif
-
-	return bind;
-}
+	unsigned offset;
+	unsigned map_count;
+};
 
 struct pipe_transfer *
-nvfx_miptree_transfer_new(struct pipe_context *pipe,
+nvfx_transfer_new(struct pipe_context *pipe,
 			  struct pipe_resource *pt,
 			  struct pipe_subresource sr,
 			  unsigned usage,
 			  const struct pipe_box *box)
 {
-	struct pipe_screen *pscreen = pipe->screen;
-	struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
-	struct nvfx_transfer *tx;
-	struct pipe_resource tx_tex_template, *tx_tex;
-	static int no_transfer = -1;
-	unsigned bind = nvfx_transfer_bind_flags(usage);
-	if(no_transfer < 0)
-		no_transfer = debug_get_bool_option("NOUVEAU_NO_TRANSFER", FALSE);
-
-
-	tx = CALLOC_STRUCT(nvfx_transfer);
-	if (!tx)
-		return NULL;
-
-	/* Don't handle 3D transfers yet.
-	 */
-	assert(box->depth == 1);
-
-	pipe_resource_reference(&tx->base.resource, pt);
-	tx->base.sr = sr;
-	tx->base.usage = usage;
-	tx->base.box = *box;
-	tx->base.stride = mt->level[sr.level].pitch;
-
-	/* Direct access to texture */
-	if ((pt->usage == PIPE_USAGE_DYNAMIC ||
-	     no_transfer) &&
-	    pt->flags & NVFX_RESOURCE_FLAG_LINEAR)
+        if((usage & (PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_DONTBLOCK)) == PIPE_TRANSFER_DONTBLOCK)
+        {
+                struct nouveau_bo* bo = ((struct nvfx_resource*)pt)->bo;
+                if(bo && nouveau_bo_busy(bo, NOUVEAU_BO_WR))
+                        return NULL;
+        }
+
+	if(pt->target == PIPE_BUFFER)
 	{
-		tx->direct = true;
-
-		/* XXX: just call the internal nvfx function.  
-		 */
-		tx->surface = pscreen->get_tex_surface(pscreen, pt,
-	                                               sr.face, sr.level,
-						       box->z,
-	                                               bind);
-		return &tx->base;
-	}
+		// it would be nice if we could avoid all this ridiculous overhead...
+		struct pipe_transfer* tx;
+		struct nvfx_buffer* buffer = nvfx_buffer(pt);
+
+		tx = CALLOC_STRUCT(pipe_transfer);
+		if (!tx)
+			return NULL;
 
-	tx->direct = false;
+		pipe_resource_reference(&tx->resource, pt);
+		tx->sr = sr;
+		tx->usage = usage;
+		tx->box = *box;
 
-	nvfx_compatible_transfer_tex(pt, box->width, box->height, bind, &tx_tex_template);
+		tx->slice_stride = tx->stride = util_format_get_stride(pt->format, box->width);
+		tx->data = buffer->data + util_format_get_stride(pt->format, box->x);
 
-	tx_tex = pscreen->resource_create(pscreen, &tx_tex_template);
-	if (!tx_tex)
+		return tx;
+	}
+	else
 	{
-		FREE(tx);
-		return NULL;
+	        struct nvfx_staging_transfer* tx;
+	        bool direct = !nvfx_resource_on_gpu(pt) && pt->flags & NVFX_RESOURCE_FLAG_LINEAR;
+
+	        tx = CALLOC_STRUCT(nvfx_staging_transfer);
+	        if(!tx)
+	        	return NULL;
+
+	        util_staging_transfer_init(pipe, pt, sr, usage, box, direct, &tx->base);
+
+		if(direct)
+		{
+			tx->base.base.stride = nvfx_subresource_pitch(pt, sr.level);
+			tx->base.base.slice_stride = tx->base.base.stride * u_minify(pt->height0, sr.level);
+			tx->offset = nvfx_subresource_offset(pt, sr.face, sr.level, box->z)
+				+ util_format_get_2d_size(pt->format, tx->base.base.stride, box->y)
+				+ util_format_get_stride(pt->format, box->x);
+		}
+		else
+		{
+			tx->base.base.stride = nvfx_subresource_pitch(tx->base.staging_resource, 0);
+			tx->base.base.slice_stride = tx->base.base.stride * tx->base.staging_resource->height0;
+			tx->offset = 0;
+		}
+
+		assert(tx->base.base.stride);
+
+		return &tx->base.base;
 	}
+}
 
-	tx->base.stride = ((struct nvfx_miptree*)tx_tex)->level[0].pitch;
-
-	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
-	                                       0, 0, 0,
-	                                       bind);
-
-	pipe_resource_reference(&tx_tex, NULL);
-
-	if (!tx->surface)
+static void nvfx_buffer_dirty_interval(struct nvfx_buffer* buffer, unsigned begin, unsigned size, boolean unsynchronized)
+{
+	struct nvfx_screen* screen = nvfx_screen(buffer->base.base.screen);
+	buffer->last_update_static = buffer->bytes_to_draw_until_static < 0;
+	if(buffer->dirty_begin == buffer->dirty_end)
 	{
-		pipe_surface_reference(&tx->surface, NULL);
-		FREE(tx);
-		return NULL;
+		buffer->dirty_begin = begin;
+		buffer->dirty_end = begin + size;
+		buffer->dirty_unsynchronized = unsynchronized;
+	}
+	else
+	{
+		buffer->dirty_begin = MIN2(buffer->dirty_begin, begin);
+		buffer->dirty_end = MAX2(buffer->dirty_end, begin + size);
+		buffer->dirty_unsynchronized &= unsynchronized;
 	}
 
-	if (usage & PIPE_TRANSFER_READ) {
-		struct nvfx_screen *nvscreen = nvfx_screen(pscreen);
-		struct pipe_surface *src;
+	if(unsynchronized)
+	{
+		// TODO: revisit this, it doesn't seem quite right
+		//printf("UNSYNC UPDATE %p %u %u\n", buffer, begin, size);
+		buffer->bytes_to_draw_until_static += size * screen->static_reuse_threshold;
+	}
+	else
+		buffer->bytes_to_draw_until_static = buffer->size * screen->static_reuse_threshold;
+}
 
-		src = pscreen->get_tex_surface(pscreen, pt,
-	                                       sr.face, sr.level, box->z,
-	                                       0 /*PIPE_BIND_BLIT_SOURCE*/);
+static void nvfx_transfer_flush_region( struct pipe_context *pipe, /* installed as pipe->transfer_flush_region */
+				      struct pipe_transfer *ptx,
+				      const struct pipe_box *box)
+{
+	if(ptx->resource->target == PIPE_BUFFER && (ptx->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) /* anything else is a no-op here */
+	{
+		struct nvfx_buffer* buffer = nvfx_buffer(ptx->resource);
+		nvfx_buffer_dirty_interval(buffer,
+				(uint8_t*)ptx->data - buffer->data + util_format_get_stride(buffer->base.base.format, box->x), /* begin: transfer start within buffer->data plus the box->x offset */
+				util_format_get_stride(buffer->base.base.format, box->width), /* size: box->width converted with the same helper */
+				!!(ptx->usage & PIPE_TRANSFER_UNSYNCHRONIZED)); /* forwarded as the 'unsynchronized' argument */
+	}
+}
 
-		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
-		/* TODO: Check if SIFM can un-swizzle */
-		nvscreen->eng2d->copy(nvscreen->eng2d,
-		                      tx->surface, 0, 0,
-		                      src,
-				      box->x, box->y,
-		                      box->width, box->height);
+static void
+nvfx_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx)
+{
+	if(ptx->resource->target == PIPE_BUFFER)
+	{
+		struct nvfx_buffer* buffer = nvfx_buffer(ptx->resource);
+		if((ptx->usage & (PIPE_TRANSFER_WRITE | PIPE_TRANSFER_FLUSH_EXPLICIT)) == PIPE_TRANSFER_WRITE)
+			nvfx_buffer_dirty_interval(buffer,
+				(uint8_t*)ptx->data - buffer->data,
+				ptx->stride,
+				!!(ptx->usage & PIPE_TRANSFER_UNSYNCHRONIZED));
+		pipe_resource_reference(&ptx->resource, 0);
+		FREE(ptx);
+	}
+	else
+	{
+		struct nouveau_channel* chan = nvfx_context(pipe)->screen->base.channel;
+		util_staging_transfer_destroy(pipe, ptx);
 
-		pipe_surface_reference(&src, NULL);
+		FIRE_RING(chan);
 	}
+}
 
-	return &tx->base;
+void * /* returns the CPU pointer for the transfer; installed as pipe->transfer_map */
+nvfx_transfer_map(struct pipe_context *pipe, struct pipe_transfer *ptx)
+{
+	if(ptx->resource->target == PIPE_BUFFER)
+		return ptx->data; /* buffers: pointer was already set by nvfx_transfer_new */
+	else
+	{
+		struct nvfx_staging_transfer *tx = (struct nvfx_staging_transfer *)ptx;
+		if(!ptx->data) /* not mapped yet: map the BO once and cache the pointer */
+		{
+			struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->base.staging_resource;
+			uint8_t *map = nouveau_screen_bo_map(pipe->screen, mt->base.bo, nouveau_screen_transfer_flags(ptx->usage));
+			ptx->data = map + tx->offset; /* NOTE(review): map is not checked for NULL before the offset is added */
+		}
+
+		++tx->map_count; /* nested maps are counted; nvfx_transfer_unmap unmaps the BO when this drops to zero */
+		return ptx->data;
+	}
 }
 
 void
-nvfx_miptree_transfer_del(struct pipe_context *pipe,
-			  struct pipe_transfer *ptx)
+nvfx_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *ptx)
 {
-	struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx;
-
-	if (!tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE)) {
-		struct pipe_screen *pscreen = pipe->screen;
-		struct nvfx_screen *nvscreen = nvfx_screen(pscreen);
-		struct pipe_surface *dst;
-
-		dst = pscreen->get_tex_surface(pscreen,
-					       ptx->resource,
-	                                       ptx->sr.face,
-					       ptx->sr.level,
-					       ptx->box.z,
-	                                       0 /*PIPE_BIND_BLIT_DESTINATION*/);
-
-		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
-		nvscreen->eng2d->copy(nvscreen->eng2d,
-		                      dst, ptx->box.x, ptx->box.y,
-		                      tx->surface, 0, 0,
-		                      ptx->box.width, ptx->box.height);
-
-		pipe_surface_reference(&dst, NULL);
+	if(ptx->resource->target != PIPE_BUFFER)
+	{
+		struct nvfx_staging_transfer *tx = (struct nvfx_staging_transfer *)ptx;
+		struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->base.staging_resource;
+
+		if(!--tx->map_count)
+		{
+			nouveau_screen_bo_unmap(pipe->screen, mt->base.bo);
+			ptx->data = 0;
+		}
 	}
-
-	pipe_surface_reference(&tx->surface, NULL);
-	pipe_resource_reference(&ptx->resource, NULL);
-	FREE(ptx);
 }
 
-void *
-nvfx_miptree_transfer_map(struct pipe_context *pipe, struct pipe_transfer *ptx)
+static void nvfx_transfer_inline_write( struct pipe_context *pipe, /* installed as pipe->transfer_inline_write */
+				      struct pipe_resource *pr,
+				      struct pipe_subresource sr,
+				      unsigned usage,
+				      const struct pipe_box *box,
+				      const void *data,
+				      unsigned stride,
+				      unsigned slice_stride)
 {
-	struct pipe_screen *pscreen = pipe->screen;
-	struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx;
-	struct nv04_surface *ns = (struct nv04_surface *)tx->surface;
-	struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->surface->texture;
-	uint8_t *map = nouveau_screen_bo_map(pscreen, mt->base.bo,
-					     nouveau_screen_transfer_flags(ptx->usage));
-
-	if(!tx->direct)
-		return map + ns->base.offset;
+	if(pr->target != PIPE_BUFFER) /* non-buffer resources: defer to the generic helper */
+	{
+		u_default_transfer_inline_write(pipe, pr, sr, usage, box, data, stride, slice_stride);
+	}
 	else
-		return (map + ns->base.offset + 
-			ptx->box.y * ns->pitch + 
-			ptx->box.x * util_format_get_blocksize(ptx->resource->format));
+	{
+		struct nvfx_buffer* buffer = nvfx_buffer(pr);
+		unsigned begin = util_format_get_stride(pr->format, box->x); /* box->x converted to a byte offset */
+		unsigned size = util_format_get_stride(pr->format, box->width); /* box->width converted to a byte count */
+		memcpy(buffer->data + begin, data, size); /* write straight into buffer->data */
+		nvfx_buffer_dirty_interval(buffer, begin, size, /* record the written range as dirty */
+				!!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)); /* a transfer-usage bit: test usage, not the resource's flags */
+	}
 }
 
 void
-nvfx_miptree_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *ptx)
+nvfx_init_transfer_functions(struct pipe_context *pipe)
 {
-	struct pipe_screen *pscreen = pipe->screen;
-	struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx;
-	struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->surface->texture;
-
-	nouveau_screen_bo_unmap(pscreen, mt->base.bo);
+	pipe->get_transfer = nvfx_transfer_new;
+	pipe->transfer_map = nvfx_transfer_map;
+	pipe->transfer_flush_region = nvfx_transfer_flush_region;
+	pipe->transfer_unmap = nvfx_transfer_unmap;
+	pipe->transfer_destroy = nvfx_transfer_destroy;
+	pipe->transfer_inline_write = nvfx_transfer_inline_write;
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_transfer.h b/src/gallium/drivers/nvfx/nvfx_transfer.h
index 3e3317b2c7..20f20d5b0b 100644
--- a/src/gallium/drivers/nvfx/nvfx_transfer.h
+++ b/src/gallium/drivers/nvfx/nvfx_transfer.h
@@ -7,19 +7,17 @@
 
 
 struct pipe_transfer *
-nvfx_miptree_transfer_new(struct pipe_context *pcontext,
+nvfx_transfer_new(struct pipe_context *pcontext,
 			  struct pipe_resource *pt,
 			  struct pipe_subresource sr,
 			  unsigned usage,
 			  const struct pipe_box *box);
-void
-nvfx_miptree_transfer_del(struct pipe_context *pcontext,
-			  struct pipe_transfer *ptx);
+
 void *
-nvfx_miptree_transfer_map(struct pipe_context *pcontext,
+nvfx_transfer_map(struct pipe_context *pcontext,
 			  struct pipe_transfer *ptx);
 void
-nvfx_miptree_transfer_unmap(struct pipe_context *pcontext,
+nvfx_transfer_unmap(struct pipe_context *pcontext,
 			    struct pipe_transfer *ptx);
 
 
diff --git a/src/gallium/drivers/nvfx/nvfx_vbo.c b/src/gallium/drivers/nvfx/nvfx_vbo.c
index 4aa3793842..e6e9a8f2e4 100644
--- a/src/gallium/drivers/nvfx/nvfx_vbo.c
+++ b/src/gallium/drivers/nvfx/nvfx_vbo.c
@@ -2,6 +2,7 @@
 #include "pipe/p_state.h"
 #include "util/u_inlines.h"
 #include "util/u_format.h"
+#include "translate/translate.h"
 
 #include "nvfx_context.h"
 #include "nvfx_state.h"
@@ -10,646 +11,595 @@
 #include "nouveau/nouveau_channel.h"
 #include "nouveau/nouveau_class.h"
 #include "nouveau/nouveau_pushbuf.h"
-#include "nouveau/nouveau_util.h"
 
-static INLINE int
-nvfx_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp)
+static inline unsigned
+util_guess_unique_indices_count(unsigned mode, unsigned indices) /* estimate of distinct vertices referenced by 'indices' indices of primitive 'mode' */
 {
-	switch (pipe) {
-	case PIPE_FORMAT_R32_FLOAT:
-	case PIPE_FORMAT_R32G32_FLOAT:
-	case PIPE_FORMAT_R32G32B32_FLOAT:
-	case PIPE_FORMAT_R32G32B32A32_FLOAT:
-		*fmt = NV34TCL_VTXFMT_TYPE_FLOAT;
-		break;
-	case PIPE_FORMAT_R16_FLOAT:
-	case PIPE_FORMAT_R16G16_FLOAT:
-	case PIPE_FORMAT_R16G16B16_FLOAT:
-	case PIPE_FORMAT_R16G16B16A16_FLOAT:
-		*fmt = NV34TCL_VTXFMT_TYPE_HALF;
-		break;
-	case PIPE_FORMAT_R8_UNORM:
-	case PIPE_FORMAT_R8G8_UNORM:
-	case PIPE_FORMAT_R8G8B8_UNORM:
-	case PIPE_FORMAT_R8G8B8A8_UNORM:
-		*fmt = NV34TCL_VTXFMT_TYPE_UBYTE;
-		break;
-	case PIPE_FORMAT_R16_SSCALED:
-	case PIPE_FORMAT_R16G16_SSCALED:
-	case PIPE_FORMAT_R16G16B16_SSCALED:
-	case PIPE_FORMAT_R16G16B16A16_SSCALED:
-		*fmt = NV34TCL_VTXFMT_TYPE_USHORT;
-		break;
-	default:
-		NOUVEAU_ERR("Unknown format %s\n", util_format_name(pipe));
-		return 1;
+	/* Euler's formula gives V =
+	 * = E - F + 2 =
+	 * = F * (polygon_edges / 2 - 1) + 2 =
+	 * =  F * (polygon_edges - 2) / 2 + 2 =
+	 * =  indices * (polygon_edges - 2) / (2 * indices_per_face) + 2
+	 * =  indices * (1 / 2 - 1 / polygon_edges) + 2
+	 */
+	switch(mode)
+	{
+	case PIPE_PRIM_LINES:
+		return indices >> 1;
+	case PIPE_PRIM_TRIANGLES:
+	{
+		/*
+		 * The formula above gives indices * (1/2 - 1/3) + 2, i.e.
+		 * indices / 6 + 2, for a closed triangle mesh.
+		 *
+		 * A plain division is used on purpose: compilers already
+		 * lower division by a constant to a multiply and shift, so
+		 * a hand-written multiplicative inverse buys nothing, and
+		 * the division is also well-behaved for indices < 6.
+		 *
+		 * Like every case here this is only an estimate of the
+		 * number of distinct vertices, not an exact count.
+		 */
+		return indices / 6 + 2;
 	}
-
-	switch (pipe) {
-	case PIPE_FORMAT_R8_UNORM:
-	case PIPE_FORMAT_R32_FLOAT:
-	case PIPE_FORMAT_R16_FLOAT:
-	case PIPE_FORMAT_R16_SSCALED:
-		*ncomp = 1;
-		break;
-	case PIPE_FORMAT_R8G8_UNORM:
-	case PIPE_FORMAT_R32G32_FLOAT:
-	case PIPE_FORMAT_R16G16_FLOAT:
-	case PIPE_FORMAT_R16G16_SSCALED:
-		*ncomp = 2;
-		break;
-	case PIPE_FORMAT_R8G8B8_UNORM:
-	case PIPE_FORMAT_R32G32B32_FLOAT:
-	case PIPE_FORMAT_R16G16B16_FLOAT:
-	case PIPE_FORMAT_R16G16B16_SSCALED:
-		*ncomp = 3;
-		break;
-	case PIPE_FORMAT_R8G8B8A8_UNORM:
-	case PIPE_FORMAT_R32G32B32A32_FLOAT:
-	case PIPE_FORMAT_R16G16B16A16_FLOAT:
-	case PIPE_FORMAT_R16G16B16A16_SSCALED:
-		*ncomp = 4;
-		break;
+	// guess that indexed quads are created by successive connections, since a closed mesh seems unlikely
+	case PIPE_PRIM_QUADS:
+		return (indices >> 1) + 2;
+	//	return (indices >> 2) + 2; // if it is a closed mesh
 	default:
-		NOUVEAU_ERR("Unknown format %s\n", util_format_name(pipe));
-		return 1;
+		return indices;
 	}
-
-	return 0;
 }
 
-static boolean
-nvfx_vbo_set_idxbuf(struct nvfx_context *nvfx, struct pipe_resource *ib,
-		    unsigned ib_size)
+static unsigned nvfx_decide_upload_mode(struct pipe_context *pipe, const struct pipe_draw_info *info)
 {
-	unsigned type;
-
-	if (!ib) {
-		nvfx->idxbuf_buffer = NULL;
-		nvfx->idxbuf_format = 0xdeadbeef;
-		return FALSE;
+	struct nvfx_context* nvfx = nvfx_context(pipe);
+	unsigned hardware_cost = 0;
+	unsigned inline_cost = 0;
+	unsigned unique_vertices;
+	unsigned upload_mode;
+	float best_index_cost_for_hardware_vertices_as_inline_cost;
+	boolean prefer_hardware_indices;
+	unsigned index_inline_cost;
+	unsigned index_hardware_cost;
+	if (info->indexed)
+		unique_vertices = util_guess_unique_indices_count(info->mode, info->count);
+	else
+		unique_vertices = info->count;
+
+	/* Here we try to figure out if we are better off writing vertex data directly on the FIFO,
+	 * or create hardware buffer objects and pointing the hardware to them.
+	 *
+	 * This is done by computing the total memcpy cost of each option, ignoring uploads
+	 * if we think that the buffer is static and thus the upload cost will be amortized over
+	 * future draw calls.
+	 *
+	 * For instance, if everything looks static, we will always create buffer objects, while if
+	 * everything is a user buffer and we are not doing indexed drawing, we never do.
+	 *
+	 * Other interesting cases are a small user vertex buffer with a huge user index buffer,
+	 * where we upload the vertex buffer so that we can use hardware index lookup, and
+	 * the opposite case, where we instead do index lookup in software to avoid uploading
+	 * a huge amount of vertex data that is not going to be used.
+	 *
+	 * Otherwise, we generally move a buffer to the GPU after it has been pushed
+	 * NVFX_STATIC_BUFFER_MIN_REUSE_TIMES times to the GPU without having
+	 * been updated with a transfer (or just the buffer having been destroyed).
+	 *
+	 * There is no special handling for user buffers, since applications can use
+	 * OpenGL VBOs in a one-shot fashion. OpenGL 3/4 core profile forces this
+	 * by the way.
+	 *
+	 * Note that currently we don't support only putting some data on the FIFO, and
+	 * some on vertex buffers (constant and instanced data is independent from this).
+	 *
+	 * nVidia doesn't seem to do this either, even though it should be at least
+	 * doable with VTX_ATTR and possibly with VERTEX_DATA too if not indexed.
+	 */
+
+	for (unsigned i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; i++)
+	{
+		struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i];
+		struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index];
+		struct nvfx_buffer* buffer = nvfx_buffer(vb->buffer);
+		buffer->bytes_to_draw_until_static -= vbi->per_vertex_size * unique_vertices;
+		if (!nvfx_buffer_seems_static(buffer))
+		{
+			hardware_cost += buffer->dirty_end - buffer->dirty_begin;
+			if (!buffer->base.bo)
+				hardware_cost += nvfx->screen->buffer_allocation_cost;
+		}
+		inline_cost += vbi->per_vertex_size * info->count;
 	}
 
-	if (!nvfx->screen->index_buffer_reloc_flags || ib_size == 1)
-		return FALSE;
+	best_index_cost_for_hardware_vertices_as_inline_cost = 0.0f;
+	prefer_hardware_indices = FALSE;
+	index_inline_cost = 0;
+	index_hardware_cost = 0;
 
-	switch (ib_size) {
-	case 2:
-		type = NV34TCL_IDXBUF_FORMAT_TYPE_U16;
-		break;
-	case 4:
-		type = NV34TCL_IDXBUF_FORMAT_TYPE_U32;
-		break;
-	default:
-		return FALSE;
-	}
+	if (info->indexed)
+	{
+		index_inline_cost = nvfx->idxbuf.index_size * info->count;
+		if (nvfx->screen->index_buffer_reloc_flags
+			&& (nvfx->idxbuf.index_size == 2 || nvfx->idxbuf.index_size == 4)
+			&& !(nvfx->idxbuf.offset & (nvfx->idxbuf.index_size - 1)))
+		{
+			struct nvfx_buffer* buffer = nvfx_buffer(nvfx->idxbuf.buffer);
+			buffer->bytes_to_draw_until_static -= index_inline_cost;
 
-	if (ib != nvfx->idxbuf_buffer ||
-	    type != nvfx->idxbuf_format) {
-		nvfx->dirty |= NVFX_NEW_ARRAYS;
-		nvfx->idxbuf_buffer = ib;
-		nvfx->idxbuf_format = type;
-	}
+			prefer_hardware_indices = TRUE;
 
-	return TRUE;
-}
+			if (!nvfx_buffer_seems_static(buffer))
+			{
+				index_hardware_cost = buffer->dirty_end - buffer->dirty_begin;
+				if (!buffer->base.bo)
+					index_hardware_cost += nvfx->screen->buffer_allocation_cost;
+			}
 
-// type must be floating point
-static inline void
-nvfx_vbo_static_attrib(struct nvfx_context *nvfx,
-		       int attrib, struct pipe_vertex_element *ve,
-		       struct pipe_vertex_buffer *vb, unsigned ncomp)
-{
-	struct pipe_transfer *transfer;
-	struct nouveau_channel* chan = nvfx->screen->base.channel;
-	void *map;
-	float *v;
-
-	map  = pipe_buffer_map(&nvfx->pipe, vb->buffer, PIPE_TRANSFER_READ, &transfer);
-	map = (uint8_t *) map + vb->buffer_offset + ve->src_offset;
-
-	v = map;
-
-	switch (ncomp) {
-	case 4:
-		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_4F_X(attrib), 4));
-		OUT_RING(chan, fui(v[0]));
-		OUT_RING(chan, fui(v[1]));
-		OUT_RING(chan,  fui(v[2]));
-		OUT_RING(chan,  fui(v[3]));
-		break;
-	case 3:
-		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_3F_X(attrib), 3));
-		OUT_RING(chan,  fui(v[0]));
-		OUT_RING(chan,  fui(v[1]));
-		OUT_RING(chan,  fui(v[2]));
-		break;
-	case 2:
-		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_2F_X(attrib), 2));
-		OUT_RING(chan,  fui(v[0]));
-		OUT_RING(chan,  fui(v[1]));
-		break;
-	case 1:
-		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_1F(attrib), 1));
-		OUT_RING(chan,  fui(v[0]));
-		break;
+			if ((float) index_inline_cost < (float) index_hardware_cost * nvfx->screen->inline_cost_per_hardware_cost)
+			{
+				best_index_cost_for_hardware_vertices_as_inline_cost = (float) index_inline_cost;
+			}
+			else
+			{
+				best_index_cost_for_hardware_vertices_as_inline_cost = (float) index_hardware_cost * nvfx->screen->inline_cost_per_hardware_cost;
+				prefer_hardware_indices = TRUE;
+			}
+		}
 	}
 
-	pipe_buffer_unmap(&nvfx->pipe, vb->buffer, transfer);
+	/* let's finally figure out which of the 3 paths we want to take */
+	if ((float) (inline_cost + index_inline_cost) > ((float) hardware_cost * nvfx->screen->inline_cost_per_hardware_cost + best_index_cost_for_hardware_vertices_as_inline_cost))
+		upload_mode = 1 + prefer_hardware_indices;
+	else
+		upload_mode = 0;
+
+#ifdef DEBUG
+        if (unlikely(nvfx->screen->trace_draw))
+          {
+                  fprintf(stderr, "DRAW");
+                  if (info->indexed)
+                  {
+                          fprintf(stderr, "_IDX%u", nvfx->idxbuf.index_size);
+                          if (info->index_bias)
+                                  fprintf(stderr, " biased %u", info->index_bias);
+                          fprintf(stderr, " idxrange %u -> %u", info->min_index, info->max_index);
+                  }
+                  if (info->instance_count > 1)
+                          fprintf(stderr, " %u instances from %u", info->instance_count, info->indexed);
+                  fprintf(stderr, " start %u count %u prim %u", info->start, info->count, info->mode);
+                  if (!upload_mode)
+                          fprintf(stderr, " -> inline vertex data");
+                  else if (upload_mode == 2 || !info->indexed)
+                          fprintf(stderr, " -> buffer range");
+                  else
+                          fprintf(stderr, " -> inline indices");
+                  fprintf(stderr, " [ivtx %u hvtx %u iidx %u hidx %u bidx %f] <", inline_cost, hardware_cost, index_inline_cost, index_hardware_cost, best_index_cost_for_hardware_vertices_as_inline_cost);
+                  for (unsigned i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; ++i)
+                  {
+                          struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i];
+                          struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index];
+                          struct nvfx_buffer* buffer = nvfx_buffer(vb->buffer);
+                          if (i)
+                                  fprintf(stderr, ", ");
+                          fprintf(stderr, "%p%s left %Li", buffer, buffer->last_update_static ? " static" : "", buffer->bytes_to_draw_until_static);
+                  }
+                  fprintf(stderr, ">\n");
+          }
+#endif
+
+	return upload_mode;
 }
 
-static void
-nvfx_draw_arrays(struct pipe_context *pipe,
-		 unsigned mode, unsigned start, unsigned count)
+void nvfx_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 {
 	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nvfx_screen *screen = nvfx->screen;
-	struct nouveau_channel *chan = screen->base.channel;
-	unsigned restart = 0;
-
-	nvfx_vbo_set_idxbuf(nvfx, NULL, 0);
-	if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) {
-		nvfx_draw_elements_swtnl(pipe, NULL, 0, 0,
-                                           mode, start, count);
-                return;
-	}
+	unsigned upload_mode = 0;
 
-	while (count) {
-		unsigned vc, nr, avail;
+	if (!nvfx->vtxelt->needs_translate)
+		upload_mode = nvfx_decide_upload_mode(pipe, info);
 
-		nvfx_state_emit(nvfx);
+	nvfx->use_index_buffer = upload_mode > 1;
 
-		avail = AVAIL_RING(chan);
-		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
+	if ((upload_mode > 0) != nvfx->use_vertex_buffers)
+	{
+		nvfx->use_vertex_buffers = (upload_mode > 0);
+		nvfx->dirty |= NVFX_NEW_ARRAYS;
+		nvfx->draw_dirty |= NVFX_NEW_ARRAYS;
+	}
 
-		vc = nouveau_vbuf_split(avail, 6, 256,
-					mode, start, count, &restart);
-		if (!vc) {
-			FIRE_RING(chan);
-			continue;
+	if (upload_mode > 0)
+	{
+		for (unsigned i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; i++)
+		{
+			struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i];
+			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index];
+			nvfx_buffer_upload(nvfx_buffer(vb->buffer));
 		}
 
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, nvgl_primitive(mode));
+		if (upload_mode > 1)
+		{
+			nvfx_buffer_upload(nvfx_buffer(nvfx->idxbuf.buffer));
 
-		nr = (vc & 0xff);
-		if (nr) {
-			OUT_RING(chan, RING_3D(NV34TCL_VB_VERTEX_BATCH, 1));
-			OUT_RING  (chan, ((nr - 1) << 24) | start);
-			start += nr;
+			if (unlikely(info->index_bias != nvfx->base_vertex))
+			{
+				nvfx->base_vertex = info->index_bias;
+				nvfx->dirty |= NVFX_NEW_ARRAYS;
+			}
 		}
-
-		nr = vc >> 8;
-		while (nr) {
-			unsigned push = nr > 2047 ? 2047 : nr;
-
-			nr -= push;
-
-			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_VERTEX_BATCH, push));
-			while (push--) {
-				OUT_RING(chan, ((0x100 - 1) << 24) | start);
-				start += 0x100;
+		else
+		{
+			if (unlikely(info->start < nvfx->base_vertex && nvfx->base_vertex))
+			{
+				nvfx->base_vertex = 0;
+				nvfx->dirty |= NVFX_NEW_ARRAYS;
 			}
 		}
-
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, 0);
-
-		count -= vc;
-		start = restart;
 	}
 
-	pipe->flush(pipe, 0, NULL);
+	if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx))
+		nvfx_draw_vbo_swtnl(pipe, info);
+	else
+		nvfx_push_vbo(pipe, info);
 }
 
-static INLINE void
-nvfx_draw_elements_u08(struct nvfx_context *nvfx, void *ib,
-		       unsigned mode, unsigned start, unsigned count)
+boolean
+nvfx_vbo_validate(struct nvfx_context *nvfx)
 {
-	struct nvfx_screen *screen = nvfx->screen;
-	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	int i;
+	int elements = MAX2(nvfx->vtxelt->num_elements, nvfx->hw_vtxelt_nr);
+	unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD;
 
-	while (count) {
-		uint8_t *elts = (uint8_t *)ib + start;
-		unsigned vc, push, restart = 0, avail;
+	if (!elements)
+		return TRUE;
 
-		nvfx_state_emit(nvfx);
+	MARK_RING(chan, (5 + 2) * 16 + 2 + 11, 16 + 2);
+	for(unsigned i = 0; i < nvfx->vtxelt->num_constant; ++i)
+	{
+		struct nvfx_low_frequency_element *ve = &nvfx->vtxelt->constant[i];
+		struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
+		struct nvfx_buffer* buffer = nvfx_buffer(vb->buffer);
+		float v[4];
+		ve->fetch_rgba_float(v, buffer->data + vb->buffer_offset + ve->src_offset, 0, 0);
+		nvfx_emit_vtx_attr(chan, ve->idx, v, ve->ncomp);
+	}
 
-		avail = AVAIL_RING(chan);
-		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
 
-		vc = nouveau_vbuf_split(avail, 6, 2,
-					mode, start, count, &restart);
-		if (vc == 0) {
-			FIRE_RING(chan);
-			continue;
-		}
-		count -= vc;
+	OUT_RING(chan, RING_3D(NV34TCL_VTXFMT(0), elements));
+	if(nvfx->use_vertex_buffers)
+	{
+		unsigned idx = 0;
+		for (i = 0; i < nvfx->vtxelt->num_per_vertex; i++) {
+			struct nvfx_per_vertex_element *ve = &nvfx->vtxelt->per_vertex[i];
+			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
 
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, nvgl_primitive(mode));
+			if(idx != ve->idx)
+			{
+				assert(idx < ve->idx);
+				OUT_RINGp(chan, &nvfx->vtxelt->vtxfmt[idx], ve->idx - idx);
+				idx = ve->idx;
+			}
 
-		if (vc & 1) {
-			OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1));
-			OUT_RING  (chan, elts[0]);
-			elts++; vc--;
+			OUT_RING(chan, nvfx->vtxelt->vtxfmt[idx] | (vb->stride << NV34TCL_VTXFMT_STRIDE_SHIFT));
+			++idx;
 		}
+		if(idx != nvfx->vtxelt->num_elements)
+			OUT_RINGp(chan, &nvfx->vtxelt->vtxfmt[idx], nvfx->vtxelt->num_elements - idx);
+	}
+	else
+		OUT_RINGp(chan, nvfx->vtxelt->vtxfmt, nvfx->vtxelt->num_elements);
 
-		while (vc) {
-			unsigned i;
-
-			push = MIN2(vc, 2047 * 2);
-
-			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1));
-			for (i = 0; i < push; i+=2)
-				OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
+	for(i = nvfx->vtxelt->num_elements; i < elements; ++i)
+		OUT_RING(chan, NV34TCL_VTXFMT_TYPE_32_FLOAT);
 
-			vc -= push;
-			elts += push;
+	if(nvfx->is_nv4x) {
+		unsigned i;
+		/* seems to be some kind of cache flushing */
+		for(i = 0; i < 3; ++i) {
+			OUT_RING(chan, RING_3D(0x1718, 1));
+			OUT_RING(chan, 0);
 		}
-
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, 0);
-
-		start = restart;
 	}
-}
-
-static INLINE void
-nvfx_draw_elements_u16(struct nvfx_context *nvfx, void *ib,
-		       unsigned mode, unsigned start, unsigned count)
-{
-	struct nvfx_screen *screen = nvfx->screen;
-	struct nouveau_channel *chan = screen->base.channel;
-
-	while (count) {
-		uint16_t *elts = (uint16_t *)ib + start;
-		unsigned vc, push, restart = 0, avail;
-
-		nvfx_state_emit(nvfx);
-
-		avail = AVAIL_RING(chan);
-		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
 
-		vc = nouveau_vbuf_split(avail, 6, 2,
-					mode, start, count, &restart);
-		if (vc == 0) {
-			FIRE_RING(chan);
-			continue;
-		}
-		count -= vc;
+	OUT_RING(chan, RING_3D(NV34TCL_VTXBUF_ADDRESS(0), elements));
+	if(nvfx->use_vertex_buffers)
+	{
+		unsigned idx = 0;
+		for (i = 0; i < nvfx->vtxelt->num_per_vertex; i++) {
+			struct nvfx_per_vertex_element *ve = &nvfx->vtxelt->per_vertex[i];
+			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
+			struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo;
 
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, nvgl_primitive(mode));
+			for(; idx < ve->idx; ++idx)
+				OUT_RING(chan, 0);
 
-		if (vc & 1) {
-			OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1));
-			OUT_RING  (chan, elts[0]);
-			elts++; vc--;
+			OUT_RELOC(chan, bo,
+					vb->buffer_offset + ve->src_offset + nvfx->base_vertex * vb->stride,
+					vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+					0, NV34TCL_VTXBUF_ADDRESS_DMA1);
+			++idx;
 		}
 
-		while (vc) {
-			unsigned i;
-
-			push = MIN2(vc, 2047 * 2);
-
-			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1));
-			for (i = 0; i < push; i+=2)
-				OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
-
-			vc -= push;
-			elts += push;
-		}
+		for(; idx < elements; ++idx)
+			OUT_RING(chan, 0);
+	}
+	else
+	{
+		for (i = 0; i < elements; i++)
+			OUT_RING(chan, 0);
+	}
 
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, 0);
+	OUT_RING(chan, RING_3D(0x1710, 1));
+	OUT_RING(chan, 0);
 
-		start = restart;
-	}
+	nvfx->hw_vtxelt_nr = nvfx->vtxelt->num_elements;
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_VTXBUF;
+	return TRUE;
 }
 
-static INLINE void
-nvfx_draw_elements_u32(struct nvfx_context *nvfx, void *ib,
-		       unsigned mode, unsigned start, unsigned count)
+void
+nvfx_vbo_relocate(struct nvfx_context *nvfx)
 {
-	struct nvfx_screen *screen = nvfx->screen;
-	struct nouveau_channel *chan = screen->base.channel;
-
-	while (count) {
-		uint32_t *elts = (uint32_t *)ib + start;
-		unsigned vc, push, restart = 0, avail;
-
-		nvfx_state_emit(nvfx);
-
-		avail = AVAIL_RING(chan);
-		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
-
-		vc = nouveau_vbuf_split(avail, 5, 1,
-					mode, start, count, &restart);
-		if (vc == 0) {
-			FIRE_RING(chan);
-			continue;
-		}
-		count -= vc;
-
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, nvgl_primitive(mode));
-
-		while (vc) {
-			push = MIN2(vc, 2047);
-
-			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U32, push));
-			OUT_RINGp    (chan, elts, push);
+	struct nouveau_channel* chan;
+	unsigned vb_flags;
+	int i;
 
-			vc -= push;
-			elts += push;
-		}
+        if(!nvfx->use_vertex_buffers)
+                return;
 
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, 0);
+	chan = nvfx->screen->base.channel;
+	vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY;
 
-		start = restart;
+	MARK_RING(chan, 2 * 16 + 3, 2 * 16 + 3);
+        for (i = 0; i < nvfx->vtxelt->num_per_vertex; i++) {
+                struct nvfx_per_vertex_element *ve = &nvfx->vtxelt->per_vertex[i];
+                struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
+                struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo;
+
+                OUT_RELOC(chan, bo, RING_3D(NV34TCL_VTXBUF_ADDRESS(ve->idx), 1),
+				vb_flags, 0, 0);
+                OUT_RELOC(chan, bo, vb->buffer_offset + ve->src_offset + nvfx->base_vertex * vb->stride,
+				vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+				0, NV34TCL_VTXBUF_ADDRESS_DMA1);
 	}
+        nvfx->relocs_needed &=~ NVFX_RELOCATE_VTXBUF;
 }
 
 static void
-nvfx_draw_elements_inline(struct pipe_context *pipe,
-			  struct pipe_resource *ib,
-			  unsigned ib_size, int ib_bias,
-			  unsigned mode, unsigned start, unsigned count)
+nvfx_idxbuf_emit(struct nvfx_context* nvfx, unsigned ib_flags)
 {
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct pipe_transfer *transfer;
-	void *map;
-
-	map = pipe_buffer_map(pipe, ib, PIPE_TRANSFER_READ, &transfer);
-	if (!ib) {
-		NOUVEAU_ERR("failed mapping ib\n");
-		return;
-	}
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	unsigned ib_format = (nvfx->idxbuf.index_size == 2) ? NV34TCL_IDXBUF_FORMAT_TYPE_U16 : NV34TCL_IDXBUF_FORMAT_TYPE_U32;
+	struct nouveau_bo* bo = nvfx_resource(nvfx->idxbuf.buffer)->bo;
+	ib_flags |= nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD;
 
-	assert(ib_bias == 0);
-
-	switch (ib_size) {
-	case 1:
-		nvfx_draw_elements_u08(nvfx, map, mode, start, count);
-		break;
-	case 2:
-		nvfx_draw_elements_u16(nvfx, map, mode, start, count);
-		break;
-	case 4:
-		nvfx_draw_elements_u32(nvfx, map, mode, start, count);
-		break;
-	default:
-		NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size);
-		break;
-	}
+	assert(nvfx->screen->index_buffer_reloc_flags);
 
-	pipe_buffer_unmap(pipe, ib, transfer);
+	MARK_RING(chan, 3, 3);
+	if(ib_flags & NOUVEAU_BO_DUMMY)
+		OUT_RELOC(chan, bo, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2), ib_flags, 0, 0);
+	else
+		OUT_RING(chan, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2));
+	OUT_RELOC(chan, bo, nvfx->idxbuf.offset + 1, ib_flags | NOUVEAU_BO_LOW, 0, 0);
+	OUT_RELOC(chan, bo, ib_format, ib_flags | NOUVEAU_BO_OR,
+			0, NV34TCL_IDXBUF_FORMAT_DMA1);
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_IDXBUF;
 }
 
-static void
-nvfx_draw_elements_vbo(struct pipe_context *pipe,
-		       unsigned mode, unsigned start, unsigned count)
+void
+nvfx_idxbuf_validate(struct nvfx_context* nvfx)
 {
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nvfx_screen *screen = nvfx->screen;
-	struct nouveau_channel *chan = screen->base.channel;
-	unsigned restart = 0;
+	nvfx_idxbuf_emit(nvfx, 0);
+}
 
-	while (count) {
-		unsigned nr, vc, avail;
+void
+nvfx_idxbuf_relocate(struct nvfx_context* nvfx)
+{
+	nvfx_idxbuf_emit(nvfx, NOUVEAU_BO_DUMMY);
+}
 
-		nvfx_state_emit(nvfx);
+unsigned nvfx_vertex_formats[PIPE_FORMAT_COUNT] =
+{
+	[PIPE_FORMAT_R32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT,
+	[PIPE_FORMAT_R32G32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT,
+	[PIPE_FORMAT_R32G32B32A32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT,
+	[PIPE_FORMAT_R32G32B32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT,
+	[PIPE_FORMAT_R16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT,
+	[PIPE_FORMAT_R16G16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT,
+	[PIPE_FORMAT_R16G16B16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT,
+	[PIPE_FORMAT_R16G16B16A16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT,
+	[PIPE_FORMAT_R8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM,
+	[PIPE_FORMAT_R8G8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM,
+	[PIPE_FORMAT_R8G8B8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM,
+	[PIPE_FORMAT_R8G8B8A8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM,
+	[PIPE_FORMAT_R8G8B8A8_USCALED] = NV34TCL_VTXFMT_TYPE_8_USCALED,
+	[PIPE_FORMAT_R16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM,
+	[PIPE_FORMAT_R16G16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM,
+	[PIPE_FORMAT_R16G16B16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM,
+	[PIPE_FORMAT_R16G16B16A16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM,
+	[PIPE_FORMAT_R16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED,
+	[PIPE_FORMAT_R16G16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED,
+	[PIPE_FORMAT_R16G16B16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED,
+	[PIPE_FORMAT_R16G16B16A16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED,
+};
+
+static void *
+nvfx_vtxelts_state_create(struct pipe_context *pipe,
+			  unsigned num_elements,
+			  const struct pipe_vertex_element *elements)
+{
+	struct nvfx_vtxelt_state *cso = CALLOC_STRUCT(nvfx_vtxelt_state);
+	struct translate_key transkey;
+	unsigned per_vertex_size[16];
+	unsigned vb_compacted_index[16];
 
-		avail = AVAIL_RING(chan);
-		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
+	if(num_elements > 16)
+	{
+		_debug_printf("Error: application attempted to use %u vertex elements, but only 16 are supported: ignoring the rest\n", num_elements);
+		num_elements = 16;
+	}
 
-		vc = nouveau_vbuf_split(avail, 6, 256,
-					mode, start, count, &restart);
-		if (!vc) {
-			FIRE_RING(chan);
-			continue;
-		}
+	memset(per_vertex_size, 0, sizeof(per_vertex_size));
+	memcpy(cso->pipe, elements, num_elements * sizeof(elements[0]));
+	cso->num_elements = num_elements;
+	cso->needs_translate = FALSE;
+
+	transkey.nr_elements = 0;
+	transkey.output_stride = 0;
+
+	for(unsigned i = 0; i < num_elements; ++i)
+        {
+		const struct pipe_vertex_element* ve = &elements[i];
+		if(!ve->instance_divisor)
+                        per_vertex_size[ve->vertex_buffer_index] += util_format_get_stride(ve->src_format, 1);
+        }
+
+        for(unsigned i = 0; i < 16; ++i)
+        {
+                if(per_vertex_size[i])
+                {
+                        unsigned idx = cso->num_per_vertex_buffer_infos++;
+                        cso->per_vertex_buffer_info[idx].vertex_buffer_index = i;
+                        cso->per_vertex_buffer_info[idx].per_vertex_size = per_vertex_size[i];
+                        vb_compacted_index[i] = idx;
+                }
+        }
+
+	for(unsigned i = 0; i < num_elements; ++i)
+	{
+		const struct pipe_vertex_element* ve = &elements[i];
+		unsigned type = nvfx_vertex_formats[ve->src_format];
+		unsigned ncomp = util_format_get_nr_components(ve->src_format);
 
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, nvgl_primitive(mode));
+		/* presumably the intended test was ve->frequency != PIPE_ELEMENT_FREQUENCY_PER_VERTEX; a non-zero instance_divisor is checked instead */
+		if(ve->instance_divisor)
+		{
+			struct nvfx_low_frequency_element* lfve;
+			cso->vtxfmt[i] = NV34TCL_VTXFMT_TYPE_32_FLOAT;
+
+			/* constant-frequency path is compiled out by the if(0) below; presumably meant to test ve->frequency == PIPE_ELEMENT_FREQUENCY_CONSTANT */
+			if(0)
+				lfve = &cso->constant[cso->num_constant++];
+			else
+			{
+				lfve = &cso->per_instance[cso->num_per_instance++].base;
+				((struct nvfx_per_instance_element*)lfve)->instance_divisor = ve->instance_divisor;
+			}
 
-		nr = (vc & 0xff);
-		if (nr) {
-			OUT_RING(chan, RING_3D(NV34TCL_VB_INDEX_BATCH, 1));
-			OUT_RING  (chan, ((nr - 1) << 24) | start);
-			start += nr;
+                        lfve->idx = i;
+                        lfve->vertex_buffer_index = ve->vertex_buffer_index;
+                        lfve->src_offset = ve->src_offset;
+                        lfve->fetch_rgba_float = util_format_description(ve->src_format)->fetch_rgba_float;
+                        lfve->ncomp = ncomp;
 		}
-
-		nr = vc >> 8;
-		while (nr) {
-			unsigned push = nr > 2047 ? 2047 : nr;
-
-			nr -= push;
-
-			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_INDEX_BATCH, push));
-			while (push--) {
-				OUT_RING(chan, ((0x100 - 1) << 24) | start);
-				start += 0x100;
+		else
+		{
+			unsigned idx;
+
+			idx = cso->num_per_vertex++;
+			cso->per_vertex[idx].idx = i;
+			cso->per_vertex[idx].vertex_buffer_index = ve->vertex_buffer_index;
+			cso->per_vertex[idx].src_offset = ve->src_offset;
+
+			idx = transkey.nr_elements++;
+			transkey.element[idx].input_format = ve->src_format;
+			transkey.element[idx].input_buffer = vb_compacted_index[ve->vertex_buffer_index];
+			transkey.element[idx].input_offset = ve->src_offset;
+			transkey.element[idx].instance_divisor = 0;
+			transkey.element[idx].type = TRANSLATE_ELEMENT_NORMAL;
+			if(type)
+			{
+				transkey.element[idx].output_format = ve->src_format;
+				cso->vtxfmt[i] = (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type;
+			}
+			else
+			{
+				unsigned float32[4] = {PIPE_FORMAT_R32_FLOAT, PIPE_FORMAT_R32G32_FLOAT, PIPE_FORMAT_R32G32B32_FLOAT, PIPE_FORMAT_R32G32B32A32_FLOAT};
+				transkey.element[idx].output_format = float32[ncomp - 1];
+				cso->needs_translate = TRUE;
+				cso->vtxfmt[i] = (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | NV34TCL_VTXFMT_TYPE_32_FLOAT;
 			}
+			transkey.element[idx].output_offset = transkey.output_stride;
+			transkey.output_stride += (util_format_get_stride(transkey.element[idx].output_format, 1) + 3) & ~3;
 		}
+	}
 
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, 0);
+	cso->translate = translate_create(&transkey); /* translator built from the per-vertex elements collected above */
+	cso->vertex_length = transkey.output_stride >> 2; /* output_stride is in bytes and kept a multiple of 4; store it in 32-bit words */
+	cso->max_vertices_per_packet = cso->vertex_length ? 2047 / cso->vertex_length : 2047; /* vertex_length is 0 when no per-vertex element exists: do not divide by zero */
 
-		count -= vc;
-		start = restart;
-	}
+	return (void *)cso;
 }
 
 static void
-nvfx_draw_elements(struct pipe_context *pipe,
-		   struct pipe_resource *indexBuffer,
-		   unsigned indexSize, int indexBias,
-		   unsigned mode, unsigned start, unsigned count)
+nvfx_vtxelts_state_delete(struct pipe_context *pipe, void *hwcso)
 {
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	boolean idxbuf;
-
-	idxbuf = nvfx_vbo_set_idxbuf(nvfx, indexBuffer, indexSize);
-	if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) {
-		nvfx_draw_elements_swtnl(pipe,
-		                         indexBuffer, indexSize, indexBias,
-		                         mode, start, count);
-		return;
-	}
-
-	if (idxbuf) {
-		nvfx_draw_elements_vbo(pipe, mode, start, count);
-	} else {
-		nvfx_draw_elements_inline(pipe,
-		                          indexBuffer, indexSize, indexBias,
-					  mode, start, count);
-	}
-
-	pipe->flush(pipe, 0, NULL);
+	FREE(hwcso);
 }
 
-void
-nvfx_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
+static void
+nvfx_vtxelts_state_bind(struct pipe_context *pipe, void *hwcso)
 {
 	struct nvfx_context *nvfx = nvfx_context(pipe);
 
-	if (info->indexed && nvfx->idxbuf.buffer) {
-		unsigned offset;
-
-		assert(nvfx->idxbuf.offset % nvfx->idxbuf.index_size == 0);
-		offset = nvfx->idxbuf.offset / nvfx->idxbuf.index_size;
-
-		nvfx_draw_elements(pipe,
-				   nvfx->idxbuf.buffer,
-				   nvfx->idxbuf.index_size,
-				   info->index_bias,
-				   info->mode,
-				   info->start + offset,
-				   info->count);
-	}
-	else {
-		nvfx_draw_arrays(pipe,
-				info->mode,
-				info->start,
-				info->count);
-	}
+	nvfx->vtxelt = hwcso;
+	nvfx->use_vertex_buffers = -1;
+	nvfx->draw_dirty |= NVFX_NEW_ARRAYS;
 }
 
-boolean
-nvfx_vbo_validate(struct nvfx_context *nvfx)
+static void
+nvfx_set_vertex_buffers(struct pipe_context *pipe, unsigned count,
+			const struct pipe_vertex_buffer *vb)
 {
-	struct nouveau_channel* chan = nvfx->screen->base.channel;
-	struct pipe_resource *ib = nvfx->idxbuf_buffer;
-	unsigned ib_format = nvfx->idxbuf_format;
-	int i;
-	int elements = MAX2(nvfx->vtxelt->num_elements, nvfx->hw_vtxelt_nr);
-	uint32_t vtxfmt[16];
-	unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD;
-
-	if (!elements)
-		return TRUE;
-
-	nvfx->vbo_bo = 0;
-
-	MARK_RING(chan, (5 + 2) * 16 + 2 + 11, 16 + 2);
-	for (i = 0; i < nvfx->vtxelt->num_elements; i++) {
-		struct pipe_vertex_element *ve;
-		struct pipe_vertex_buffer *vb;
-		unsigned type, ncomp;
-
-		ve = &nvfx->vtxelt->pipe[i];
-		vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
-
-		if (nvfx_vbo_format_to_hw(ve->src_format, &type, &ncomp)) {
-			MARK_UNDO(chan);
-			nvfx->fallback_swtnl |= NVFX_NEW_ARRAYS;
-			return FALSE;
-		}
+	struct nvfx_context *nvfx = nvfx_context(pipe);
 
-		if (!vb->stride && type == NV34TCL_VTXFMT_TYPE_FLOAT) {
-			nvfx_vbo_static_attrib(nvfx, i, ve, vb, ncomp);
-			vtxfmt[i] = type;
-		} else {
-			vtxfmt[i] = ((vb->stride << NV34TCL_VTXFMT_STRIDE_SHIFT) |
-				(ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type);
-			nvfx->vbo_bo |= (1 << i);
-		}
+	for(unsigned i = 0; i < count; ++i)
+	{
+		pipe_resource_reference(&nvfx->vtxbuf[i].buffer, vb[i].buffer);
+		nvfx->vtxbuf[i].buffer_offset = vb[i].buffer_offset;
+		nvfx->vtxbuf[i].max_index = vb[i].max_index;
+		nvfx->vtxbuf[i].stride = vb[i].stride;
 	}
 
-	for(; i < elements; ++i)
-		vtxfmt[i] = NV34TCL_VTXFMT_TYPE_FLOAT;
+	for(unsigned i = count; i < nvfx->vtxbuf_nr; ++i)
+		pipe_resource_reference(&nvfx->vtxbuf[i].buffer, 0);
 
-	OUT_RING(chan, RING_3D(NV34TCL_VTXFMT(0), elements));
-	OUT_RINGp(chan, vtxfmt, elements);
-
-	if(nvfx->is_nv4x) {
-		unsigned i;
-		/* seems to be some kind of cache flushing */
-		for(i = 0; i < 3; ++i) {
-			OUT_RING(chan, RING_3D(0x1718, 1));
-			OUT_RING(chan, 0);
-		}
-	}
-
-	OUT_RING(chan, RING_3D(NV34TCL_VTXBUF_ADDRESS(0), elements));
-	for (i = 0; i < nvfx->vtxelt->num_elements; i++) {
-		struct pipe_vertex_element *ve;
-		struct pipe_vertex_buffer *vb;
+	nvfx->vtxbuf_nr = count;
+	nvfx->use_vertex_buffers = -1;
+	nvfx->draw_dirty |= NVFX_NEW_ARRAYS;
+}
 
-		ve = &nvfx->vtxelt->pipe[i];
-		vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
+static void
+nvfx_set_index_buffer(struct pipe_context *pipe,
+		      const struct pipe_index_buffer *ib)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
 
-		if (!(nvfx->vbo_bo & (1 << i)))
-			OUT_RING(chan, 0);
-		else
-		{
-			struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo;
-			OUT_RELOC(chan, bo,
-				 vb->buffer_offset + ve->src_offset,
-				 vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
-				 0, NV34TCL_VTXBUF_ADDRESS_DMA1);
-		}
+	if(ib)
+	{
+		pipe_resource_reference(&nvfx->idxbuf.buffer, ib->buffer);
+		nvfx->idxbuf.index_size = ib->index_size;
+		nvfx->idxbuf.offset = ib->offset;
 	}
-
-        for (; i < elements; i++)
-		OUT_RING(chan, 0);
-
-	OUT_RING(chan, RING_3D(0x1710, 1));
-	OUT_RING(chan, 0);
-
-	if (ib) {
-		unsigned ib_flags = nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD;
-		struct nouveau_bo* bo = nvfx_resource(ib)->bo;
-
-		assert(nvfx->screen->index_buffer_reloc_flags);
-
-		OUT_RING(chan, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2));
-		OUT_RELOC(chan, bo, 0, ib_flags | NOUVEAU_BO_LOW, 0, 0);
-		OUT_RELOC(chan, bo, ib_format, ib_flags | NOUVEAU_BO_OR,
-				  0, NV34TCL_IDXBUF_FORMAT_DMA1);
+	else
+	{
+		pipe_resource_reference(&nvfx->idxbuf.buffer, 0);
+		nvfx->idxbuf.index_size = 0;
+		nvfx->idxbuf.offset = 0;
 	}
 
-	nvfx->hw_vtxelt_nr = nvfx->vtxelt->num_elements;
-	return TRUE;
+	nvfx->dirty |= NVFX_NEW_INDEX;
+	nvfx->draw_dirty |= NVFX_NEW_INDEX;
 }
 
 void
-nvfx_vbo_relocate(struct nvfx_context *nvfx)
+nvfx_init_vbo_functions(struct nvfx_context *nvfx)
 {
-	struct nouveau_channel* chan = nvfx->screen->base.channel;
-	unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY;
-	int i;
+	nvfx->pipe.set_vertex_buffers = nvfx_set_vertex_buffers;
+	nvfx->pipe.set_index_buffer = nvfx_set_index_buffer;
 
-	MARK_RING(chan, 2 * 16 + 3, 2 * 16 + 3);
-	for(i = 0; i < nvfx->vtxelt->num_elements; ++i) {
-		if(nvfx->vbo_bo & (1 << i)) {
-			struct pipe_vertex_element *ve = &nvfx->vtxelt->pipe[i];
-			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
-			struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo;
-			OUT_RELOC(chan, bo, RING_3D(NV34TCL_VTXBUF_ADDRESS(i), 1),
-					vb_flags, 0, 0);
-			OUT_RELOC(chan, bo, vb->buffer_offset + ve->src_offset,
-					vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
-					0, NV34TCL_VTXBUF_ADDRESS_DMA1);
-		}
-	}
-
-	if(nvfx->idxbuf_buffer)
-	{
-		unsigned ib_flags = nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY;
-		struct nouveau_bo* bo = nvfx_resource(nvfx->idxbuf_buffer)->bo;
-
-		assert(nvfx->screen->index_buffer_reloc_flags);
-
-		OUT_RELOC(chan, bo, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2),
-				ib_flags, 0, 0);
-		OUT_RELOC(chan, bo, 0,
-				ib_flags | NOUVEAU_BO_LOW, 0, 0);
-		OUT_RELOC(chan, bo, nvfx->idxbuf_format,
-				ib_flags | NOUVEAU_BO_OR,
-				0, NV34TCL_IDXBUF_FORMAT_DMA1);
-	}
+	nvfx->pipe.create_vertex_elements_state = nvfx_vtxelts_state_create;
+	nvfx->pipe.delete_vertex_elements_state = nvfx_vtxelts_state_delete;
+	nvfx->pipe.bind_vertex_elements_state = nvfx_vtxelts_state_bind;
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c
index 24d9846310..ea7e88c561 100644
--- a/src/gallium/drivers/nvfx/nvfx_vertprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c
@@ -1,15 +1,19 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
-#include "util/u_inlines.h"
+#include "util/u_linkage.h"
+#include "util/u_debug.h"
 
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_util.h"
 
+#include "draw/draw_context.h"
+
 #include "nvfx_context.h"
 #include "nvfx_state.h"
+#include "nvfx_resource.h"
 
 /* TODO (at least...):
  *  1. Indexed consts  + ARL
@@ -25,26 +29,34 @@
 #include "nv30_vertprog.h"
 #include "nv40_vertprog.h"
 
-#define NVFX_VP_INST_DEST_CLIP(n) ((~0 - 6) + (n))
+struct nvfx_loop_entry
+{
+	unsigned brk_target;
+	unsigned cont_target;
+};
 
 struct nvfx_vpc {
+	struct nvfx_context* nvfx;
 	struct nvfx_vertex_program *vp;
 
 	struct nvfx_vertex_program_exec *vpi;
 
 	unsigned r_temps;
 	unsigned r_temps_discard;
-	struct nvfx_sreg r_result[PIPE_MAX_SHADER_OUTPUTS];
-	struct nvfx_sreg *r_address;
-	struct nvfx_sreg *r_temp;
+	struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS];
+	struct nvfx_reg *r_address;
+	struct nvfx_reg *r_temp;
 
-	struct nvfx_sreg *imm;
+	struct nvfx_reg *imm;
 	unsigned nr_imm;
 
 	unsigned hpos_idx;
+
+	struct util_dynarray label_relocs;
+	struct util_dynarray loop_stack;
 };
 
-static struct nvfx_sreg
+static struct nvfx_reg
 temp(struct nvfx_vpc *vpc)
 {
 	int idx = ffs(~vpc->r_temps) - 1;
@@ -52,22 +64,22 @@ temp(struct nvfx_vpc *vpc)
 	if (idx < 0) {
 		NOUVEAU_ERR("out of temps!!\n");
 		assert(0);
-		return nvfx_sr(NVFXSR_TEMP, 0);
+		return nvfx_reg(NVFXSR_TEMP, 0);
 	}
 
 	vpc->r_temps |= (1 << idx);
 	vpc->r_temps_discard |= (1 << idx);
-	return nvfx_sr(NVFXSR_TEMP, idx);
+	return nvfx_reg(NVFXSR_TEMP, idx);
 }
 
-static INLINE void
+static inline void
 release_temps(struct nvfx_vpc *vpc)
 {
 	vpc->r_temps &= ~vpc->r_temps_discard;
 	vpc->r_temps_discard = 0;
 }
 
-static struct nvfx_sreg
+static struct nvfx_reg
 constant(struct nvfx_vpc *vpc, int pipe, float x, float y, float z, float w)
 {
 	struct nvfx_vertex_program *vp = vpc->vp;
@@ -77,7 +89,7 @@ constant(struct nvfx_vpc *vpc, int pipe, float x, float y, float z, float w)
 	if (pipe >= 0) {
 		for (idx = 0; idx < vp->nr_consts; idx++) {
 			if (vp->consts[idx].index == pipe)
-				return nvfx_sr(NVFXSR_CONST, idx);
+				return nvfx_reg(NVFXSR_CONST, idx);
 		}
 	}
 
@@ -90,35 +102,36 @@ constant(struct nvfx_vpc *vpc, int pipe, float x, float y, float z, float w)
 	vpd->value[1] = y;
 	vpd->value[2] = z;
 	vpd->value[3] = w;
-	return nvfx_sr(NVFXSR_CONST, idx);
+	return nvfx_reg(NVFXSR_CONST, idx);
 }
 
-#define arith(cc,s,o,d,m,s0,s1,s2) \
-	nvfx_vp_arith(nvfx, (cc), NVFX_VP_INST_SLOT_##s, NVFX_VP_INST_##s##_OP_##o, (d), (m), (s0), (s1), (s2))
+#define arith(s,o,d,m,s0,s1,s2) \
+	nvfx_insn(0, (NVFX_VP_INST_SLOT_##s << 7) | NVFX_VP_INST_##s##_OP_##o, -1, (d), (m), (s0), (s1), (s2))
 
 static void
-emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos, struct nvfx_sreg src)
+emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos, struct nvfx_src src)
 {
 	struct nvfx_vertex_program *vp = vpc->vp;
 	uint32_t sr = 0;
+	struct nvfx_relocation reloc;
 
-	switch (src.type) {
+	switch (src.reg.type) {
 	case NVFXSR_TEMP:
 		sr |= (NVFX_VP(SRC_REG_TYPE_TEMP) << NVFX_VP(SRC_REG_TYPE_SHIFT));
-		sr |= (src.index << NVFX_VP(SRC_TEMP_SRC_SHIFT));
+		sr |= (src.reg.index << NVFX_VP(SRC_TEMP_SRC_SHIFT));
 		break;
 	case NVFXSR_INPUT:
 		sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
 		       NVFX_VP(SRC_REG_TYPE_SHIFT));
-		vp->ir |= (1 << src.index);
-		hw[1] |= (src.index << NVFX_VP(INST_INPUT_SRC_SHIFT));
+		vp->ir |= (1 << src.reg.index);
+		hw[1] |= (src.reg.index << NVFX_VP(INST_INPUT_SRC_SHIFT));
 		break;
 	case NVFXSR_CONST:
 		sr |= (NVFX_VP(SRC_REG_TYPE_CONST) <<
 		       NVFX_VP(SRC_REG_TYPE_SHIFT));
-		assert(vpc->vpi->const_index == -1 ||
-		       vpc->vpi->const_index == src.index);
-		vpc->vpi->const_index = src.index;
+		reloc.location = vp->nr_insns - 1;
+		reloc.target = src.reg.index;
+		util_dynarray_append(&vp->const_relocs, struct nvfx_relocation, reloc);
 		break;
 	case NVFXSR_NONE:
 		sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
@@ -161,100 +174,67 @@ emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos,
 }
 
 static void
-emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot, struct nvfx_sreg dst)
+emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot, struct nvfx_reg dst)
 {
 	struct nvfx_vertex_program *vp = vpc->vp;
 
 	switch (dst.type) {
+	case NVFXSR_NONE:
+		if(!nvfx->is_nv4x)
+			hw[0] |= NV30_VP_INST_DEST_TEMP_ID_MASK;
+		else {
+			hw[3] |= NV40_VP_INST_DEST_MASK;
+			if (slot == 0)
+				hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
+			else
+				hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+		}
+		break;
 	case NVFXSR_TEMP:
 		if(!nvfx->is_nv4x)
 			hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT);
 		else {
 			hw[3] |= NV40_VP_INST_DEST_MASK;
-			if (slot == 0) {
-				hw[0] |= (dst.index <<
-					  NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
-			} else {
-				hw[3] |= (dst.index <<
-					  NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
-			}
+			if (slot == 0)
+				hw[0] |= (dst.index << NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
+			else
+				hw[3] |= (dst.index << NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
 		}
 		break;
 	case NVFXSR_OUTPUT:
 		/* TODO: this may be wrong because on nv30 COL0 and BFC0 are swapped */
-		switch (dst.index) {
-		case NVFX_VP_INST_DEST_CLIP(0):
-			vp->or |= (1 << 6);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0;
-			dst.index = NVFX_VP(INST_DEST_FOGC);
-			break;
-		case NVFX_VP_INST_DEST_CLIP(1):
-			vp->or |= (1 << 7);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1;
-			dst.index = NVFX_VP(INST_DEST_FOGC);
-			break;
-		case NVFX_VP_INST_DEST_CLIP(2):
-			vp->or |= (1 << 8);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2;
-			dst.index = NVFX_VP(INST_DEST_FOGC);
-			break;
-		case NVFX_VP_INST_DEST_CLIP(3):
-			vp->or |= (1 << 9);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3;
-			dst.index = NVFX_VP(INST_DEST_PSZ);
-			break;
-		case NVFX_VP_INST_DEST_CLIP(4):
-			vp->or |= (1 << 10);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4;
-			dst.index = NVFX_VP(INST_DEST_PSZ);
-			break;
-		case NVFX_VP_INST_DEST_CLIP(5):
-			vp->or |= (1 << 11);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE5;
-			dst.index = NVFX_VP(INST_DEST_PSZ);
-			break;
-		default:
-			if(!nvfx->is_nv4x) {
-				switch (dst.index) {
-				case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
-				case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
-				case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
-				case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
-				case NV30_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
-				case NV30_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
-				case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
-				case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
-				case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
-				case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
-				case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
-				case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
-				case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
-				case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
-				}
-			} else {
-				switch (dst.index) {
-				case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
-				case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
-				case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
-				case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
-				case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
-				case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
-				case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
-				case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
-				case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
-				case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
-				case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
-				case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
-				case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
-				case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
-				}
+		if(nvfx->is_nv4x) {
+			switch (dst.index) {
+			case NV30_VP_INST_DEST_CLP(0):
+				dst.index = NVFX_VP(INST_DEST_FOGC);
+				break;
+			case NV30_VP_INST_DEST_CLP(1):
+				dst.index = NVFX_VP(INST_DEST_FOGC);
+				break;
+			case NV30_VP_INST_DEST_CLP(2):
+				dst.index = NVFX_VP(INST_DEST_FOGC);
+				break;
+			case NV30_VP_INST_DEST_CLP(3):
+				dst.index = NVFX_VP(INST_DEST_PSZ);
+				break;
+			case NV30_VP_INST_DEST_CLP(4):
+				dst.index = NVFX_VP(INST_DEST_PSZ);
+				break;
+			case NV30_VP_INST_DEST_CLP(5):
+				dst.index = NVFX_VP(INST_DEST_PSZ);
+				break;
+			case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
+			case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
+			case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
+			case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
+			case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
+			case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
 			}
-			break;
 		}
 
 		if(!nvfx->is_nv4x) {
 			hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT);
-			hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
+			hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK;
 
 			/*XXX: no way this is entirely correct, someone needs to
 			 *     figure out what exactly it is.
@@ -264,7 +244,7 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot
 			hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT);
 			if (slot == 0) {
 				hw[0] |= NV40_VP_INST_VEC_RESULT;
-				hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
+				hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
 			} else {
 				hw[3] |= NV40_VP_INST_SCA_RESULT;
 				hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
@@ -277,26 +257,27 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot
 }
 
 static void
-nvfx_vp_arith(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, int slot, int op,
-	      struct nvfx_sreg dst, int mask,
-	      struct nvfx_sreg s0, struct nvfx_sreg s1,
-	      struct nvfx_sreg s2)
+nvfx_vp_emit(struct nvfx_vpc *vpc, struct nvfx_insn insn)
 {
+	struct nvfx_context* nvfx = vpc->nvfx;
 	struct nvfx_vertex_program *vp = vpc->vp;
+	unsigned slot = insn.op >> 7;
+	unsigned op = insn.op & 0x7f;
 	uint32_t *hw;
 
 	vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi));
 	vpc->vpi = &vp->insns[vp->nr_insns - 1];
 	memset(vpc->vpi, 0, sizeof(*vpc->vpi));
-	vpc->vpi->const_index = -1;
 
 	hw = vpc->vpi->data;
 
-	hw[0] |= (NVFX_COND_TR << NVFX_VP(INST_COND_SHIFT));
-	hw[0] |= ((0 << NVFX_VP(INST_COND_SWZ_X_SHIFT)) |
-		  (1 << NVFX_VP(INST_COND_SWZ_Y_SHIFT)) |
-		  (2 << NVFX_VP(INST_COND_SWZ_Z_SHIFT)) |
-		  (3 << NVFX_VP(INST_COND_SWZ_W_SHIFT)));
+	hw[0] |= (insn.cc_test << NVFX_VP(INST_COND_SHIFT));
+	hw[0] |= ((insn.cc_swz[0] << NVFX_VP(INST_COND_SWZ_X_SHIFT)) |
+		  (insn.cc_swz[1] << NVFX_VP(INST_COND_SWZ_Y_SHIFT)) |
+		  (insn.cc_swz[2] << NVFX_VP(INST_COND_SWZ_Z_SHIFT)) |
+		  (insn.cc_swz[3] << NVFX_VP(INST_COND_SWZ_W_SHIFT)));
+	if(insn.cc_update)
+		hw[0] |= NVFX_VP(INST_COND_UPDATE_ENABLE);
 
 	if(!nvfx->is_nv4x) {
 		if(slot == 0)
@@ -309,54 +290,56 @@ nvfx_vp_arith(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, int slot, int op,
 //		hw[3] |= NVFX_VP(INST_SCA_DEST_TEMP_MASK);
 //		hw[3] |= (mask << NVFX_VP(INST_VEC_WRITEMASK_SHIFT));
 
-		if (dst.type == NVFXSR_OUTPUT) {
+		if (insn.dst.type == NVFXSR_OUTPUT) {
 			if (slot)
-				hw[3] |= (mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT);
+				hw[3] |= (insn.mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT);
 			else
-				hw[3] |= (mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT);
+				hw[3] |= (insn.mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT);
 		} else {
 			if (slot)
-				hw[3] |= (mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT);
+				hw[3] |= (insn.mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT);
 			else
-				hw[3] |= (mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT);
+				hw[3] |= (insn.mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT);
 		}
 	 } else {
 		if (slot == 0) {
 			hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT);
 			hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
-			hw[3] |= (mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
+			hw[3] |= (insn.mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
 	    } else {
 			hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT);
-			hw[0] |= (NV40_VP_INST_VEC_DEST_TEMP_MASK | (1 << 20));
-			hw[3] |= (mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
+			hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK ;
+			hw[3] |= (insn.mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
 		}
 	}
 
-	emit_dst(nvfx, vpc, hw, slot, dst);
-	emit_src(nvfx, vpc, hw, 0, s0);
-	emit_src(nvfx, vpc, hw, 1, s1);
-	emit_src(nvfx, vpc, hw, 2, s2);
+	emit_dst(nvfx, vpc, hw, slot, insn.dst);
+	emit_src(nvfx, vpc, hw, 0, insn.src[0]);
+	emit_src(nvfx, vpc, hw, 1, insn.src[1]);
+	emit_src(nvfx, vpc, hw, 2, insn.src[2]);
 }
 
-static INLINE struct nvfx_sreg
+static inline struct nvfx_src
 tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
-	struct nvfx_sreg src = { 0 };
+	struct nvfx_src src;
 
 	switch (fsrc->Register.File) {
 	case TGSI_FILE_INPUT:
-		src = nvfx_sr(NVFXSR_INPUT, fsrc->Register.Index);
+		src.reg = nvfx_reg(NVFXSR_INPUT, fsrc->Register.Index);
 		break;
 	case TGSI_FILE_CONSTANT:
-		src = constant(vpc, fsrc->Register.Index, 0, 0, 0, 0);
+		src.reg = constant(vpc, fsrc->Register.Index, 0, 0, 0, 0);
 		break;
 	case TGSI_FILE_IMMEDIATE:
-		src = vpc->imm[fsrc->Register.Index];
+		src.reg = vpc->imm[fsrc->Register.Index];
 		break;
 	case TGSI_FILE_TEMPORARY:
-		src = vpc->r_temp[fsrc->Register.Index];
+		src.reg = vpc->r_temp[fsrc->Register.Index];
 		break;
 	default:
 		NOUVEAU_ERR("bad src file\n");
+		src.reg.index = 0;
+		src.reg.type = 0;
 		break;
 	}
 
@@ -369,11 +352,14 @@ tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
 	return src;
 }
 
-static INLINE struct nvfx_sreg
+static INLINE struct nvfx_reg
 tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
-	struct nvfx_sreg dst = { 0 };
+	struct nvfx_reg dst;
 
 	switch (fdst->Register.File) {
+	case TGSI_FILE_NULL:
+		dst = nvfx_reg(NVFXSR_NONE, 0);
+		break;
 	case TGSI_FILE_OUTPUT:
 		dst = vpc->r_result[fdst->Register.Index];
 		break;
@@ -384,14 +370,16 @@ tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
 		dst = vpc->r_address[fdst->Register.Index];
 		break;
 	default:
-		NOUVEAU_ERR("bad dst file\n");
+		NOUVEAU_ERR("bad dst file %i\n", fdst->Register.File);
+		dst.index = 0;
+		dst.type = 0;
 		break;
 	}
 
 	return dst;
 }
 
-static INLINE int
+static inline int
 tgsi_mask(uint tgsi)
 {
 	int mask = 0;
@@ -405,10 +393,14 @@ tgsi_mask(uint tgsi)
 
 static boolean
 nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
-				const struct tgsi_full_instruction *finst)
+				unsigned idx, const struct tgsi_full_instruction *finst)
 {
-	struct nvfx_sreg src[3], dst, tmp;
-	struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0);
+	struct nvfx_src src[3], tmp;
+	struct nvfx_reg dst;
+	struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
+	struct nvfx_insn insn;
+	struct nvfx_relocation reloc;
+	struct nvfx_loop_entry loop;
 	int mask;
 	int ai = -1, ci = -1, ii = -1;
 	int i;
@@ -436,9 +428,8 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 				ai = fsrc->Register.Index;
 				src[i] = tgsi_src(vpc, fsrc);
 			} else {
-				src[i] = temp(vpc);
-				arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL,
-				      tgsi_src(vpc, fsrc), none, none);
+				src[i] = nvfx_src(temp(vpc));
+				nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none));
 			}
 			break;
 		case TGSI_FILE_CONSTANT:
@@ -447,9 +438,8 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 				ci = fsrc->Register.Index;
 				src[i] = tgsi_src(vpc, fsrc);
 			} else {
-				src[i] = temp(vpc);
-				arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL,
-				      tgsi_src(vpc, fsrc), none, none);
+				src[i] = nvfx_src(temp(vpc));
+				nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none));
 			}
 			break;
 		case TGSI_FILE_IMMEDIATE:
@@ -458,9 +448,8 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 				ii = fsrc->Register.Index;
 				src[i] = tgsi_src(vpc, fsrc);
 			} else {
-				src[i] = temp(vpc);
-				arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL,
-				      tgsi_src(vpc, fsrc), none, none);
+				src[i] = nvfx_src(temp(vpc));
+				nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none));
 			}
 			break;
 		case TGSI_FILE_TEMPORARY:
@@ -477,128 +466,231 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 
 	switch (finst->Instruction.Opcode) {
 	case TGSI_OPCODE_ABS:
-		arith(vpc, VEC, MOV, dst, mask, abs(src[0]), none, none);
+		nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, abs(src[0]), none, none));
 		break;
 	case TGSI_OPCODE_ADD:
-		arith(vpc, VEC, ADD, dst, mask, src[0], none, src[1]);
+		nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, src[0], none, src[1]));
 		break;
 	case TGSI_OPCODE_ARL:
-		arith(vpc, VEC, ARL, dst, mask, src[0], none, none);
+		nvfx_vp_emit(vpc, arith(VEC, ARL, dst, mask, src[0], none, none));
+		break;
+	case TGSI_OPCODE_CMP:
+		insn = arith(VEC, MOV, none.reg, mask, src[0], none, none);
+		insn.cc_update = 1;
+		nvfx_vp_emit(vpc, insn);
+
+		insn = arith(VEC, MOV, dst, mask, src[2], none, none);
+		insn.cc_test = NVFX_COND_GE;
+		nvfx_vp_emit(vpc, insn);
+
+		insn = arith(VEC, MOV, dst, mask, src[1], none, none);
+		insn.cc_test = NVFX_COND_LT;
+		nvfx_vp_emit(vpc, insn);
 		break;
 	case TGSI_OPCODE_COS:
-		arith(vpc, SCA, COS, dst, mask, none, none, src[0]);
+		nvfx_vp_emit(vpc, arith(SCA, COS, dst, mask, none, none, src[0]));
 		break;
+        case TGSI_OPCODE_DP2:
+                tmp = nvfx_src(temp(vpc));
+                nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, NVFX_VP_MASK_X | NVFX_VP_MASK_Y, src[0], src[1], none));
+                nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, swz(tmp, X, X, X, X), swz(tmp, Y, Y, Y, Y), none));
+                break;
 	case TGSI_OPCODE_DP3:
-		arith(vpc, VEC, DP3, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, DP3, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_DP4:
-		arith(vpc, VEC, DP4, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, DP4, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_DPH:
-		arith(vpc, VEC, DPH, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, DPH, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_DST:
-		arith(vpc, VEC, DST, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, DST, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_EX2:
-		arith(vpc, SCA, EX2, dst, mask, none, none, src[0]);
+		nvfx_vp_emit(vpc, arith(SCA, EX2, dst, mask, none, none, src[0]));
 		break;
 	case TGSI_OPCODE_EXP:
-		arith(vpc, SCA, EXP, dst, mask, none, none, src[0]);
+		nvfx_vp_emit(vpc, arith(SCA, EXP, dst, mask, none, none, src[0]));
 		break;
 	case TGSI_OPCODE_FLR:
-		arith(vpc, VEC, FLR, dst, mask, src[0], none, none);
+		nvfx_vp_emit(vpc, arith(VEC, FLR, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_FRC:
-		arith(vpc, VEC, FRC, dst, mask, src[0], none, none);
+		nvfx_vp_emit(vpc, arith(VEC, FRC, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_LG2:
-		arith(vpc, SCA, LG2, dst, mask, none, none, src[0]);
+		nvfx_vp_emit(vpc, arith(SCA, LG2, dst, mask, none, none, src[0]));
 		break;
 	case TGSI_OPCODE_LIT:
-		arith(vpc, SCA, LIT, dst, mask, none, none, src[0]);
+		nvfx_vp_emit(vpc, arith(SCA, LIT, dst, mask, none, none, src[0]));
 		break;
 	case TGSI_OPCODE_LOG:
-		arith(vpc, SCA, LOG, dst, mask, none, none, src[0]);
+		nvfx_vp_emit(vpc, arith(SCA, LOG, dst, mask, none, none, src[0]));
 		break;
 	case TGSI_OPCODE_LRP:
-		tmp = temp(vpc);
-		arith(vpc, VEC, MAD, tmp, mask, neg(src[0]), src[2], src[2]);
-		arith(vpc, VEC, MAD, dst, mask, src[0], src[1], tmp);
+		tmp = nvfx_src(temp(vpc));
+		nvfx_vp_emit(vpc, arith(VEC, MAD, tmp.reg, mask, neg(src[0]), src[2], src[2]));
+		nvfx_vp_emit(vpc, arith(VEC, MAD, dst, mask, src[0], src[1], tmp));
 		break;
 	case TGSI_OPCODE_MAD:
-		arith(vpc, VEC, MAD, dst, mask, src[0], src[1], src[2]);
+		nvfx_vp_emit(vpc, arith(VEC, MAD, dst, mask, src[0], src[1], src[2]));
 		break;
 	case TGSI_OPCODE_MAX:
-		arith(vpc, VEC, MAX, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, MAX, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_MIN:
-		arith(vpc, VEC, MIN, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, MIN, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_MOV:
-		arith(vpc, VEC, MOV, dst, mask, src[0], none, none);
+		nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_MUL:
-		arith(vpc, VEC, MUL, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, MUL, dst, mask, src[0], src[1], none));
+		break;
+	case TGSI_OPCODE_NOP:
 		break;
 	case TGSI_OPCODE_POW:
-		tmp = temp(vpc);
-		arith(vpc, SCA, LG2, tmp, NVFX_VP_MASK_X, none, none,
-		      swz(src[0], X, X, X, X));
-		arith(vpc, VEC, MUL, tmp, NVFX_VP_MASK_X, swz(tmp, X, X, X, X),
-		      swz(src[1], X, X, X, X), none);
-		arith(vpc, SCA, EX2, dst, mask, none, none,
-		      swz(tmp, X, X, X, X));
+		tmp = nvfx_src(temp(vpc));
+		nvfx_vp_emit(vpc, arith(SCA, LG2, tmp.reg, NVFX_VP_MASK_X, none, none, swz(src[0], X, X, X, X)));
+		nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, NVFX_VP_MASK_X, swz(tmp, X, X, X, X), swz(src[1], X, X, X, X), none));
+		nvfx_vp_emit(vpc, arith(SCA, EX2, dst, mask, none, none, swz(tmp, X, X, X, X)));
 		break;
 	case TGSI_OPCODE_RCP:
-		arith(vpc, SCA, RCP, dst, mask, none, none, src[0]);
-		break;
-	case TGSI_OPCODE_RET:
+		nvfx_vp_emit(vpc, arith(SCA, RCP, dst, mask, none, none, src[0]));
 		break;
 	case TGSI_OPCODE_RSQ:
-		arith(vpc, SCA, RSQ, dst, mask, none, none, abs(src[0]));
+		nvfx_vp_emit(vpc, arith(SCA, RSQ, dst, mask, none, none, abs(src[0])));
 		break;
 	case TGSI_OPCODE_SEQ:
-		arith(vpc, VEC, SEQ, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, SEQ, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SFL:
-		arith(vpc, VEC, SFL, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, SFL, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SGE:
-		arith(vpc, VEC, SGE, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, SGE, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SGT:
-		arith(vpc, VEC, SGT, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, SGT, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SIN:
-		arith(vpc, SCA, SIN, dst, mask, none, none, src[0]);
+		nvfx_vp_emit(vpc, arith(SCA, SIN, dst, mask, none, none, src[0]));
 		break;
 	case TGSI_OPCODE_SLE:
-		arith(vpc, VEC, SLE, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, SLE, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SLT:
-		arith(vpc, VEC, SLT, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, SLT, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SNE:
-		arith(vpc, VEC, SNE, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, SNE, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SSG:
-		arith(vpc, VEC, SSG, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, SSG, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_STR:
-		arith(vpc, VEC, STR, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, STR, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SUB:
-		arith(vpc, VEC, ADD, dst, mask, src[0], none, neg(src[1]));
+		nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, src[0], none, neg(src[1])));
 		break;
+        case TGSI_OPCODE_TRUNC:
+                tmp = nvfx_src(temp(vpc));
+                insn = arith(VEC, MOV, none.reg, mask, src[0], none, none);
+                insn.cc_update = 1;
+                nvfx_vp_emit(vpc, insn);
+
+                nvfx_vp_emit(vpc, arith(VEC, FLR, tmp.reg, mask, abs(src[0]), none, none));
+                nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, tmp, none, none));
+
+                insn = arith(VEC, MOV, dst, mask, neg(tmp), none, none);
+                insn.cc_test = NVFX_COND_LT;
+                nvfx_vp_emit(vpc, insn);
+                break;
 	case TGSI_OPCODE_XPD:
-		tmp = temp(vpc);
-		arith(vpc, VEC, MUL, tmp, mask,
-		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
-		arith(vpc, VEC, MAD, dst, (mask & ~NVFX_VP_MASK_W),
-		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
-		      neg(tmp));
+		tmp = nvfx_src(temp(vpc));
+		nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none));
+		nvfx_vp_emit(vpc, arith(VEC, MAD, dst, (mask & ~NVFX_VP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp)));
+		break;
+
+	case TGSI_OPCODE_IF:
+		insn = arith(VEC, MOV, none.reg, NVFX_VP_MASK_X, src[0], none, none);
+		insn.cc_update = 1;
+		nvfx_vp_emit(vpc, insn);
+
+		reloc.location = vpc->vp->nr_insns;
+		reloc.target = finst->Label.Label + 1;
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+
+		insn = arith(SCA, BRA, none.reg, 0, none, none, none);
+		insn.cc_test = NVFX_COND_EQ;
+		insn.cc_swz[0] = insn.cc_swz[1] = insn.cc_swz[2] = insn.cc_swz[3] = 0;
+		nvfx_vp_emit(vpc, insn);
 		break;
+
+	case TGSI_OPCODE_ELSE:
+	case TGSI_OPCODE_BRA:
+	case TGSI_OPCODE_CAL:
+		reloc.location = vpc->vp->nr_insns;
+		reloc.target = finst->Label.Label;
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+
+		if(finst->Instruction.Opcode == TGSI_OPCODE_CAL)
+			insn = arith(SCA, CAL, none.reg, 0, none, none, none);
+		else
+			insn = arith(SCA, BRA, none.reg, 0, none, none, none);
+		nvfx_vp_emit(vpc, insn);
+		break;
+
+	case TGSI_OPCODE_RET:
+		tmp = none;
+		tmp.swz[0] = tmp.swz[1] = tmp.swz[2] = tmp.swz[3] = 0;
+		nvfx_vp_emit(vpc, arith(SCA, RET, none.reg, 0, none, none, tmp));
+		break;
+
+	case TGSI_OPCODE_BGNSUB:
+	case TGSI_OPCODE_ENDSUB:
+	case TGSI_OPCODE_ENDIF:
+		/* nothing to do here */
+		break;
+
+	case TGSI_OPCODE_BGNLOOP:
+		loop.cont_target = idx;
+		loop.brk_target = finst->Label.Label + 1;
+		util_dynarray_append(&vpc->loop_stack, struct nvfx_loop_entry, loop);
+		break;
+
+	case TGSI_OPCODE_ENDLOOP:
+		loop = util_dynarray_pop(&vpc->loop_stack, struct nvfx_loop_entry);
+
+		reloc.location = vpc->vp->nr_insns;
+		reloc.target = loop.cont_target;
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+
+		nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
+		break;
+
+	case TGSI_OPCODE_CONT:
+		loop = util_dynarray_top(&vpc->loop_stack, struct nvfx_loop_entry);
+
+		reloc.location = vpc->vp->nr_insns;
+		reloc.target = loop.cont_target;
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+
+		nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
+		break;
+
+	case TGSI_OPCODE_BRK:
+		loop = util_dynarray_top(&vpc->loop_stack, struct nvfx_loop_entry);
+
+		reloc.location = vpc->vp->nr_insns;
+		reloc.target = loop.brk_target;
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+
+		nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
+		break;
+
 	default:
 		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
 		return FALSE;
@@ -649,12 +741,8 @@ nvfx_vertprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 		hw = NVFX_VP(INST_DEST_PSZ);
 		break;
 	case TGSI_SEMANTIC_GENERIC:
-		if (fdec->Semantic.Index <= 7) {
-			hw = NVFX_VP(INST_DEST_TC(fdec->Semantic.Index));
-		} else {
-			NOUVEAU_ERR("bad generic semantic index\n");
-			return FALSE;
-		}
+		hw = (vpc->vp->generic_to_fp_input[fdec->Semantic.Index] & 0xf)
+			+ NVFX_VP(INST_DEST_TC(0)) - NVFX_FP_OP_INPUT_SRC_TC(0);
 		break;
 	case TGSI_SEMANTIC_EDGEFLAG:
 		/* not really an error just a fallback */
@@ -665,7 +753,7 @@ nvfx_vertprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 		return FALSE;
 	}
 
-	vpc->r_result[idx] = nvfx_sr(NVFXSR_OUTPUT, hw);
+	vpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
 	return TRUE;
 }
 
@@ -674,6 +762,36 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
 {
 	struct tgsi_parse_context p;
 	int high_temp = -1, high_addr = -1, nr_imm = 0, i;
+	struct util_semantic_set set;
+	unsigned char sem_layout[8];
+	unsigned num_outputs;
+
+	num_outputs = util_semantic_set_from_program_file(&set, vpc->vp->pipe.tokens, TGSI_FILE_OUTPUT);
+
+	if(num_outputs > 8) {
+		NOUVEAU_ERR("too many vertex program outputs: %i\n", num_outputs);
+		return FALSE;
+	}
+	util_semantic_layout_from_set(sem_layout, &set, 8, 8);
+
+	/* hope 0xf is (0, 0, 0, 1) initialized; otherwise, we are _probably_ not required to do this */
+	memset(vpc->vp->generic_to_fp_input, 0x0f, sizeof(vpc->vp->generic_to_fp_input));
+	for(int i = 0; i < 8; ++i) {
+		if(sem_layout[i] == 0xff)
+			continue;
+		//printf("vp: GENERIC[%i] to fpreg %i\n", sem_layout[i], NVFX_FP_OP_INPUT_SRC_TC(0) + i);
+		vpc->vp->generic_to_fp_input[sem_layout[i]] = 0xf0 | NVFX_FP_OP_INPUT_SRC_TC(i);
+	}
+
+	vpc->vp->sprite_fp_input = -1;
+	for(int i = 0; i < 8; ++i)
+	{
+		if(sem_layout[i] == 0xff)
+		{
+			vpc->vp->sprite_fp_input = NVFX_FP_OP_INPUT_SRC_TC(i);
+			break;
+		}
+	}
 
 	tgsi_parse_init(&p, vpc->vp->pipe.tokens);
 	while (!tgsi_parse_end_of_tokens(&p)) {
@@ -737,18 +855,18 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
 	tgsi_parse_free(&p);
 
 	if (nr_imm) {
-		vpc->imm = CALLOC(nr_imm, sizeof(struct nvfx_sreg));
+		vpc->imm = CALLOC(nr_imm, sizeof(struct nvfx_reg));
 		assert(vpc->imm);
 	}
 
 	if (++high_temp) {
-		vpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_sreg));
+		vpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_reg));
 		for (i = 0; i < high_temp; i++)
 			vpc->r_temp[i] = temp(vpc);
 	}
 
 	if (++high_addr) {
-		vpc->r_address = CALLOC(high_addr, sizeof(struct nvfx_sreg));
+		vpc->r_address = CALLOC(high_addr, sizeof(struct nvfx_reg));
 		for (i = 0; i < high_addr; i++)
 			vpc->r_address[i] = temp(vpc);
 	}
@@ -757,20 +875,31 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
 	return TRUE;
 }
 
+DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_vp, "NVFX_DUMP_VP", FALSE)
+
 static void
 nvfx_vertprog_translate(struct nvfx_context *nvfx,
 			struct nvfx_vertex_program *vp)
 {
 	struct tgsi_parse_context parse;
 	struct nvfx_vpc *vpc = NULL;
-	struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0);
+	struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
+	struct util_dynarray insns;
 	int i;
 
 	vpc = CALLOC(1, sizeof(struct nvfx_vpc));
 	if (!vpc)
 		return;
+	vpc->nvfx = nvfx;
 	vpc->vp = vp;
 
+	/* reserve space for ucps */
+	if(nvfx->use_vp_clipping)
+	{
+		for(i = 0; i < 6; ++i)
+			constant(vpc, -1, 0, 0, 0, 0);
+	}
+
 	if (!nvfx_vertprog_prepare(nvfx, vpc)) {
 		FREE(vpc);
 		return;
@@ -780,13 +909,15 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 	 * planes are enabled.  We need to append code to the vtxprog
 	 * to handle clip planes later.
 	 */
-	if (vp->ucp.nr)  {
+	/* TODO: maybe support patching this depending on whether there are ucps; not sure it really matters much */
+	if (nvfx->use_vp_clipping)  {
 		vpc->r_result[vpc->hpos_idx] = temp(vpc);
 		vpc->r_temps_discard = 0;
 	}
 
 	tgsi_parse_init(&parse, vp->pipe.tokens);
 
+	util_dynarray_init(&insns);
 	while (!tgsi_parse_end_of_tokens(&parse)) {
 		tgsi_parse_token(&parse);
 
@@ -809,8 +940,10 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
 		{
 			const struct tgsi_full_instruction *finst;
+			unsigned idx = insns.size >> 2;
+			util_dynarray_append(&insns, unsigned, vp->nr_insns);
 			finst = &parse.FullToken.FullInstruction;
-			if (!nvfx_vertprog_parse_instruction(nvfx, vpc, finst))
+			if (!nvfx_vertprog_parse_instruction(nvfx, vpc, idx, finst))
 				goto out_err;
 		}
 			break;
@@ -819,43 +952,87 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 		}
 	}
 
+	util_dynarray_append(&insns, unsigned, vp->nr_insns);
+
+	for(unsigned i = 0; i < vpc->label_relocs.size; i += sizeof(struct nvfx_relocation))
+	{
+		struct nvfx_relocation* label_reloc = (struct nvfx_relocation*)((char*)vpc->label_relocs.data + i);
+		struct nvfx_relocation hw_reloc;
+
+		hw_reloc.location = label_reloc->location;
+		hw_reloc.target = ((unsigned*)insns.data)[label_reloc->target];
+
+		//debug_printf("hw %u -> tgsi %u = hw %u\n", hw_reloc.location, label_reloc->target, hw_reloc.target);
+
+		util_dynarray_append(&vp->branch_relocs, struct nvfx_relocation, hw_reloc);
+	}
+	util_dynarray_fini(&insns);
+	util_dynarray_trim(&vp->branch_relocs);
+
+	/* XXX: what if we add a RET before?!  make sure we jump here...*/
+
 	/* Write out HPOS if it was redirected to a temp earlier */
 	if (vpc->r_result[vpc->hpos_idx].type != NVFXSR_OUTPUT) {
-		struct nvfx_sreg hpos = nvfx_sr(NVFXSR_OUTPUT,
+		struct nvfx_reg hpos = nvfx_reg(NVFXSR_OUTPUT,
 						NVFX_VP(INST_DEST_POS));
-		struct nvfx_sreg htmp = vpc->r_result[vpc->hpos_idx];
+		struct nvfx_src htmp = nvfx_src(vpc->r_result[vpc->hpos_idx]);
 
-		arith(vpc, VEC, MOV, hpos, NVFX_VP_MASK_ALL, htmp, none, none);
+		nvfx_vp_emit(vpc, arith(VEC, MOV, hpos, NVFX_VP_MASK_ALL, htmp, none, none));
 	}
 
 	/* Insert code to handle user clip planes */
-	for (i = 0; i < vp->ucp.nr; i++) {
-		struct nvfx_sreg cdst = nvfx_sr(NVFXSR_OUTPUT,
-						NVFX_VP_INST_DEST_CLIP(i));
-		struct nvfx_sreg ceqn = constant(vpc, -1,
-						 nvfx->clip.ucp[i][0],
-						 nvfx->clip.ucp[i][1],
-						 nvfx->clip.ucp[i][2],
-						 nvfx->clip.ucp[i][3]);
-		struct nvfx_sreg htmp = vpc->r_result[vpc->hpos_idx];
-		unsigned mask;
-
-		switch (i) {
-		case 0: case 3: mask = NVFX_VP_MASK_Y; break;
-		case 1: case 4: mask = NVFX_VP_MASK_Z; break;
-		case 2: case 5: mask = NVFX_VP_MASK_W; break;
-		default:
-			NOUVEAU_ERR("invalid clip dist #%d\n", i);
-			goto out_err;
+	if(nvfx->use_vp_clipping)
+	{
+		for (i = 0; i < 6; i++) {
+			struct nvfx_reg cdst = nvfx_reg(NVFXSR_OUTPUT, NV30_VP_INST_DEST_CLP(i));
+			struct nvfx_src ceqn = nvfx_src(nvfx_reg(NVFXSR_CONST, i));
+			struct nvfx_src htmp = nvfx_src(vpc->r_result[vpc->hpos_idx]);
+			unsigned mask;
+
+			if(nvfx->is_nv4x)
+			{
+				switch (i) {
+				case 0: case 3: mask = NVFX_VP_MASK_Y; break;
+				case 1: case 4: mask = NVFX_VP_MASK_Z; break;
+				case 2: case 5: mask = NVFX_VP_MASK_W; break;
+				default:
+					NOUVEAU_ERR("invalid clip dist #%d\n", i);
+					goto out_err;
+				}
+			}
+			else
+				mask = NVFX_VP_MASK_X;
+
+			nvfx_vp_emit(vpc, arith(VEC, DP4, cdst, mask, htmp, ceqn, none));
 		}
+	}
+	else
+	{
+		if(vp->nr_insns)
+			vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
 
-		arith(vpc, VEC, DP4, cdst, mask, htmp, ceqn, none);
+		nvfx_vp_emit(vpc, arith(VEC, NOP, none.reg, 0, none, none, none));
+		vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
 	}
 
-	vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
+	if(debug_get_option_nvfx_dump_vp())
+	{
+		debug_printf("\n");
+		tgsi_dump(vp->pipe.tokens, 0);
+
+		debug_printf("\n%s vertex program:\n", nvfx->is_nv4x ? "nv4x" : "nv3x");
+		for (i = 0; i < vp->nr_insns; i++)
+			debug_printf("%3u: %08x %08x %08x %08x\n", i, vp->insns[i].data[0], vp->insns[i].data[1], vp->insns[i].data[2], vp->insns[i].data[3]);
+		debug_printf("\n");
+	}
+
+	vp->clip_nr = -1;
+	vp->exec_start = -1;
 	vp->translated = TRUE;
 out_err:
 	tgsi_parse_free(&parse);
+	util_dynarray_fini(&vpc->label_relocs);
+	util_dynarray_fini(&vpc->loop_stack);
 	if (vpc->r_temp)
 		FREE(vpc->r_temp);
 	if (vpc->r_address)
@@ -868,26 +1045,17 @@ out_err:
 boolean
 nvfx_vertprog_validate(struct nvfx_context *nvfx)
 {
-	struct pipe_context *pipe = &nvfx->pipe;
 	struct nvfx_screen *screen = nvfx->screen;
 	struct nouveau_channel *chan = screen->base.channel;
 	struct nouveau_grobj *eng3d = screen->eng3d;
 	struct nvfx_vertex_program *vp;
 	struct pipe_resource *constbuf;
-	struct pipe_transfer *transfer = NULL;
 	boolean upload_code = FALSE, upload_data = FALSE;
 	int i;
 
 	if (nvfx->render_mode == HW) {
 		vp = nvfx->vertprog;
 		constbuf = nvfx->constbuf[PIPE_SHADER_VERTEX];
-
-		// TODO: ouch! can't we just use constant slots for these?!
-		if ((nvfx->dirty & NVFX_NEW_UCP) ||
-		    memcmp(&nvfx->clip, &vp->ucp, sizeof(vp->ucp))) {
-			nvfx_vertprog_destroy(nvfx, vp);
-			memcpy(&vp->ucp, &nvfx->clip, sizeof(vp->ucp));
-		}
 	} else {
 		vp = nvfx->swtnl.vertprog;
 		constbuf = NULL;
@@ -918,7 +1086,11 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 			}
 
 			if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec))
-				assert(0);
+			{
+				debug_printf("Vertex shader too long: %u instructions\n", vplen);
+				nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
+				return FALSE;
+			}
 		}
 
 		upload_code = TRUE;
@@ -937,7 +1109,11 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 			}
 
 			if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data))
-				assert(0);
+                        {
+                                debug_printf("Vertex shader uses too many constants: %u constants\n", vp->nr_consts);
+                                nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
+                                return FALSE;
+                        }
 		}
 
 		/*XXX: handle this some day */
@@ -952,44 +1128,57 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 	 * fixup offsets and register IDs.
 	 */
 	if (vp->exec_start != vp->exec->start) {
-		for (i = 0; i < vp->nr_insns; i++) {
-			struct nvfx_vertex_program_exec *vpi = &vp->insns[i];
+		//printf("vp_relocs %u -> %u\n", vp->exec_start, vp->exec->start);
+		for(unsigned i = 0; i < vp->branch_relocs.size; i += sizeof(struct nvfx_relocation))
+		{
+			struct nvfx_relocation* reloc = (struct nvfx_relocation*)((char*)vp->branch_relocs.data + i);
+			uint32_t* hw = vp->insns[reloc->location].data;
+			unsigned target = vp->exec->start + reloc->target;
+
+			//debug_printf("vp_reloc hw %u -> hw %u\n", reloc->location, target);
 
-			if (vpi->has_branch_offset) {
-				assert(0);
+			if(!nvfx->is_nv4x)
+			{
+				hw[2] &=~ NV30_VP_INST_IADDR_MASK;
+				hw[2] |= (target & 0x1ff) << NV30_VP_INST_IADDR_SHIFT;
+			}
+			else
+			{
+				hw[3] &=~ NV40_VP_INST_IADDRL_MASK;
+				hw[3] |= (target & 7) << NV40_VP_INST_IADDRL_SHIFT;
+
+				hw[2] &=~ NV40_VP_INST_IADDRH_MASK;
+				hw[2] |= ((target >> 3) & 0x3f) << NV40_VP_INST_IADDRH_SHIFT;
 			}
 		}
 
 		vp->exec_start = vp->exec->start;
 	}
 
-	if (vp->nr_consts && vp->data_start != vp->data->start) {
-		for (i = 0; i < vp->nr_insns; i++) {
-			struct nvfx_vertex_program_exec *vpi = &vp->insns[i];
+	if (vp->data_start != vp->data->start) {
+		for(unsigned i = 0; i < vp->const_relocs.size; i += sizeof(struct nvfx_relocation))
+		{
+			struct nvfx_relocation* reloc = (struct nvfx_relocation*)((char*)vp->const_relocs.data + i);
+			struct nvfx_vertex_program_exec *vpi = &vp->insns[reloc->location];
 
-			if (vpi->const_index >= 0) {
-				vpi->data[1] &= ~NVFX_VP(INST_CONST_SRC_MASK);
-				vpi->data[1] |=
-					(vpi->const_index + vp->data->start) <<
+			vpi->data[1] &= ~NVFX_VP(INST_CONST_SRC_MASK);
+			vpi->data[1] |=
+					(reloc->target + vp->data->start) <<
 					NVFX_VP(INST_CONST_SRC_SHIFT);
-
-			}
 		}
 
 		vp->data_start = vp->data->start;
+		upload_code = TRUE;
 	}
 
 	/* Update + Upload constant values */
 	if (vp->nr_consts) {
 		float *map = NULL;
 
-		if (constbuf) {
-			map = pipe_buffer_map(pipe, constbuf,
-					      PIPE_TRANSFER_READ,
-					      &transfer);
-		}
+		if (constbuf)
+			map = (float*)nvfx_buffer(constbuf)->data;
 
-		for (i = 0; i < vp->nr_consts; i++) {
+		for (i = nvfx->use_vp_clipping ? 6 : 0; i < vp->nr_consts; i++) {
 			struct nvfx_vertex_program_data *vpd = &vp->consts[i];
 
 			if (vpd->index >= 0) {
@@ -1005,41 +1194,28 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 			OUT_RING  (chan, i + vp->data->start);
 			OUT_RINGp (chan, (uint32_t *)vpd->value, 4);
 		}
-
-		if (constbuf)
-			pipe_buffer_unmap(pipe, constbuf, transfer);
 	}
 
 	/* Upload vtxprog */
 	if (upload_code) {
-#if 0
-		for (i = 0; i < vp->nr_insns; i++) {
-			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[0]);
-			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[1]);
-			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[2]);
-			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]);
-		}
-#endif
 		BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_FROM_ID, 1);
 		OUT_RING  (chan, vp->exec->start);
 		for (i = 0; i < vp->nr_insns; i++) {
 			BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_INST(0), 4);
 			OUT_RINGp (chan, vp->insns[i].data, 4);
 		}
+		vp->clip_nr = -1;
 	}
 
-	if(nvfx->dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP))
+	if(nvfx->dirty & (NVFX_NEW_VERTPROG))
 	{
-		WAIT_RING(chan, 7);
+		WAIT_RING(chan, 6);
 		OUT_RING(chan, RING_3D(NV34TCL_VP_START_FROM_ID, 1));
 		OUT_RING(chan, vp->exec->start);
 		if(nvfx->is_nv4x) {
-			OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 2));
+			OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 1));
 			OUT_RING(chan, vp->ir);
-			OUT_RING(chan, vp->or);
 		}
-		OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1));
-		OUT_RING(chan, vp->clip_ctrl);
 	}
 
 	return TRUE;
@@ -1048,25 +1224,100 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
+/*
+ * Release the resources owned by a vertex program: the instruction and
+ * constant arrays, the exec and data nouveau_resource allocations, and both
+ * relocation lists.
+ *
+ * The structure itself is not freed and the freed pointers are not cleared;
+ * nvfx_vp_state_delete() frees vp right after calling this.
+ */
 void
 nvfx_vertprog_destroy(struct nvfx_context *nvfx, struct nvfx_vertex_program *vp)
 {
-	vp->translated = FALSE;
-
-	if (vp->nr_insns) {
+	if (vp->nr_insns)
 		FREE(vp->insns);
-		vp->insns = NULL;
-		vp->nr_insns = 0;
-	}
 
-	if (vp->nr_consts) {
+	if (vp->nr_consts)
 		FREE(vp->consts);
-		vp->consts = NULL;
-		vp->nr_consts = 0;
-	}
 
 	nouveau_resource_free(&vp->exec);
-	vp->exec_start = 0;
 	nouveau_resource_free(&vp->data);
-	vp->data_start = 0;
-	vp->data_start_min = 0;
 
-	vp->ir = vp->or = vp->clip_ctrl = 0;
+	util_dynarray_fini(&vp->branch_relocs);
+	util_dynarray_fini(&vp->const_relocs);
+}
+
+/*
+ * create_vs_state hook: builds the driver's vertex program object for cso.
+ *
+ * The TGSI tokens are duplicated so the program owns its own copy, a
+ * draw-module vertex shader is created from them, and the program gets the
+ * next value of a function-static counter as its id.
+ *
+ * Returns the new program, or NULL if allocating the program or the token
+ * copy fails.
+ */
+static void *
+nvfx_vp_state_create(struct pipe_context *pipe,
+                     const struct pipe_shader_state *cso)
+{
+        struct nvfx_context *nvfx = nvfx_context(pipe);
+        struct nvfx_vertex_program *vp;
+
+        // TODO: use a 64-bit atomic here!
+        static unsigned long long id = 0;
+
+        vp = CALLOC(1, sizeof(struct nvfx_vertex_program));
+        if (!vp)
+                return NULL;
+
+        /* Without a token copy there is nothing to translate or hand to draw. */
+        vp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+        if (!vp->pipe.tokens) {
+                FREE(vp);
+                return NULL;
+        }
+
+        vp->draw = draw_create_vertex_shader(nvfx->draw, &vp->pipe);
+        vp->id = ++id;
+
+        return (void *)vp;
+}
+
+/*
+ * bind_vs_state hook: makes hwcso the current vertex program and flags
+ * NVFX_NEW_VERTPROG in both nvfx->dirty and nvfx->draw_dirty.
+ */
+static void
+nvfx_vp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+        struct nvfx_context *nvfx = nvfx_context(pipe);
+
+        nvfx->vertprog = hwcso;
+        nvfx->dirty |= NVFX_NEW_VERTPROG;
+        nvfx->draw_dirty |= NVFX_NEW_VERTPROG;
+}
+
+/*
+ * delete_vs_state hook: deletes the draw-module shader, releases the
+ * program's resources with nvfx_vertprog_destroy(), then frees the token
+ * copy and the program itself.
+ */
+static void
+nvfx_vp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+        struct nvfx_context *nvfx = nvfx_context(pipe);
+        struct nvfx_vertex_program *vp = hwcso;
+
+        draw_delete_vertex_shader(nvfx->draw, vp->draw);
+        nvfx_vertprog_destroy(nvfx, vp);
+        FREE((void*)vp->pipe.tokens);
+        FREE(vp);
+}
+
+/* Install the vertex-shader state hooks defined above in nvfx->pipe. */
+void
+nvfx_init_vertprog_functions(struct nvfx_context *nvfx)
+{
+        nvfx->pipe.create_vs_state = nvfx_vp_state_create;
+        nvfx->pipe.bind_vs_state = nvfx_vp_state_bind;
+        nvfx->pipe.delete_vs_state = nvfx_vp_state_delete;
 }