37 files changed, 8154 insertions, 0 deletions
diff --git a/src/gallium/drivers/nvfx/Makefile b/src/gallium/drivers/nvfx/Makefile
new file mode 100644
index 0000000000..c1d57ca396
--- /dev/null
+++ b/src/gallium/drivers/nvfx/Makefile
@@ -0,0 +1,37 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = nvfx
+
+C_SOURCES = \
+	nv04_surface_2d.c \
+	nvfx_buffer.c \
+	nvfx_context.c \
+	nvfx_clear.c \
+	nvfx_draw.c \
+	nvfx_fragprog.c \
+	nvfx_fragtex.c \
+	nv30_fragtex.c \
+	nv40_fragtex.c \
+	nvfx_miptree.c \
+	nvfx_query.c \
+	nvfx_resource.c \
+	nvfx_screen.c \
+	nvfx_state.c \
+	nvfx_state_blend.c \
+	nvfx_state_emit.c \
+	nvfx_state_fb.c \
+	nvfx_state_rasterizer.c \
+	nvfx_state_scissor.c \
+	nvfx_state_stipple.c \
+	nvfx_state_viewport.c \
+	nvfx_state_zsa.c \
+	nvfx_surface.c \
+	nvfx_transfer.c \
+	nvfx_vbo.c \
+	nvfx_vertprog.c
+
+LIBRARY_INCLUDES = \
+	-I$(TOP)/src/gallium/drivers/nouveau/include
+
+include ../../Makefile.template
diff --git a/src/gallium/drivers/nvfx/nv04_surface_2d.c b/src/gallium/drivers/nvfx/nv04_surface_2d.c
new file mode 100644
index 0000000000..4ed574227d
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv04_surface_2d.c
@@ -0,0 +1,535 @@
+#include "pipe/p_context.h"
+#include "pipe/p_format.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_util.h"
+#include "nouveau/nouveau_screen.h"
+#include "nv04_surface_2d.h"
+/* Map a gallium pipe format to an NV04_CONTEXT_SURFACES_2D_FORMAT_* value; returns -1 if there is no mapping. */
+static INLINE int
+nv04_surface_format(enum pipe_format format)
+{
+	switch (format) {
+	case PIPE_FORMAT_A8_UNORM:
+	case PIPE_FORMAT_L8_UNORM:
+	case PIPE_FORMAT_I8_UNORM:
+		return NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
+	case PIPE_FORMAT_R16_SNORM:
+	case PIPE_FORMAT_B5G6R5_UNORM:
+	case PIPE_FORMAT_Z16_UNORM:
+	case PIPE_FORMAT_L8A8_UNORM:
+		return NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5; /* shared by all four formats above */
+	case PIPE_FORMAT_B8G8R8X8_UNORM:
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+		return NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8;
+	case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+	case PIPE_FORMAT_X8Z24_UNORM:
+		return NV04_CONTEXT_SURFACES_2D_FORMAT_Y32;
+	default:
+		return -1; /* no mapping; copy_blit() and fill() test for this, copy_swizzle() does not */
+	}
+}
+/* Map a gallium pipe format to an NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_* value; returns -1 if there is no mapping. */
+static INLINE int
+nv04_rect_format(enum pipe_format format)
+{
+	switch (format) {
+	case PIPE_FORMAT_A8_UNORM:
+		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
+	case PIPE_FORMAT_B5G6R5_UNORM:
+	case PIPE_FORMAT_L8A8_UNORM:
+	case PIPE_FORMAT_Z16_UNORM:
+		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5;
+	case PIPE_FORMAT_B8G8R8X8_UNORM:
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+	case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+	case PIPE_FORMAT_X8Z24_UNORM:
+		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
+	default:
+		return -1; /* includes L8, I8 and R16_SNORM, which nv04_surface_format() does accept */
+	}
+}
+/* Map a gallium pipe format to an NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_* value; returns -1 if there is no mapping. */
+static INLINE int
+nv04_scaled_image_format(enum pipe_format format)
+{
+	switch (format) {
+	case PIPE_FORMAT_A8_UNORM:
+	case PIPE_FORMAT_L8_UNORM:
+	case PIPE_FORMAT_I8_UNORM:
+		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_Y8;
+	case PIPE_FORMAT_B5G5R5A1_UNORM:
+		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5;
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8;
+	case PIPE_FORMAT_B8G8R8X8_UNORM:
+		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8;
+	case PIPE_FORMAT_B5G6R5_UNORM:
+	case PIPE_FORMAT_R16_SNORM:
+	case PIPE_FORMAT_L8A8_UNORM:
+		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5;
+	default:
+		return -1; /* NOTE(review): the only caller visible here emits this value unchecked */
+	}
+}
+
+/*
+ * Swizzled index of position (x, y) inside a square tile.
+ *
+ * The low 12 bits of the two coordinates are interleaved bit by bit:
+ * bit i of x is moved to bit 2*i of the result and bit i of y is moved to
+ * bit 2*i + 1, so the result occupies at most 24 bits.  Coordinate bits
+ * above bit 11 are ignored.
+ * Example: x = 3, y = 0 gives 0x5; x = 0, y = 3 gives 0xa.
+ *
+ * x, y: coordinates within the tile.
+ * Returns the interleaved index.
+ */
+static INLINE unsigned
+nv04_swizzle_bits_square(unsigned x, unsigned y)
+{
+	unsigned index = 0;
+	unsigned bit;
+
+	for (bit = 0; bit < 12; bit++) {
+		const unsigned mask = 1u << bit;
+
+		/* The x bit sits at position 'bit'; shifting by 'bit' doubles that. */
+		index |= (x & mask) << bit;
+
+		/* The y bit goes one position above the matching x bit. */
+		index |= (y & mask) << (bit + 1);
+	}
+
+	return index;
+}
+
+/* A rectangular swizzled texture is a linear run of swizzled square tiles whose side is min(w, h). */
+static INLINE unsigned
+nv04_swizzle_bits(unsigned x, unsigned y, unsigned w, unsigned h)
+{
+	const unsigned side = MIN2(w, h), in_tile = side - 1;
+	const unsigned tile_base = ((x | y) & ~in_tile) * side;
+	return tile_base | nv04_swizzle_bits_square(x & in_tile, y & in_tile);
+}
+
+/*
+ * Copy a w x h rectangle from a linear surface into a swizzled one, using
+ * the SIFM object with the swizzled-surface object as its target.  Areas
+ * larger than max_w x max_h are split into chunks.  Always returns 0.
+ */
+static int
+nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
+			  struct pipe_surface *dst, int dx, int dy,
+			  struct pipe_surface *src, int sx, int sy,
+			  int w, int h)
+{
+	struct nouveau_channel *chan = ctx->swzsurf->channel;
+	struct nouveau_grobj *swzsurf = ctx->swzsurf;
+	struct nouveau_grobj *sifm = ctx->sifm;
+	struct nouveau_bo *src_bo = ctx->buf(src);
+	struct nouveau_bo *dst_bo = ctx->buf(dst);
+	const unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
+	/* Max width & height may not be the same on all HW, but must be POT */
+	const unsigned max_w = 1024, max_h = 1024;
+	unsigned sub_w = w > max_w ? max_w : w;
+	unsigned sub_h = h > max_h ? max_h : h;
+	unsigned x, y;
+
+	/* Swizzled surfaces must be POT  */
+	assert(util_is_pot(dst->width) && util_is_pot(dst->height));
+	/* If area is too large to copy in one shot we must copy it in POT chunks to meet alignment requirements */
+	assert(sub_w == w || util_is_pot(sub_w));
+	assert(sub_h == h || util_is_pot(sub_h));
+
+	MARK_RING (chan, 8 + ((w+sub_w)/sub_w)*((h+sub_h)/sub_h)*17, 2 +
+			 ((w+sub_w)/sub_w)*((h+sub_h)/sub_h)*2);
+
+	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1);
+	OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_FORMAT, 1);
+	OUT_RING  (chan, nv04_surface_format(dst->format) |
+	                 log2i(dst->width) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT |
+	                 log2i(dst->height) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT);
+
+	BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1);
+	OUT_RELOCo(chan, src_bo, NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1);
+	OUT_RING  (chan, swzsurf->handle);
+
+	for (y = 0; y < h; y += sub_h) {
+	  sub_h = MIN2(sub_h, h - y);
+	  /* Full chunk width again: the previous row's last chunk may have shrunk sub_w below what MARK_RING assumed. */
+	  sub_w = w > max_w ? max_w : w;
+	  for (x = 0; x < w; x += sub_w) {
+	    sub_w = MIN2(sub_w, w - x);
+	    assert(!(dst->offset & 63));
+
+	    BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
+	    OUT_RELOCl(chan, dst_bo, dst->offset,
+                             NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	    BEGIN_RING(chan, sifm, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
+	    OUT_RING  (chan, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
+	    OUT_RING  (chan, nv04_scaled_image_format(src->format));
+	    OUT_RING  (chan, NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
+	    OUT_RING  (chan, (x + dx) | ((y + dy) << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT));
+	    OUT_RING  (chan, sub_h << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | sub_w);
+	    OUT_RING  (chan, (x + dx) | ((y + dy) << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT));
+	    OUT_RING  (chan, sub_h << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | sub_w);
+	    OUT_RING  (chan, 1 << 20);
+	    OUT_RING  (chan, 1 << 20);
+
+	    BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_SIZE, 4);
+	    OUT_RING  (chan, sub_h << NV03_SCALED_IMAGE_FROM_MEMORY_SIZE_H_SHIFT | sub_w);
+	    OUT_RING  (chan, src_pitch |
+			     NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER |
+			     NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE);
+	    OUT_RELOCl(chan, src_bo, src->offset + (sy+y) * src_pitch + (sx+x) * util_format_get_blocksize(src->texture->format),
+                             NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	    OUT_RING  (chan, 0);
+	  }
+	}
+
+	return 0;
+}
+/* Copy a w x h rectangle between two surfaces addressed linearly by pitch, using the M2MF object in runs of at most 2047 lines; always returns 0. */
+static int
+nv04_surface_copy_m2mf(struct nv04_surface_2d *ctx,
+		       struct pipe_surface *dst, int dx, int dy,
+		       struct pipe_surface *src, int sx, int sy, int w, int h)
+{
+	struct nouveau_channel *chan = ctx->m2mf->channel;
+	struct nouveau_grobj *m2mf = ctx->m2mf;
+	struct nouveau_bo *src_bo = ctx->buf(src);
+	struct nouveau_bo *dst_bo = ctx->buf(dst);
+	unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
+	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
+	unsigned dst_offset = dst->offset + dy * dst_pitch +
+	                      dx * util_format_get_blocksize(dst->texture->format);
+	unsigned src_offset = src->offset + sy * src_pitch +
+	                      sx * util_format_get_blocksize(src->texture->format);
+
+	MARK_RING (chan, 3 + ((h / 2047) + 1) * 9, 2 + ((h / 2047) + 1) * 2);
+	BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2);
+	OUT_RELOCo(chan, src_bo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCo(chan, dst_bo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	while (h) {
+		int count = (h > 2047) ? 2047 : h; /* lines handled by this command */
+
+		BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
+		OUT_RELOCl(chan, src_bo, src_offset,
+			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+		OUT_RELOCl(chan, dst_bo, dst_offset,
+			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR);
+		OUT_RING  (chan, src_pitch);
+		OUT_RING  (chan, dst_pitch);
+		OUT_RING  (chan, w * util_format_get_blocksize(src->texture->format)); /* bytes per line, from the source format's block size */
+		OUT_RING  (chan, count);
+		OUT_RING  (chan, 0x0101);
+		OUT_RING  (chan, 0);
+
+		/* Advance both offsets past the lines just queued. */
+		h -= count;
+		src_offset += src_pitch * count;
+		dst_offset += dst_pitch * count;
+	}
+
+	return 0;
+}
+/* Copy a w x h rectangle with the IMAGE_BLIT object through surf2d; returns 1 if dst's format has no 2D surface mapping, else 0.  No caller is visible in this part of the file. */
+static int
+nv04_surface_copy_blit(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
+		       int dx, int dy, struct pipe_surface *src, int sx, int sy,
+		       int w, int h)
+{
+	struct nouveau_channel *chan = ctx->surf2d->channel;
+	struct nouveau_grobj *surf2d = ctx->surf2d;
+	struct nouveau_grobj *blit = ctx->blit;
+	struct nouveau_bo *src_bo = ctx->buf(src);
+	struct nouveau_bo *dst_bo = ctx->buf(dst);
+	unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
+	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
+	int format;
+
+	format = nv04_surface_format(dst->format);
+	if (format < 0)
+		return 1;
+
+	MARK_RING (chan, 12, 4);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RELOCo(chan, src_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
+	OUT_RING  (chan, format);
+	OUT_RING  (chan, (dst_pitch << 16) | src_pitch);
+	OUT_RELOCl(chan, src_bo, src->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	BEGIN_RING(chan, blit, 0x0300, 3);
+	OUT_RING  (chan, (sy << 16) | sx); /* source point: y in the high half, x in the low half */
+	OUT_RING  (chan, (dy << 16) | dx); /* destination point, same packing */
+	OUT_RING  (chan, ( h << 16) |  w); /* size: height high, width low */
+
+	return 0;
+}
+/* Copy entry point: SIFM swizzling path for linear -> swizzled copies larger than one row/column, M2MF for the rest. */
+static void
+nv04_surface_copy(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
+		  int dx, int dy, struct pipe_surface *src, int sx, int sy,
+		  int w, int h)
+{
+	const int from_linear = src->texture->flags & NVFX_RESOURCE_FLAG_LINEAR;
+	const int to_linear = dst->texture->flags & NVFX_RESOURCE_FLAG_LINEAR;
+	const int needs_swizzle = from_linear && !to_linear && w > 1 && h > 1;
+
+	assert(src->format == dst->format);
+
+	if (!needs_swizzle) {
+		/* M2MF is used rather than the blitter because it always works;
+		 * any performance drop is likely insignificant and dwarfed by the
+		 * current buffer management problems.
+		 */
+		nv04_surface_copy_m2mf(ctx, dst, dx, dy, src, sx, sy, w, h);
+		return;
+	}
+	/* Linear source, swizzled destination: swizzle while transferring. */
+	nv04_surface_copy_swizzle(ctx, dst, dx, dy, src, sx, sy, w, h);
+}
+/* Fill a w x h rectangle at (dx, dy) of dst with 'value' using the GDI rectangle object drawing through surf2d; asserts that dst's format maps to both object formats. */
+static void
+nv04_surface_fill(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
+		  int dx, int dy, int w, int h, unsigned value)
+{
+	struct nouveau_channel *chan = ctx->surf2d->channel;
+	struct nouveau_grobj *surf2d = ctx->surf2d;
+	struct nouveau_grobj *rect = ctx->rect;
+	struct nouveau_bo *dst_bo = ctx->buf(dst);
+	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
+	int cs2d_format, gdirect_format;
+
+	cs2d_format = nv04_surface_format(dst->format);
+	assert(cs2d_format >= 0);
+
+	gdirect_format = nv04_rect_format(dst->format);
+	assert(gdirect_format >= 0);
+
+	MARK_RING (chan, 16, 4);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); /* dst is bound as both source ... */
+	OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); /* ... and destination image */
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
+	OUT_RING  (chan, cs2d_format);
+	OUT_RING  (chan, (dst_pitch << 16) | dst_pitch);
+	OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT, 1);
+	OUT_RING  (chan, gdirect_format);
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR1_A, 1);
+	OUT_RING  (chan, value);
+	BEGIN_RING(chan, rect,
+		   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(0), 2);
+	OUT_RING  (chan, (dx << 16) | dy); /* x is packed in the high half here, the reverse of nv04_surface_copy_blit() */
+	OUT_RING  (chan, ( w << 16) |  h); /* likewise width high, height low */
+}
+/* Release every object owned by *pctx, free the context and reset *pctx to NULL; a NULL pctx or *pctx is a no-op. */
+void
+nv04_surface_2d_takedown(struct nv04_surface_2d **pctx)
+{
+	struct nv04_surface_2d *eng2d = pctx ? *pctx : NULL;
+
+	if (eng2d == NULL)
+		return;
+
+	/* Clear the caller's pointer before tearing anything down. */
+	*pctx = NULL;
+
+	nouveau_notifier_free(&eng2d->ntfy);
+	nouveau_grobj_free(&eng2d->m2mf);
+	nouveau_grobj_free(&eng2d->surf2d);
+	nouveau_grobj_free(&eng2d->swzsurf);
+	nouveau_grobj_free(&eng2d->rect);
+	nouveau_grobj_free(&eng2d->blit);
+	nouveau_grobj_free(&eng2d->sifm);
+	FREE(eng2d);
+}
+
+/*
+ * Create the 2D engine context for a screen: allocates a notifier plus the
+ * M2MF, 2D surface, blit, GDI rectangle, swizzled surface and SIFM objects
+ * on the screen's channel and emits their one-time setup.
+ *
+ * Returns the new context, or NULL if an allocation fails or the chipset
+ * family has no known swizzled-surface class.  On failure everything that
+ * was already allocated is released via nv04_surface_2d_takedown().
+ * NOTE: ctx->buf is not set here although the copy/fill paths call it.
+ */
+struct nv04_surface_2d *
+nv04_surface_2d_init(struct nouveau_screen *screen)
+{
+	struct nv04_surface_2d *ctx = CALLOC_STRUCT(nv04_surface_2d);
+	struct nouveau_channel *chan = screen->channel;
+	unsigned handle = 0x88000000, class;
+	int ret;
+
+	if (!ctx)
+		return NULL;
+
+	ret = nouveau_notifier_alloc(chan, handle++, 1, &ctx->ntfy);
+	if (ret)
+		goto fail;
+
+	ret = nouveau_grobj_alloc(chan, handle++, 0x0039, &ctx->m2mf);
+	if (ret)
+		goto fail;
+
+	BEGIN_RING(chan, ctx->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+
+	if (chan->device->chipset < 0x10)
+		class = NV04_CONTEXT_SURFACES_2D;
+	else
+		class = NV10_CONTEXT_SURFACES_2D;
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->surf2d);
+	if (ret)
+		goto fail;
+
+	BEGIN_RING(chan, ctx->surf2d,
+			 NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->vram->handle);
+
+	if (chan->device->chipset < 0x10)
+		class = NV04_IMAGE_BLIT;
+	else
+		class = NV12_IMAGE_BLIT;
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->blit);
+	if (ret)
+		goto fail;
+
+	BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+	BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_SURFACE, 1);
+	OUT_RING  (chan, ctx->surf2d->handle);
+	BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_OPERATION, 1);
+	OUT_RING  (chan, NV01_IMAGE_BLIT_OPERATION_SRCCOPY);
+
+	ret = nouveau_grobj_alloc(chan, handle++, NV04_GDI_RECTANGLE_TEXT,
+				  &ctx->rect);
+	if (ret)
+		goto fail;
+
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_SURFACE, 1);
+	OUT_RING  (chan, ctx->surf2d->handle);
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_OPERATION, 1);
+	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY);
+	BEGIN_RING(chan, ctx->rect,
+			 NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT, 1);
+	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE);
+
+	switch (chan->device->chipset & 0xf0) {
+	case 0x00:
+	case 0x10:
+		class = NV04_SWIZZLED_SURFACE;
+		break;
+	case 0x20:
+		class = NV20_SWIZZLED_SURFACE;
+		break;
+	case 0x30:
+		class = NV30_SWIZZLED_SURFACE;
+		break;
+	case 0x40:
+	case 0x60:
+		class = NV40_SWIZZLED_SURFACE;
+		break;
+	default:
+		/* Unknown chipset family: bail out instead of reusing the blit class left in 'class'. */
+		assert(0);
+		goto fail;
+	}
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->swzsurf);
+	if (ret)
+		goto fail;
+
+	switch (chan->device->chipset & 0xf0) {
+	case 0x10:
+	case 0x20:
+		class = NV10_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	case 0x30:
+		class = NV30_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	case 0x40:
+	case 0x60:
+		class = NV40_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	default:
+		class = NV04_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	}
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->sifm);
+	if (ret)
+		goto fail;
+
+	ctx->copy = nv04_surface_copy;
+	ctx->fill = nv04_surface_fill;
+	return ctx;
+
+fail:
+	nv04_surface_2d_takedown(&ctx);
+	return NULL;
+}
+/* Create a temporary 2D render-target texture shaped like ns and return its surface, with ->backing pointing back at ns. */
+struct nv04_surface*
+nv04_surface_wrap_for_render(struct pipe_screen *pscreen,
+			     struct nv04_surface_2d* eng2d, struct nv04_surface* ns)
+{
+	struct pipe_resource templ;
+	struct pipe_resource* temp_tex;
+	struct nv04_surface* temp_ns;
+	int temp_flags;
+
+	temp_flags = (ns->base.usage |
+		      PIPE_BIND_BLIT_SOURCE |
+		      PIPE_BIND_BLIT_DESTINATION);
+
+	ns->base.usage = (PIPE_BIND_BLIT_SOURCE |
+			 PIPE_BIND_BLIT_DESTINATION); /* replaces ns's previous usage bits */
+
+	memset(&templ, 0, sizeof(templ));
+	templ.format = ns->base.texture->format;
+	templ.target = PIPE_TEXTURE_2D;
+	templ.width0 = ns->base.width;
+	templ.height0 = ns->base.height;
+	templ.depth0 = 1;
+	templ.last_level = 0;
+
+	// TODO: this is probably wrong and we should specifically handle multisampling somehow once it is implemented
+	templ.nr_samples = ns->base.texture->nr_samples;
+
+	templ.bind = ns->base.texture->bind | PIPE_BIND_RENDER_TARGET;
+
+	temp_tex = pscreen->resource_create(pscreen, &templ); /* NOTE(review): neither this nor the surface below is NULL-checked */
+	temp_ns = (struct nv04_surface*)pscreen->get_tex_surface(pscreen, temp_tex, 0, 0, 0, temp_flags);
+	temp_ns->backing = ns;
+
+	if(ns->base.usage & PIPE_BIND_BLIT_SOURCE) /* always true: the bit was set unconditionally above */
+		eng2d->copy(eng2d, &temp_ns->backing->base, /* NOTE(review): backing is ns, so dst and src are the same surface - confirm intent */
+			    0, 0, &ns->base,
+			    0, 0, ns->base.width, ns->base.height);
+
+	return temp_ns;
+}
diff --git a/src/gallium/drivers/nvfx/nv04_surface_2d.h b/src/gallium/drivers/nvfx/nv04_surface_2d.h
new file mode 100644
index 0000000000..2123c3ed08
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv04_surface_2d.h
@@ -0,0 +1,43 @@
+#ifndef __NV04_SURFACE_2D_H__
+#define __NV04_SURFACE_2D_H__
+
+#include "pipe/p_state.h"
+
+struct nouveau_screen;
+/* pipe_surface extended with the extra fields the 2D engine helpers need. */
+struct nv04_surface {
+	struct pipe_surface base;
+	unsigned pitch;	/* multiplied by a row index in the copy paths' offset math */
+	struct nv04_surface* backing;	/* set by nv04_surface_wrap_for_render() to the wrapped surface */
+};
+/* State of the 2D engine helpers: one notifier plus the graphics objects the copy/fill paths submit to. */
+struct nv04_surface_2d {
+	struct nouveau_notifier *ntfy;
+	struct nouveau_grobj *surf2d;
+	struct nouveau_grobj *swzsurf;
+	struct nouveau_grobj *m2mf;
+	struct nouveau_grobj *rect;
+	struct nouveau_grobj *blit;
+	struct nouveau_grobj *sifm;
+
+	struct nouveau_bo *(*buf)(struct pipe_surface *);	/* maps a surface to its buffer object; not set by nv04_surface_2d_init() */
+
+	void (*copy)(struct nv04_surface_2d *, struct pipe_surface *dst,
+		     int dx, int dy, struct pipe_surface *src, int sx, int sy,
+		     int w, int h);
+	void (*fill)(struct nv04_surface_2d *, struct pipe_surface *dst,
+		     int dx, int dy, int w, int h, unsigned value);
+};
+
+struct nv04_surface_2d *
+nv04_surface_2d_init(struct nouveau_screen *screen);
+
+void
+nv04_surface_2d_takedown(struct nv04_surface_2d **);
+
+struct nv04_surface*
+nv04_surface_wrap_for_render(struct pipe_screen *pscreen, struct nv04_surface_2d* eng2d, struct nv04_surface* ns);
+
+#define NVFX_RESOURCE_FLAG_LINEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nv30_fragtex.c b/src/gallium/drivers/nvfx/nv30_fragtex.c
new file mode 100644
index 0000000000..dec073ac90
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv30_fragtex.c
@@ -0,0 +1,149 @@
+#include "util/u_format.h"
+
+#include "nvfx_context.h"
+#include "nouveau/nouveau_util.h"
+#include "nvfx_tex.h"
+#include "nvfx_resource.h"
+
+/*
+ * NV30 part of sampler CSO setup: ORs the anisotropy level and min/max LOD
+ * into ps->en and the clamped LOD bias into ps->filt.  'pipe' is unused.
+ */
+void
+nv30_sampler_state_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_state *ps,
+			  const struct pipe_sampler_state *cso)
+{
+	float limit;
+
+	if (cso->max_anisotropy >= 8)
+		ps->en |= NV34TCL_TX_ENABLE_ANISO_8X;
+	else if (cso->max_anisotropy >= 4)
+		ps->en |= NV34TCL_TX_ENABLE_ANISO_4X;
+	else if (cso->max_anisotropy >= 2)
+		ps->en |= NV34TCL_TX_ENABLE_ANISO_2X;
+
+	/* Encode the clamped bias, not the raw one: out-of-range values would wrap in the 13-bit field. */
+	limit = CLAMP(cso->lod_bias, -16.0, 15.0);
+	ps->filt |= (int)(limit * 256.0) & 0x1fff;
+
+	limit = CLAMP(cso->max_lod, 0.0, 15.0);
+	ps->en |= (int)(limit) << 14 /*NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT*/;
+
+	limit = CLAMP(cso->min_lod, 0.0, 15.0);
+	ps->en |= (int)(limit) << 26 /*NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT*/;
+}
+/* Build one nv30_texture_format initializer: pipe format, hw format, then the S0 and S1 swizzle selectors for x/y/z/w. */
+#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w)                        \
+{                                                                              \
+  TRUE,                                                                        \
+  PIPE_FORMAT_##m,                                                             \
+  NV34TCL_TX_FORMAT_FORMAT_##tf,                                               \
+  (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y |           \
+   NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w |           \
+   NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y |           \
+   NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w)            \
+}
+/* One entry of the pipe-format -> NV30 texture format table. */
+struct nv30_texture_format {
+	boolean defined;	/* TRUE for real entries; the lookup stops at the first entry where it is clear */
+	uint	pipe;		/* PIPE_FORMAT_* value this entry matches */
+	int     format;		/* NV34TCL_TX_FORMAT_FORMAT_* value */
+	int     swizzle;	/* OR of the NV34TCL_TX_SWIZZLE_S0_* and S1_* selectors */
+};
+/* Texture formats known to nv30_fragtex_format(); columns are pipe format, hw format, S0 x/y/z/w, S1 x/y/z/w. */
+static struct nv30_texture_format
+nv30_texture_formats[] = {
+	_(B8G8R8X8_UNORM, A8R8G8B8,   S1,   S1,   S1,  ONE, X, Y, Z, W),
+	_(B8G8R8A8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(B5G5R5A1_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(B4G4R4A4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(B5G6R5_UNORM  , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W),
+	_(L8_UNORM      , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X),
+	_(A8_UNORM      , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X),
+	_(I8_UNORM      , L8      ,   S1,   S1,   S1,   S1, X, X, X, X),
+	_(L8A8_UNORM    , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y),
+	_(Z16_UNORM     , R5G6B5  ,   S1,   S1,   S1,  ONE, X, X, X, X),
+	_(S8_USCALED_Z24_UNORM   , A8R8G8B8,   S1,   S1,   S1,  ONE, X, X, X, X),
+	_(DXT1_RGB      , DXT1    ,   S1,   S1,   S1,  ONE, X, Y, Z, W),
+	_(DXT1_RGBA     , DXT1    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(DXT3_RGBA     , DXT3    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(DXT5_RGBA     , DXT5    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	{}, /* terminator: 'defined' is zero */
+};
+
+static struct nv30_texture_format *
+nv30_fragtex_format(uint pipe_format)
+{
+	struct nv30_texture_format *tf = nv30_texture_formats;
+
+	while (tf->defined) {
+		if (tf->pipe == pipe_format)
+			return tf;
+		tf++;
+	}
+
+	NOUVEAU_ERR("unknown texture format %s\n", util_format_name(pipe_format));
+	return NULL;
+}
+
+/* Emit the state of fragment texture unit 'unit' (offset, format, ps->wrap, ps->en, swizzle, ps->filt, width/height, ps->bcol) and record it in nvfx->hw_txf / hw_samplers; emits nothing if the format or target is not handled. */
+void
+nv30_fragtex_set(struct nvfx_context *nvfx, int unit)
+{
+	struct nvfx_sampler_state *ps = nvfx->tex_sampler[unit];
+	struct nvfx_miptree *nv30mt = (struct nvfx_miptree *)nvfx->fragment_sampler_views[unit]->texture;
+	struct pipe_resource *pt = &nv30mt->base.base;
+	struct nouveau_bo *bo = nv30mt->base.bo;
+	struct nv30_texture_format *tf;
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	uint32_t txf, txs;
+	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+
+	tf = nv30_fragtex_format(pt->format);
+	if (!tf)
+		return;
+
+	/* Assemble the format word: hw format, mipmap flag and log2 of each base dimension. */
+	txf  = tf->format;
+	txf |= ((pt->last_level>0) ? NV34TCL_TX_FORMAT_MIPMAP : 0);
+	txf |= log2i(pt->width0) << NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT;
+	txf |= log2i(pt->height0) << NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT;
+	txf |= log2i(pt->depth0) << NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT;
+	txf |= NV34TCL_TX_FORMAT_NO_BORDER | 0x10000;
+
+	switch (pt->target) {
+	case PIPE_TEXTURE_CUBE:
+		txf |= NV34TCL_TX_FORMAT_CUBIC;
+		/* fall-through */
+	case PIPE_TEXTURE_2D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_2D;
+		break;
+	case PIPE_TEXTURE_3D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_3D;
+		break;
+	case PIPE_TEXTURE_1D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_1D;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown target %d\n", pt->target);
+		return;
+	}
+
+	txs = tf->swizzle;
+
+	MARK_RING(chan, 9, 2); /* matches the 9 OUT_* calls below, 2 of them relocations */
+	OUT_RING(chan, RING_3D(NV34TCL_TX_OFFSET(unit), 8));
+	OUT_RELOC(chan, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
+	OUT_RELOC(chan, bo, txf, tex_flags | NOUVEAU_BO_OR,
+		      NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
+	OUT_RING(chan, ps->wrap);
+	OUT_RING(chan, NV34TCL_TX_ENABLE_ENABLE | ps->en);
+	OUT_RING(chan, txs);
+	OUT_RING(chan, ps->filt | 0x2000 /*voodoo*/);
+	OUT_RING(chan, (pt->width0 << NV34TCL_TX_NPOT_SIZE_W_SHIFT) |
+		       pt->height0);
+	OUT_RING(chan, ps->bcol);
+
+	nvfx->hw_txf[unit] = txf; /* remember the format word emitted for this unit */
+	nvfx->hw_samplers |= (1 << unit); /* flag the unit in the hw_samplers bitmask */
+}
diff --git a/src/gallium/drivers/nvfx/nv30_vertprog.h b/src/gallium/drivers/nvfx/nv30_vertprog.h
new file mode 100644
index 0000000000..ec0444c07f
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv30_vertprog.h
@@ -0,0 +1,169 @@
+#ifndef __NV30_SHADER_H__
+#define __NV30_SHADER_H__
+
+/* Vertex programs instruction set
+ *
+ * 128bit opcodes, split into 4 32-bit ones for ease of use.
+ *
+ * Non-native instructions
+ *   ABS - MOV + NV40_VP_INST0_DEST_ABS
+ *   POW - EX2 + MUL + LG2
+ *   SUB - ADD, second source negated
+ *   SWZ - MOV
+ *   XPD -
+ *
+ * Register access
+ *   - Only one INPUT can be accessed per-instruction (move extras into TEMPs)
+ *   - Only one CONST can be accessed per-instruction (move extras into TEMPs)
+ *
+ * Relative Addressing
+ *   According to the value returned for
+ *   MAX_PROGRAM_NATIVE_ADDRESS_REGISTERS_ARB
+ *
+ *   there are only two address registers available.  The destination in the
+ *   ARL instruction is set to TEMP <n> (The temp isn't actually written).
+ *
+ *   When using vanilla ARB_v_p, the proprietary driver will squish both the
+ *   available ADDRESS regs into the first hardware reg in the X and Y
+ *   components.
+ *
+ *   To use an address reg as an index into consts, the CONST_SRC is set to
+ *   (const_base + offset) and INDEX_CONST is set.
+ *
+ *   To access the second address reg use ADDR_REG_SELECT_1. A particular
+ *   component of the address regs is selected with ADDR_SWZ.
+ *
+ *   Only one address register can be accessed per instruction.
+ *
+ * Conditional execution (see NV_vertex_program{2,3} for details):
+ *   Enabled per instruction by setting COND_TEST_ENABLE and selecting, with
+ *   COND_{FL,LT,...}, the condition that lets the test pass.  The values in
+ *   the condition register can be swizzled, which allows testing against an
+ *   individual component.
+ *
+ * Branching:
+ *
+ *   The BRA/CAL instructions seem to follow a slightly different opcode
+ *   layout.  The destination instruction ID (IADDR) overlaps a source field.
+ *   Instruction ID's seem to be numbered based on the UPLOAD_FROM_ID FIFO
+ *   command, and is incremented automatically on each UPLOAD_INST FIFO
+ *   command.
+ *
+ *   Conditional branching is achieved by using the condition tests described
+ *   above.  There don't appear to be dedicated looping instructions, but
+ *   loops can be built from a temp reg + conditional branching.
+ *
+ *   Subroutines may be uploaded before the main program itself, but the first
+ *   executed instruction is determined by the PROGRAM_START_ID FIFO command.
+ *
+ */
+
+/* DWORD 0 */
+
+#define NV30_VP_INST_ADDR_REG_SELECT_1        (1 << 24)
+#define NV30_VP_INST_SRC2_ABS           (1 << 23) /* guess */
+#define NV30_VP_INST_SRC1_ABS           (1 << 22) /* guess */
+#define NV30_VP_INST_SRC0_ABS           (1 << 21) /* guess */
+#define NV30_VP_INST_VEC_RESULT         (1 << 20)
+#define NV30_VP_INST_DEST_TEMP_ID_SHIFT        16
+#define NV30_VP_INST_DEST_TEMP_ID_MASK        (0x0F << 16)
+#define NV30_VP_INST_COND_UPDATE_ENABLE        (1<<15)
+#define NV30_VP_INST_VEC_DEST_TEMP_MASK      (0xF << 16)
+#define NV30_VP_INST_COND_TEST_ENABLE        (1<<14)
+#define NV30_VP_INST_COND_SHIFT          11
+#define NV30_VP_INST_COND_MASK          (0x07 << 11)
+#define NV30_VP_INST_COND_SWZ_X_SHIFT        9
+#define NV30_VP_INST_COND_SWZ_X_MASK        (0x03 <<  9)
+#define NV30_VP_INST_COND_SWZ_Y_SHIFT        7
+#define NV30_VP_INST_COND_SWZ_Y_MASK        (0x03 <<  7)
+#define NV30_VP_INST_COND_SWZ_Z_SHIFT        5
+#define NV30_VP_INST_COND_SWZ_Z_MASK        (0x03 <<  5)
+#define NV30_VP_INST_COND_SWZ_W_SHIFT        3
+#define NV30_VP_INST_COND_SWZ_W_MASK        (0x03 <<  3)
+#define NV30_VP_INST_COND_SWZ_ALL_SHIFT        3
+#define NV30_VP_INST_COND_SWZ_ALL_MASK        (0xFF <<  3)
+#define NV30_VP_INST_ADDR_SWZ_SHIFT        1
+#define NV30_VP_INST_ADDR_SWZ_MASK        (0x03 <<  1)
+#define NV30_VP_INST_SCA_OPCODEH_SHIFT        0
+#define NV30_VP_INST_SCA_OPCODEH_MASK        (0x01 <<  0)
+
+/* DWORD 1 */
+#define NV30_VP_INST_SCA_OPCODEL_SHIFT        28
+#define NV30_VP_INST_SCA_OPCODEL_MASK        (0x0F << 28)
+#define NV30_VP_INST_VEC_OPCODE_SHIFT        23
+#define NV30_VP_INST_VEC_OPCODE_MASK        (0x1F << 23)
+#define NV30_VP_INST_CONST_SRC_SHIFT        14
+#define NV30_VP_INST_CONST_SRC_MASK        (0xFF << 14)
+#define NV30_VP_INST_INPUT_SRC_SHIFT        9    /*NV20*/
+#define NV30_VP_INST_INPUT_SRC_MASK        (0x0F <<  9)  /*NV20*/
+#define NV30_VP_INST_SRC0H_SHIFT        0    /*NV20*/
+#define NV30_VP_INST_SRC0H_MASK          (0x1FF << 0)  /*NV20*/
+
+/* Please note: the IADDR fields overlap other fields because they are used
+ * only for branch instructions.  See Branching: label above
+ *
+ * DWORD 2
+ */
+#define NV30_VP_INST_SRC0L_SHIFT        26    /*NV20*/
+#define NV30_VP_INST_SRC0L_MASK         (0x3F  <<26)  /* NV30_VP_SRC0_LOW_MASK << 26 */
+#define NV30_VP_INST_SRC1_SHIFT         11    /*NV20*/
+#define NV30_VP_INST_SRC1_MASK          (0x7FFF<<11)  /*NV20*/
+#define NV30_VP_INST_SRC2H_SHIFT        0    /*NV20*/
+#define NV30_VP_INST_SRC2H_MASK          (0x7FF << 0)  /* NV30_VP_SRC2_HIGH_MASK >> 4*/
+#define NV30_VP_INST_IADDR_SHIFT        2
+#define NV30_VP_INST_IADDR_MASK          (0xF <<  28)   /* NV30_VP_SRC2_LOW_MASK << 28 */
+
+/* DWORD 3 */
+#define NV30_VP_INST_SRC2L_SHIFT        28    /*NV20*/
+#define NV30_VP_INST_SRC2L_MASK          (0x0F  <<28)  /*NV20*/
+#define NV30_VP_INST_STEMP_WRITEMASK_SHIFT      24
+#define NV30_VP_INST_STEMP_WRITEMASK_MASK      (0x0F << 24)
+#define NV30_VP_INST_VTEMP_WRITEMASK_SHIFT      20
+#define NV30_VP_INST_VTEMP_WRITEMASK_MASK      (0x0F << 20)
+#define NV30_VP_INST_SDEST_WRITEMASK_SHIFT      16
+#define NV30_VP_INST_SDEST_WRITEMASK_MASK      (0x0F << 16)
+#define NV30_VP_INST_VDEST_WRITEMASK_SHIFT      12    /*NV20*/
+#define NV30_VP_INST_VDEST_WRITEMASK_MASK      (0x0F << 12)  /*NV20*/
+#define NV30_VP_INST_DEST_SHIFT        2
+#define NV30_VP_INST_DEST_MASK        (0x0F <<  2)
+#  define NV30_VP_INST_DEST_POS  0
+#  define NV30_VP_INST_DEST_BFC0  1
+#  define NV30_VP_INST_DEST_BFC1  2
+#  define NV30_VP_INST_DEST_COL0  3
+#  define NV30_VP_INST_DEST_COL1  4
+#  define NV30_VP_INST_DEST_FOGC  5
+#  define NV30_VP_INST_DEST_PSZ   6
+#  define NV30_VP_INST_DEST_TC(n)  (8+n)
+
+/* Useful to split the source selection regs into their pieces */
+#define NV30_VP_SRC0_HIGH_SHIFT                                                6
+#define NV30_VP_SRC0_HIGH_MASK                                        0x00007FC0
+#define NV30_VP_SRC0_LOW_MASK                                         0x0000003F
+#define NV30_VP_SRC2_HIGH_SHIFT                                                4
+#define NV30_VP_SRC2_HIGH_MASK                                        0x00007FF0
+#define NV30_VP_SRC2_LOW_MASK                                         0x0000000F
+
+
+/* Source-register definition - matches NV20 exactly */
+#define NV30_VP_SRC_NEGATE          (1<<14)
+#define NV30_VP_SRC_SWZ_X_SHIFT        12
+#define NV30_VP_SRC_REG_SWZ_X_MASK        (0x03  <<12)
+#define NV30_VP_SRC_SWZ_Y_SHIFT        10
+#define NV30_VP_SRC_REG_SWZ_Y_MASK        (0x03  <<10)
+#define NV30_VP_SRC_SWZ_Z_SHIFT        8
+#define NV30_VP_SRC_REG_SWZ_Z_MASK        (0x03  << 8)
+#define NV30_VP_SRC_SWZ_W_SHIFT        6
+#define NV30_VP_SRC_REG_SWZ_W_MASK        (0x03  << 6)
+#define NV30_VP_SRC_REG_SWZ_ALL_SHIFT        6
+#define NV30_VP_SRC_REG_SWZ_ALL_MASK        (0xFF  << 6)
+#define NV30_VP_SRC_TEMP_SRC_SHIFT        2
+#define NV30_VP_SRC_REG_TEMP_ID_MASK        (0x0F  << 0)
+#define NV30_VP_SRC_REG_TYPE_SHIFT        0
+#define NV30_VP_SRC_REG_TYPE_MASK        (0x03  << 0)
+#define NV30_VP_SRC_REG_TYPE_TEMP  1
+#define NV30_VP_SRC_REG_TYPE_INPUT  2
+#define NV30_VP_SRC_REG_TYPE_CONST  3 /* guess */
+
+#include "nvfx_shader.h"
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nv40_fragtex.c b/src/gallium/drivers/nvfx/nv40_fragtex.c
new file mode 100644
index 0000000000..0068b1ba54
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv40_fragtex.c
@@ -0,0 +1,176 @@
+#include "util/u_format.h"
+#include "nvfx_context.h"
+#include "nvfx_tex.h"
+#include "nvfx_resource.h"
+
+/* Fills in the NV40-specific sampler bits of 'ps' from 'cso': the anisotropy
+ * level (ps->en, plus one undocumented ps->wrap bit), the LOD bias (ps->filt)
+ * and the max/min LOD limits (ps->en).  Bits are only OR'ed in, so whatever
+ * the caller already stored in 'ps' is kept.  'pipe' is not used. */
+void
+nv40_sampler_state_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_state *ps,
+			  const struct pipe_sampler_state *cso)
+{
+	float limit;
+
+	if (cso->max_anisotropy >= 2) {
+		/* no idea, binary driver sets it, works without it.. meh.. */
+		ps->wrap |= (1 << 5);
+		/* pick the highest hardware level that does not exceed the request */
+		if (cso->max_anisotropy >= 16) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_16X;
+		} else if (cso->max_anisotropy >= 12) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_12X;
+		} else if (cso->max_anisotropy >= 10) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_10X;
+		} else if (cso->max_anisotropy >= 8) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_8X;
+		} else if (cso->max_anisotropy >= 6) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_6X;
+		} else if (cso->max_anisotropy >= 4) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_4X;
+		} else {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_2X;
+		}
+	}
+
+	/* LOD values are scaled by 256 before being packed.  The bias has to be
+	 * clamped first: it is masked to 13 bits, so an out-of-range value
+	 * would silently wrap instead of saturating. */
+	limit = CLAMP(cso->lod_bias, -16.0, 15.0);
+	ps->filt |= (int)(limit * 256.0) & 0x1fff;
+
+	limit = CLAMP(cso->max_lod, 0.0, 15.0);
+	ps->en |= (int)(limit * 256.0) << 7;
+
+	limit = CLAMP(cso->min_lod, 0.0, 15.0);
+	ps->en |= (int)(limit * 256.0) << 19;
+}
+/* One nv40_texture_formats[] row: pipe format, hw format, S0 (ts0*) and S1 (ts1*) swizzle selectors, per-channel sign flags (sx..sw). */
+#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sx,sy,sz,sw)            \
+{                                                                              \
+  TRUE,                                                                        \
+  PIPE_FORMAT_##m,                                                             \
+  NV40TCL_TEX_FORMAT_FORMAT_##tf,                                              \
+  (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y |         \
+   NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w |         \
+   NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y |         \
+   NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w),         \
+  ((NV34TCL_TX_FILTER_SIGNED_RED*sx) | (NV34TCL_TX_FILTER_SIGNED_GREEN*sy) |       \
+   (NV34TCL_TX_FILTER_SIGNED_BLUE*sz) | (NV34TCL_TX_FILTER_SIGNED_ALPHA*sw))       \
+}
+
+struct nv40_texture_format { /* one gallium format and the values nv40_fragtex_set() programs for it */
+	boolean defined;	/* TRUE for real rows; clear on the table terminator */
+	uint	pipe;		/* PIPE_FORMAT_* value this row matches */
+	int     format;		/* NV40TCL_TEX_FORMAT_FORMAT_* value */
+	int     swizzle;	/* OR of NV34TCL_TX_SWIZZLE_S0_* and _S1_* selectors */
+	int     sign;		/* OR of NV34TCL_TX_FILTER_SIGNED_* bits */
+};
+
+static struct nv40_texture_format
+nv40_texture_formats[] = { /* columns: pipe format, hw format, S0 x/y/z/w, S1 x/y/z/w, sign x/y/z/w (see the _() macro) */
+	_(B8G8R8X8_UNORM, A8R8G8B8,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
+	_(B8G8R8A8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(B5G5R5A1_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(B4G4R4A4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(B5G6R5_UNORM  , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
+	_(L8_UNORM      , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
+	_(A8_UNORM      , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X, 0, 0, 0, 0),
+	_(R16_SNORM     , A16     , ZERO, ZERO,   S1,  ONE, X, X, X, Y, 1, 1, 1, 1),
+	_(I8_UNORM      , L8      ,   S1,   S1,   S1,   S1, X, X, X, X, 0, 0, 0, 0),
+	_(L8A8_UNORM    , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y, 0, 0, 0, 0),
+	_(Z16_UNORM     , Z16     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
+	_(S8_USCALED_Z24_UNORM   , Z24     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
+	_(DXT1_RGB      , DXT1    ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
+	_(DXT1_RGBA     , DXT1    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(DXT3_RGBA     , DXT3    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(DXT5_RGBA     , DXT5    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	{}, /* terminator: 'defined' is zero here, which ends the scan in nv40_fragtex_format() */
+};
+
+/* Returns the nv40_texture_formats[] entry matching pipe_format; logs an error and returns NULL if none does. */
+static struct nv40_texture_format *
+nv40_fragtex_format(uint pipe_format)
+{
+	struct nv40_texture_format *entry;
+	/* the scan stops at the terminator, whose 'defined' flag is clear */
+	for (entry = nv40_texture_formats; entry->defined; entry++) {
+		if (entry->pipe == pipe_format)
+			return entry;
+	}
+
+	NOUVEAU_ERR("unknown texture format %s\n", util_format_name(pipe_format));
+	return NULL;
+}
+
+
+/* Emits the hardware state of texture unit 'unit' from its bound sampler
+ * state and sampler view, then records it in hw_txf[] and hw_samplers.
+ * Nothing is emitted for an unknown texture format or target. */
+void
+nv40_fragtex_set(struct nvfx_context *nvfx, int unit)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nvfx_sampler_state *ps = nvfx->tex_sampler[unit];
+	struct nvfx_miptree *nv40mt = (struct nvfx_miptree *)nvfx->fragment_sampler_views[unit]->texture;
+	struct nouveau_bo *bo = nv40mt->base.bo;
+	struct pipe_resource *pt = &nv40mt->base.base;
+	struct nv40_texture_format *tf;
+	uint32_t txf, txp;
+	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+
+	tf = nv40_fragtex_format(pt->format);
+	if (!tf) {
+		/* assert() compiles away under NDEBUG, so bail out explicitly. */
+		assert(0);
+		return;
+	}
+
+	txf  = ps->fmt;
+	txf |= tf->format | 0x8000;
+	txf |= ((pt->last_level + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT);
+	txf |= NV34TCL_TX_FORMAT_NO_BORDER; /* XXX: unconditional for now */
+
+	switch (pt->target) {
+	case PIPE_TEXTURE_CUBE:
+		txf |= NV34TCL_TX_FORMAT_CUBIC;
+		/* fall-through */
+	case PIPE_TEXTURE_2D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_2D;
+		break;
+	case PIPE_TEXTURE_3D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_3D;
+		break;
+	case PIPE_TEXTURE_1D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_1D;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown target %d\n", pt->target);
+		return;
+	}
+
+	txp = 0;
+	if (pt->flags & NVFX_RESOURCE_FLAG_LINEAR) {
+		txp  = nv40mt->level[0].pitch;
+		txf |= NV40TCL_TEX_FORMAT_LINEAR;
+	}
+
+	MARK_RING(chan, 11 + 2 * !unit, 2);
+	OUT_RING(chan, RING_3D(NV34TCL_TX_OFFSET(unit), 8));
+	OUT_RELOC(chan, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
+	OUT_RELOC(chan, bo, txf, tex_flags | NOUVEAU_BO_OR,
+			NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
+	OUT_RING(chan, ps->wrap);
+	OUT_RING(chan, NV40TCL_TEX_ENABLE_ENABLE | ps->en);
+	OUT_RING(chan, tf->swizzle);
+	OUT_RING(chan, ps->filt | tf->sign | 0x2000 /*voodoo*/);
+	OUT_RING(chan, (pt->width0 << NV34TCL_TX_NPOT_SIZE_W_SHIFT) | pt->height0);
+	OUT_RING(chan, ps->bcol);
+	OUT_RING(chan, RING_3D(NV40TCL_TEX_SIZE1(unit), 1));
+	OUT_RING(chan, (pt->depth0 << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp);
+
+	nvfx->hw_txf[unit] = txf;
+	nvfx->hw_samplers |= (1 << unit);
+}
diff --git a/src/gallium/drivers/nvfx/nv40_vertprog.h b/src/gallium/drivers/nvfx/nv40_vertprog.h
new file mode 100644
index 0000000000..7337293bab
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv40_vertprog.h
@@ -0,0 +1,177 @@
+#ifndef __NV40_SHADER_H__
+#define __NV40_SHADER_H__
+
+/* Vertex programs instruction set
+ *
+ * The NV40 instruction set is very similar to NV30.  Most fields are in
+ * a slightly different position in the instruction however.
+ *
+ * Merged instructions
+ *     In some cases it is possible to put two instructions into one opcode
+ *     slot.  The rules for when this is OK are not entirely clear to me yet.
+ *
+ *     There are separate writemasks and dest temp register fields for each
+ *     grouping of instructions.  There is however only one field with the
+ *     ID of a result register.  Writing to temp/result regs is selected by
+ *     setting VEC_RESULT/SCA_RESULT.
+ *
+ * Temporary registers
+ *     The source/dest temp register fields have been extended by 1 bit, to
+ *     give a total of 32 temporary registers.
+ *
+ * Relative Addressing
+ *     NV40 can use an address register to index into vertex attribute regs.
+ *     This is done by putting the offset value into INPUT_SRC and setting
+ *     the INDEX_INPUT flag.
+ *
+ * Conditional execution (see NV_vertex_program{2,3} for details)
+ *     There is a second condition code register on NV40, its use is enabled
+ *     by setting the COND_REG_SELECT_1 flag.
+ *
+ * Texture lookup
+ *     TODO
+ */
+
+/* ---- OPCODE BITS 127:96 / data DWORD 0 --- */
+#define NV40_VP_INST_VEC_RESULT                                        (1 << 30)
+/* uncertain.. */
+#define NV40_VP_INST_COND_UPDATE_ENABLE                        ((1 << 14)|1<<29)
+/* use address reg as index into attribs */
+#define NV40_VP_INST_INDEX_INPUT                                       (1 << 27)
+#define NV40_VP_INST_COND_REG_SELECT_1                                 (1 << 25)
+#define NV40_VP_INST_ADDR_REG_SELECT_1                                 (1 << 24)
+#define NV40_VP_INST_SRC2_ABS                                          (1 << 23)
+#define NV40_VP_INST_SRC1_ABS                                          (1 << 22)
+#define NV40_VP_INST_SRC0_ABS                                          (1 << 21)
+#define NV40_VP_INST_VEC_DEST_TEMP_SHIFT                                      15
+#define NV40_VP_INST_VEC_DEST_TEMP_MASK                             (0x1F << 15)
+#define NV40_VP_INST_COND_TEST_ENABLE                                  (1 << 13)
+#define NV40_VP_INST_COND_SHIFT                                               10
+#define NV40_VP_INST_COND_MASK                                       (0x7 << 10)
+#define NV40_VP_INST_COND_SWZ_X_SHIFT                                          8
+#define NV40_VP_INST_COND_SWZ_X_MASK                                    (3 << 8)
+#define NV40_VP_INST_COND_SWZ_Y_SHIFT                                          6
+#define NV40_VP_INST_COND_SWZ_Y_MASK                                    (3 << 6)
+#define NV40_VP_INST_COND_SWZ_Z_SHIFT                                          4
+#define NV40_VP_INST_COND_SWZ_Z_MASK                                    (3 << 4)
+#define NV40_VP_INST_COND_SWZ_W_SHIFT                                          2
+#define NV40_VP_INST_COND_SWZ_W_MASK                                    (3 << 2)
+#define NV40_VP_INST_COND_SWZ_ALL_SHIFT                                        2
+#define NV40_VP_INST_COND_SWZ_ALL_MASK                               (0xFF << 2)
+#define NV40_VP_INST_ADDR_SWZ_SHIFT                                            0
+#define NV40_VP_INST_ADDR_SWZ_MASK                                   (0x03 << 0)
+#define NV40_VP_INST0_KNOWN ( \
+                NV40_VP_INST_INDEX_INPUT | \
+                NV40_VP_INST_COND_REG_SELECT_1 | \
+                NV40_VP_INST_ADDR_REG_SELECT_1 | \
+                NV40_VP_INST_SRC2_ABS | \
+                NV40_VP_INST_SRC1_ABS | \
+                NV40_VP_INST_SRC0_ABS | \
+                NV40_VP_INST_VEC_DEST_TEMP_MASK | \
+                NV40_VP_INST_COND_TEST_ENABLE | \
+                NV40_VP_INST_COND_MASK | \
+                NV40_VP_INST_COND_SWZ_ALL_MASK | \
+                NV40_VP_INST_ADDR_SWZ_MASK) /* OR of the DWORD 0 fields listed; VEC_RESULT and COND_UPDATE_ENABLE are defined above but not included */
+
+/* ---- OPCODE BITS 95:64 / data DWORD 1 --- */
+#define NV40_VP_INST_VEC_OPCODE_SHIFT                                         22
+#define NV40_VP_INST_VEC_OPCODE_MASK                                (0x1F << 22)
+#define NV40_VP_INST_SCA_OPCODE_SHIFT                                         27
+#define NV40_VP_INST_SCA_OPCODE_MASK                                (0x1F << 27)
+#define NV40_VP_INST_CONST_SRC_SHIFT                                          12
+#define NV40_VP_INST_CONST_SRC_MASK                                 (0xFF << 12)
+#define NV40_VP_INST_INPUT_SRC_SHIFT                                           8
+#define NV40_VP_INST_INPUT_SRC_MASK                                  (0x0F << 8)
+#define NV40_VP_INST_SRC0H_SHIFT                                               0
+#define NV40_VP_INST_SRC0H_MASK                                      (0xFF << 0)
+#define NV40_VP_INST1_KNOWN ( \
+                NV40_VP_INST_VEC_OPCODE_MASK | \
+                NV40_VP_INST_SCA_OPCODE_MASK | \
+                NV40_VP_INST_CONST_SRC_MASK  | \
+                NV40_VP_INST_INPUT_SRC_MASK  | \
+                NV40_VP_INST_SRC0H_MASK \
+                ) /* OR of the five DWORD 1 field masks defined above */
+
+/* ---- OPCODE BITS 63:32 / data DWORD 2 --- */
+#define NV40_VP_INST_SRC0L_SHIFT                                              23
+#define NV40_VP_INST_SRC0L_MASK                                    (0x1FF << 23)
+#define NV40_VP_INST_SRC1_SHIFT                                                6
+#define NV40_VP_INST_SRC1_MASK                                    (0x1FFFF << 6)
+#define NV40_VP_INST_SRC2H_SHIFT                                               0
+#define NV40_VP_INST_SRC2H_MASK                                      (0x3F << 0)
+#define NV40_VP_INST_IADDRH_SHIFT                                              0
+#define NV40_VP_INST_IADDRH_MASK                                     (0x1F << 0)
+
+/* ---- OPCODE BITS 31:0 / data DWORD 3 --- */
+#define NV40_VP_INST_IADDRL_SHIFT                                             29
+#define NV40_VP_INST_IADDRL_MASK                                       (7 << 29)
+#define NV40_VP_INST_SRC2L_SHIFT                                              21
+#define NV40_VP_INST_SRC2L_MASK                                    (0x7FF << 21)
+#define NV40_VP_INST_SCA_WRITEMASK_SHIFT                                      17
+#define NV40_VP_INST_SCA_WRITEMASK_MASK                              (0xF << 17)
+#    define NV40_VP_INST_SCA_WRITEMASK_X                               (1 << 20)
+#    define NV40_VP_INST_SCA_WRITEMASK_Y                               (1 << 19)
+#    define NV40_VP_INST_SCA_WRITEMASK_Z                               (1 << 18)
+#    define NV40_VP_INST_SCA_WRITEMASK_W                               (1 << 17)
+#define NV40_VP_INST_VEC_WRITEMASK_SHIFT                                      13
+#define NV40_VP_INST_VEC_WRITEMASK_MASK                              (0xF << 13)
+#    define NV40_VP_INST_VEC_WRITEMASK_X                               (1 << 16)
+#    define NV40_VP_INST_VEC_WRITEMASK_Y                               (1 << 15)
+#    define NV40_VP_INST_VEC_WRITEMASK_Z                               (1 << 14)
+#    define NV40_VP_INST_VEC_WRITEMASK_W                               (1 << 13)
+#define NV40_VP_INST_SCA_RESULT                                        (1 << 12)
+#define NV40_VP_INST_SCA_DEST_TEMP_SHIFT                                       7
+#define NV40_VP_INST_SCA_DEST_TEMP_MASK                              (0x1F << 7)
+#define NV40_VP_INST_DEST_SHIFT                                                2 /* 5-bit result register id field */
+#define NV40_VP_INST_DEST_MASK                                         (31 << 2)
+#    define NV40_VP_INST_DEST_POS                                              0
+#    define NV40_VP_INST_DEST_COL0                                             1
+#    define NV40_VP_INST_DEST_COL1                                             2
+#    define NV40_VP_INST_DEST_BFC0                                             3
+#    define NV40_VP_INST_DEST_BFC1                                             4
+#    define NV40_VP_INST_DEST_FOGC                                             5
+#    define NV40_VP_INST_DEST_PSZ                                              6
+#    define NV40_VP_INST_DEST_TC0                                              7
+#    define NV40_VP_INST_DEST_TC(n)                                      (7+(n)) /* TC(0) == TC0; (n) parenthesised for expression arguments */
+#    define NV40_VP_INST_DEST_TEMP                                          0x1F
+#define NV40_VP_INST_INDEX_CONST                                        (1 << 1)
+#define NV40_VP_INST3_KNOWN ( \
+                NV40_VP_INST_SRC2L_MASK |\
+                NV40_VP_INST_SCA_WRITEMASK_MASK |\
+                NV40_VP_INST_VEC_WRITEMASK_MASK |\
+                NV40_VP_INST_SCA_DEST_TEMP_MASK |\
+                NV40_VP_INST_DEST_MASK |\
+                NV40_VP_INST_INDEX_CONST) /* OR of the DWORD 3 fields listed; IADDRL_MASK and SCA_RESULT are defined above but not included */
+
+/* Useful to split the source selection regs into their pieces: HIGH_MASK holds the bits from HIGH_SHIFT up, LOW_MASK the bits below */
+#define NV40_VP_SRC0_HIGH_SHIFT                                                9
+#define NV40_VP_SRC0_HIGH_MASK                                        0x0001FE00
+#define NV40_VP_SRC0_LOW_MASK                                         0x000001FF
+#define NV40_VP_SRC2_HIGH_SHIFT                                               11
+#define NV40_VP_SRC2_HIGH_MASK                                        0x0001F800
+#define NV40_VP_SRC2_LOW_MASK                                         0x000007FF
+
+/* Source selection - these are the bits you fill NV40_VP_INST_SRCn with */
+#define NV40_VP_SRC_NEGATE                                             (1 << 16)
+#define NV40_VP_SRC_SWZ_X_SHIFT                                               14
+#define NV40_VP_SRC_SWZ_X_MASK                                         (3 << 14)
+#define NV40_VP_SRC_SWZ_Y_SHIFT                                               12
+#define NV40_VP_SRC_SWZ_Y_MASK                                         (3 << 12)
+#define NV40_VP_SRC_SWZ_Z_SHIFT                                               10
+#define NV40_VP_SRC_SWZ_Z_MASK                                         (3 << 10)
+#define NV40_VP_SRC_SWZ_W_SHIFT                                                8
+#define NV40_VP_SRC_SWZ_W_MASK                                          (3 << 8)
+#define NV40_VP_SRC_SWZ_ALL_SHIFT                                              8
+#define NV40_VP_SRC_SWZ_ALL_MASK                                     (0xFF << 8)
+#define NV40_VP_SRC_TEMP_SRC_SHIFT                                             2
+#define NV40_VP_SRC_TEMP_SRC_MASK                                    (0x1F << 2)
+#define NV40_VP_SRC_REG_TYPE_SHIFT                                             0
+#define NV40_VP_SRC_REG_TYPE_MASK                                       (3 << 0)
+#    define NV40_VP_SRC_REG_TYPE_UNK0                                          0
+#    define NV40_VP_SRC_REG_TYPE_TEMP                                          1
+#    define NV40_VP_SRC_REG_TYPE_INPUT                                         2
+#    define NV40_VP_SRC_REG_TYPE_CONST                                         3
+
+#include "nvfx_shader.h"
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_buffer.c b/src/gallium/drivers/nvfx/nvfx_buffer.c
new file mode 100644
index 0000000000..24e0a0c7f6
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_buffer.c
@@ -0,0 +1,153 @@
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "nouveau/nouveau_screen.h"
+#include "nouveau/nouveau_winsys.h"
+#include "nvfx_resource.h"
+
+
+/* Currently using separate implementations for buffers and textures,
+ * even though gallium has a unified abstraction of these objects.
+ * Eventually these should be combined, and mechanisms like transfers
+ * be adapted to work for both buffer and texture uploads.
+ */
+static void
+nvfx_buffer_destroy(struct pipe_screen *pscreen, struct pipe_resource *presource)
+{
+	/* resource_destroy hook: hand the bo to nouveau_screen_bo_release(), then free the wrapper */
+	struct nvfx_resource *res = nvfx_resource(presource);
+	nouveau_screen_bo_release(pscreen, res->bo);
+	FREE(res);
+}
+
+
+
+
+/* Utility functions for transfer create/destroy are hooked in and
+ * just record the arguments to those functions.
+ */
+static void *
+nvfx_buffer_transfer_map(struct pipe_context *pipe,
+			 struct pipe_transfer *transfer)
+{
+	struct nvfx_resource *res = nvfx_resource(transfer->resource);
+	uint8_t *base;
+
+	/* transfer_map hook: box.x and box.width are passed as the range to
+	 * map, and box.x is added again to the pointer that comes back.
+	 * A NULL result from the mapping call is returned unchanged. */
+	base = nouveau_screen_bo_map_range(pipe->screen, res->bo,
+					   transfer->box.x, transfer->box.width,
+					   nouveau_screen_transfer_flags(transfer->usage));
+	if (base == NULL)
+		return NULL;
+	return base + transfer->box.x;
+}
+
+
+
+static void
+nvfx_buffer_transfer_flush_region(struct pipe_context *pipe,
+				  struct pipe_transfer *transfer,
+				  const struct pipe_box *box)
+{
+	/* flush hook: box->x is added to the transfer's own box.x before the range is flushed */
+	struct nvfx_resource *res = nvfx_resource(transfer->resource);
+
+	nouveau_screen_bo_map_flush_range(pipe->screen, res->bo,
+					  transfer->box.x + box->x, box->width);
+}
+
+static void
+nvfx_buffer_transfer_unmap(struct pipe_context *pipe,
+			   struct pipe_transfer *transfer)
+{
+	/* transfer_unmap hook: unmap the bo behind the transfer's resource */
+	nouveau_screen_bo_unmap(pipe->screen, nvfx_resource(transfer->resource)->bo);
+}
+
+
+
+
+struct u_resource_vtbl nvfx_buffer_vtbl = /* installed by nvfx_buffer_create() and nvfx_user_buffer_create() */
+{
+	u_default_resource_get_handle,      /* get_handle */
+	nvfx_buffer_destroy,		     /* resource_destroy */
+	NULL,			    /* is_resource_referenced */
+	u_default_get_transfer,	     /* get_transfer */
+	u_default_transfer_destroy,	     /* transfer_destroy */
+	nvfx_buffer_transfer_map,	     /* transfer_map */
+	nvfx_buffer_transfer_flush_region,  /* transfer_flush_region */
+	nvfx_buffer_transfer_unmap,	     /* transfer_unmap */
+	u_default_transfer_inline_write   /* transfer_inline_write */
+};
+
+
+
+/* Creates a buffer resource as a copy of 'template' and asks
+ * nouveau_screen_bo_new() for the bo behind it.  Returns NULL when either
+ * the wrapper or the bo cannot be allocated. */
+struct pipe_resource *
+nvfx_buffer_create(struct pipe_screen *pscreen,
+		   const struct pipe_resource *template)
+{
+	struct nvfx_resource *res = CALLOC_STRUCT(nvfx_resource);
+
+	if (res == NULL)
+		return NULL;
+
+	res->base = *template;
+	res->vtbl = &nvfx_buffer_vtbl;
+	pipe_reference_init(&res->base.reference, 1);
+	res->base.screen = pscreen;
+
+	/* usage, bind flags and width0 are read back from the template copy */
+	res->bo = nouveau_screen_bo_new(pscreen,
+					16,
+					res->base._usage,
+					res->base.bind,
+					res->base.width0);
+	if (res->bo != NULL)
+		return &res->base;
+
+	/* no bo: undo the wrapper allocation */
+	FREE(res);
+	return NULL;
+}
+
+
+/* Wraps the memory at 'ptr' ('bytes' bytes) in a buffer resource through
+ * nouveau_screen_bo_user().  'usage' is stored as the resource's bind flags.
+ * Returns NULL when the wrapper or the bo cannot be created. */
+struct pipe_resource *
+nvfx_user_buffer_create(struct pipe_screen *pscreen,
+			void *ptr,
+			unsigned bytes,
+			unsigned usage)
+{
+	struct nvfx_resource *res = CALLOC_STRUCT(nvfx_resource);
+
+	if (res == NULL)
+		return NULL;
+
+	res->vtbl = &nvfx_buffer_vtbl;
+	pipe_reference_init(&res->base.reference, 1);
+	/* described as an R8 resource that is 'bytes' wide and 1x1 otherwise */
+	res->base.screen = pscreen;
+	res->base.format = PIPE_FORMAT_R8_UNORM;
+	res->base._usage = PIPE_USAGE_IMMUTABLE;
+	res->base.bind = usage;
+	res->base.width0 = bytes;
+	res->base.height0 = 1;
+	res->base.depth0 = 1;
+
+	res->bo = nouveau_screen_bo_user(pscreen, ptr, bytes);
+	if (res->bo != NULL)
+		return &res->base;
+
+	FREE(res);
+	return NULL;
+}
+
diff --git a/src/gallium/drivers/nvfx/nvfx_clear.c b/src/gallium/drivers/nvfx/nvfx_clear.c
new file mode 100644
index 0000000000..2be70fcee4
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_clear.c
@@ -0,0 +1,14 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_clear.h"
+
+#include "nvfx_context.h"
+
+void
+nvfx_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
+	   double depth, unsigned stencil)
+{
+	/* clear hook: delegates to util_clear() on the framebuffer state stored in the context */
+	util_clear(pipe, &(nvfx_context(pipe)->framebuffer), buffers, rgba, depth, stencil);
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_context.c b/src/gallium/drivers/nvfx/nvfx_context.c
new file mode 100644
index 0000000000..6d2dc4d5bf
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_context.c
@@ -0,0 +1,84 @@
+#include "draw/draw_context.h"
+#include "pipe/p_defines.h"
+
+#include "nvfx_context.h"
+#include "nvfx_screen.h"
+#include "nvfx_resource.h"
+
+/* Flush hook: when PIPE_FLUSH_TEXTURE_CACHE is set, writes 2 then 1 to method 0x1fd8; always fires the ring and reports no fence. */
+static void
+nvfx_flush(struct pipe_context *pipe, unsigned flags,
+	   struct pipe_fence_handle **fence)
+{
+	struct nvfx_screen *screen = nvfx_context(pipe)->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	unsigned writes = (flags & PIPE_FLUSH_TEXTURE_CACHE) ? 2 : 0;
+
+	/* counts down so the values written are 2 first, then 1 */
+	for (; writes > 0; writes--) {
+		BEGIN_RING(chan, eng3d, 0x1fd8, 1);
+		OUT_RING  (chan, writes);
+	}
+
+	FIRE_RING(chan);
+	if (fence != NULL)
+		*fence = NULL;
+}
+
+/* destroy hook: destroys the draw module, if one was created, and frees the context */
+static void
+nvfx_destroy(struct pipe_context *pipe)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+	if (ctx->draw != NULL)
+		draw_destroy(ctx->draw);
+	FREE(ctx);
+}
+
+/* Creates an nvfx context on 'pscreen'; returns NULL if it or its swtnl draw module can't be allocated. */
+struct pipe_context *
+nvfx_create(struct pipe_screen *pscreen, void *priv)
+{
+	struct nvfx_screen *screen = nvfx_screen(pscreen);
+	struct nvfx_context *nvfx;
+	void *prev_private = screen->base.channel->user_private;
+
+	nvfx = CALLOC(1, sizeof(struct nvfx_context));
+	if (!nvfx)
+		return NULL;
+	nvfx->screen = screen;
+	nvfx->nvws = screen->nvws;
+	nvfx->pipe.winsys = pscreen->winsys;
+	nvfx->pipe.screen = pscreen;
+	nvfx->pipe.priv = priv;
+	nvfx->pipe.destroy = nvfx_destroy;
+	nvfx->pipe.draw_arrays = nvfx_draw_arrays;
+	nvfx->pipe.draw_elements = nvfx_draw_elements;
+	nvfx->pipe.clear = nvfx_clear;
+	nvfx->pipe.flush = nvfx_flush;
+	screen->base.channel->user_private = nvfx;
+	nvfx->is_nv4x = screen->is_nv4x;
+	nvfx_init_query_functions(nvfx);
+	nvfx_init_surface_functions(nvfx);
+	nvfx_init_state_functions(nvfx);
+	nvfx_init_resource_functions(&nvfx->pipe);
+
+	/* Create, configure, and install fallback swtnl path */
+	nvfx->draw = draw_create(&nvfx->pipe);
+	if (!nvfx->draw) {
+		screen->base.channel->user_private = prev_private; /* don't leave the channel pointing at freed memory */
+		FREE(nvfx);
+		return NULL;
+	}
+	draw_wide_point_threshold(nvfx->draw, 9999999.0);
+	draw_wide_line_threshold(nvfx->draw, 9999999.0);
+	draw_enable_line_stipple(nvfx->draw, FALSE);
+	draw_enable_point_sprites(nvfx->draw, FALSE);
+	draw_set_rasterize_stage(nvfx->draw, nvfx_draw_render_stage(nvfx));
+
+	/* set these so that we init them on first validation */
+	nvfx->state.scissor_enabled = ~0;
+	nvfx->state.stipple_enabled = ~0;
+	return &nvfx->pipe;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_context.h b/src/gallium/drivers/nvfx/nvfx_context.h
new file mode 100644
index 0000000000..e2c6d09fa1
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_context.h
@@ -0,0 +1,250 @@
+#ifndef __NVFX_CONTEXT_H__
+#define __NVFX_CONTEXT_H__
+
+#include <stdio.h>
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_compiler.h"
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_inlines.h"
+
+#include "draw/draw_vertex.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_gldefs.h"
+
+#include "nvfx_state.h"
+
+#define NOUVEAU_ERR(fmt, args...) \
+	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args); /* prints to stderr with a function:line prefix. NOTE(review): the expansion already ends in ';', so it is unsafe as the body of an unbraced if/else */
+#define NOUVEAU_MSG(fmt, args...) \
+	fprintf(stderr, "nouveau: "fmt, ##args); /* prints to stderr with a fixed prefix; same trailing-';' caveat as NOUVEAU_ERR */
+
+#include "nvfx_screen.h"
+
+#define NVFX_NEW_BLEND		(1 <<  0) /* NVFX_NEW_*: one flag bit per piece of state. NOTE(review): presumably the bits kept in nvfx_context::dirty - confirm */
+#define NVFX_NEW_RAST		(1 <<  1)
+#define NVFX_NEW_ZSA		(1 <<  2)
+#define NVFX_NEW_SAMPLER	(1 <<  3)
+#define NVFX_NEW_FB		(1 <<  4)
+#define NVFX_NEW_STIPPLE	(1 <<  5)
+#define NVFX_NEW_SCISSOR	(1 <<  6)
+#define NVFX_NEW_VIEWPORT	(1 <<  7)
+#define NVFX_NEW_BCOL		(1 <<  8)
+#define NVFX_NEW_VERTPROG	(1 <<  9)
+#define NVFX_NEW_FRAGPROG	(1 << 10)
+#define NVFX_NEW_ARRAYS		(1 << 11)
+#define NVFX_NEW_UCP		(1 << 12)
+#define NVFX_NEW_SR		(1 << 13)
+#define NVFX_NEW_VERTCONST	(1 << 14)
+#define NVFX_NEW_FRAGCONST	(1 << 15)
+
+struct nvfx_rasterizer_state {
+	struct pipe_rasterizer_state pipe;	/* embedded gallium rasterizer state */
+	unsigned sb_len;	/* NOTE(review): presumably the number of used entries in sb[] - confirm */
+	uint32_t sb[32];
+};
+
+struct nvfx_zsa_state {
+	struct pipe_depth_stencil_alpha_state pipe;	/* embedded gallium depth/stencil/alpha state */
+	unsigned sb_len;	/* NOTE(review): presumably the number of used entries in sb[] - confirm */
+	uint32_t sb[26];
+};
+
+struct nvfx_blend_state {
+	struct pipe_blend_state pipe;	/* embedded gallium blend state */
+	unsigned sb_len;	/* NOTE(review): presumably the number of used entries in sb[] - confirm */
+	uint32_t sb[13];
+};
+
+
+struct nvfx_state { /* stored as nvfx_context::state */
+	unsigned scissor_enabled;	/* nvfx_create() seeds this with ~0 so it is initialised on first validation */
+	unsigned stipple_enabled;	/* seeded with ~0 by nvfx_create(), like scissor_enabled */
+	unsigned fp_samplers;
+};
+
+struct nvfx_vtxelt_state {
+	struct pipe_vertex_element pipe[16];
+	unsigned num_elements;
+};
+
+struct nvfx_render_target { /* element type of nvfx_context::hw_rt[] and type of nvfx_context::hw_zeta */
+	struct nouveau_bo* bo;
+	unsigned offset;
+	unsigned pitch;
+};
+
+struct nvfx_context {
+	struct pipe_context pipe;	/* first member: nvfx_context() casts a pipe_context * to this type */
+
+	struct nouveau_winsys *nvws;
+	struct nvfx_screen *screen;
+
+	unsigned is_nv4x; /* either 0 or ~0 */
+
+	struct draw_context *draw;	/* fallback swtnl draw module */
+
+	/* HW state derived from pipe states */
+	struct nvfx_state state;
+	struct {
+		struct nvfx_vertex_program *vertprog;
+
+		unsigned nr_attribs;	/* number of entries used in hw[], draw[] and emit[] */
+		unsigned hw[PIPE_MAX_SHADER_INPUTS];	/* hardware attribute slot for each entry */
+		unsigned draw[PIPE_MAX_SHADER_INPUTS];	/* index into the draw vertex's data[] for each entry */
+		unsigned emit[PIPE_MAX_SHADER_INPUTS];	/* EMIT_* format for each entry */
+	} swtnl;
+
+	enum {
+		HW, SWTNL, SWRAST
+	} render_mode;
+	unsigned fallback_swtnl;
+
+	/* Context state */
+	unsigned dirty, draw_dirty;
+	struct pipe_scissor_state scissor;
+	unsigned stipple[32];
+	struct pipe_clip_state clip;
+	struct nvfx_vertex_program *vertprog;
+	struct nvfx_fragment_program *fragprog;
+	struct pipe_resource *constbuf[PIPE_SHADER_TYPES];
+	unsigned constbuf_nr[PIPE_SHADER_TYPES];
+	struct nvfx_rasterizer_state *rasterizer;
+	struct nvfx_zsa_state *zsa;
+	struct nvfx_blend_state *blend;
+	struct pipe_blend_color blend_colour;
+	struct pipe_stencil_ref stencil_ref;
+	struct pipe_viewport_state viewport;
+	struct pipe_framebuffer_state framebuffer;	/* passed to util_clear() by nvfx_clear() */
+	struct pipe_resource *idxbuf;
+	unsigned idxbuf_format;
+	struct nvfx_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS];
+	struct pipe_sampler_view *fragment_sampler_views[PIPE_MAX_SAMPLERS];
+	unsigned nr_samplers;
+	unsigned nr_textures;
+	unsigned dirty_samplers;
+	struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
+	unsigned vtxbuf_nr;
+	struct nvfx_vtxelt_state *vtxelt;
+
+	unsigned vbo_bo;
+	unsigned hw_vtxelt_nr;
+	uint8_t hw_samplers;	/* one bit per texture unit; nv40_fragtex_set() sets (1 << unit) */
+	uint32_t hw_txf[8];	/* txf word nv40_fragtex_set() last emitted for each unit */
+	struct nvfx_render_target hw_rt[4];
+	struct nvfx_render_target hw_zeta;
+};
+
+static INLINE struct nvfx_context *nvfx_context(struct pipe_context *pipe)
+{
+	struct nvfx_context *nvfx = (struct nvfx_context *)pipe; /* 'pipe' is the first member of struct nvfx_context */
+	return nvfx;
+}
+
+extern struct nvfx_state_entry nvfx_state_blend;
+extern struct nvfx_state_entry nvfx_state_blend_colour;
+extern struct nvfx_state_entry nvfx_state_fragprog;
+extern struct nvfx_state_entry nvfx_state_fragtex;
+extern struct nvfx_state_entry nvfx_state_framebuffer;
+extern struct nvfx_state_entry nvfx_state_rasterizer;
+extern struct nvfx_state_entry nvfx_state_scissor;
+extern struct nvfx_state_entry nvfx_state_sr;
+extern struct nvfx_state_entry nvfx_state_stipple;
+extern struct nvfx_state_entry nvfx_state_vbo;
+extern struct nvfx_state_entry nvfx_state_vertprog;
+extern struct nvfx_state_entry nvfx_state_viewport;
+extern struct nvfx_state_entry nvfx_state_vtxfmt;
+extern struct nvfx_state_entry nvfx_state_zsa;
+
+extern void nvfx_init_query_functions(struct nvfx_context *nvfx);
+extern void nvfx_init_surface_functions(struct nvfx_context *nvfx);
+
+/* nvfx_context.c */
+struct pipe_context *
+nvfx_create(struct pipe_screen *pscreen, void *priv);
+
+/* nvfx_clear.c */
+extern void nvfx_clear(struct pipe_context *pipe, unsigned buffers,
+		       const float *rgba, double depth, unsigned stencil);
+
+/* nvfx_draw.c */
+extern struct draw_stage *nvfx_draw_render_stage(struct nvfx_context *nvfx);
+extern void nvfx_draw_elements_swtnl(struct pipe_context *pipe,
+					struct pipe_resource *idxbuf,
+					unsigned ib_size, unsigned mode,
+					unsigned start, unsigned count);
+extern void nvfx_vtxfmt_validate(struct nvfx_context *nvfx);
+
+/* nvfx_state_fb.c */
+extern void nvfx_state_framebuffer_validate(struct nvfx_context *nvfx);
+void
+nvfx_framebuffer_relocate(struct nvfx_context *nvfx);
+
+/* nvfx_fragprog.c */
+extern void nvfx_fragprog_destroy(struct nvfx_context *,
+				    struct nvfx_fragment_program *);
+extern void nvfx_fragprog_validate(struct nvfx_context *nvfx);
+extern void
+nvfx_fragprog_relocate(struct nvfx_context *nvfx);
+
+/* nvfx_fragtex.c */
+extern void nvfx_fragtex_validate(struct nvfx_context *nvfx);
+extern void
+nvfx_fragtex_relocate(struct nvfx_context *nvfx);
+
+/* nv30_fragtex.c */
+extern void
+nv30_sampler_state_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_state *ps,
+			  const struct pipe_sampler_state *cso);
+extern void nv30_fragtex_set(struct nvfx_context *nvfx, int unit);
+
+/* nv40_fragtex.c */
+extern void
+nv40_sampler_state_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_state *ps,
+			  const struct pipe_sampler_state *cso);
+extern void nv40_fragtex_set(struct nvfx_context *nvfx, int unit);
+
+/* nvfx_state.c */
+extern void nvfx_init_state_functions(struct nvfx_context *nvfx);
+extern void nvfx_state_scissor_validate(struct nvfx_context *nvfx);
+extern void nvfx_state_stipple_validate(struct nvfx_context *nvfx);
+extern void nvfx_state_blend_validate(struct nvfx_context *nvfx);
+extern void nvfx_state_blend_colour_validate(struct nvfx_context *nvfx);
+extern void nvfx_state_viewport_validate(struct nvfx_context *nvfx);
+extern void nvfx_state_rasterizer_validate(struct nvfx_context *nvfx);
+extern void nvfx_state_sr_validate(struct nvfx_context *nvfx);
+extern void nvfx_state_zsa_validate(struct nvfx_context *nvfx);
+
+/* nvfx_state_emit.c */
+extern void nvfx_state_relocate(struct nvfx_context *nvfx);
+extern boolean nvfx_state_validate(struct nvfx_context *nvfx);
+extern boolean nvfx_state_validate_swtnl(struct nvfx_context *nvfx);
+extern void nvfx_state_emit(struct nvfx_context *nvfx);
+
+/* nvfx_transfer.c */
+extern void nvfx_init_transfer_functions(struct nvfx_context *nvfx);
+
+/* nvfx_vbo.c */
+extern boolean nvfx_vbo_validate(struct nvfx_context *nvfx);
+extern void nvfx_vbo_relocate(struct nvfx_context *nvfx);
+extern void nvfx_draw_arrays(struct pipe_context *, unsigned mode,
+				unsigned start, unsigned count);
+extern void nvfx_draw_elements(struct pipe_context *pipe,
+				  struct pipe_resource *indexBuffer,
+				  unsigned indexSize,
+				  unsigned mode, unsigned start,
+				  unsigned count);
+
+/* nvfx_vertprog.c */
+extern boolean nvfx_vertprog_validate(struct nvfx_context *nvfx);
+extern void nvfx_vertprog_destroy(struct nvfx_context *,
+				  struct nvfx_vertex_program *);
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_draw.c b/src/gallium/drivers/nvfx/nvfx_draw.c
new file mode 100644
index 0000000000..5eadce1f6d
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_draw.c
@@ -0,0 +1,349 @@
+#include "pipe/p_shader_tokens.h"
+#include "util/u_inlines.h"
+#include "tgsi/tgsi_ureg.h"
+
+#include "util/u_pack_color.h"
+
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_pipe.h"
+
+#include "nvfx_context.h"
+
+/* Simple, but crappy, swtnl path, hopefully we won't need to hit this very
+ * often at all.  Uses "quadro style" vertex submission + a fixed vertex
+ * layout to avoid the need to generate a vertex program or vtxfmt.
+ */
+
+struct nvfx_render_stage {
+	struct draw_stage stage; /* must stay first: nvfx_render_stage() casts a draw_stage pointer to this struct */
+	struct nvfx_context *nvfx; /* owning context, set in nvfx_draw_render_stage() */
+	unsigned prim; /* BEGIN_END mode currently open, NV34TCL_VERTEX_BEGIN_END_STOP once closed */
+};
+
+static INLINE struct nvfx_render_stage *
+nvfx_render_stage(struct draw_stage *stage) /* downcast a draw stage to the nvfx stage embedding it */
+{
+	return (struct nvfx_render_stage *)stage; /* relies on 'stage' being the first member */
+}
+
+/* Emit one swtnl vertex as immediate-mode attribute methods, one entry of
+ * nvfx->swtnl.{emit,hw,draw}[] at a time; 0xff is the position entry, which
+ * is divided through by w before submission. */
+static INLINE void
+nvfx_render_vertex(struct nvfx_context *nvfx, const struct vertex_header *v)
+{
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	unsigned i;
+
+	for (i = 0; i < nvfx->swtnl.nr_attribs; i++) {
+		unsigned idx = nvfx->swtnl.draw[i];
+		unsigned hw = nvfx->swtnl.hw[i];
+
+		switch (nvfx->swtnl.emit[i]) {
+		case EMIT_OMIT:
+			break;
+		case EMIT_1F:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_1F(hw), 1);
+			OUT_RING  (chan, fui(v->data[idx][0]));
+			break;
+		case EMIT_2F:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_2F_X(hw), 2);
+			OUT_RING  (chan, fui(v->data[idx][0]));
+			OUT_RING  (chan, fui(v->data[idx][1]));
+			break;
+		case EMIT_3F:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_3F_X(hw), 3);
+			OUT_RING  (chan, fui(v->data[idx][0]));
+			OUT_RING  (chan, fui(v->data[idx][1]));
+			OUT_RING  (chan, fui(v->data[idx][2]));
+			break;
+		case EMIT_4F:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4);
+			OUT_RING  (chan, fui(v->data[idx][0]));
+			OUT_RING  (chan, fui(v->data[idx][1]));
+			OUT_RING  (chan, fui(v->data[idx][2]));
+			OUT_RING  (chan, fui(v->data[idx][3]));
+			break;
+		case 0xff:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4);
+			OUT_RING  (chan, fui(v->data[idx][0] / v->data[idx][3]));
+			OUT_RING  (chan, fui(v->data[idx][1] / v->data[idx][3]));
+			OUT_RING  (chan, fui(v->data[idx][2] / v->data[idx][3]));
+			OUT_RING  (chan, fui(1.0f / v->data[idx][3]));
+			break;
+		case EMIT_4UB:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4UB(hw), 1);
+			OUT_RING  (chan, pack_ub4(float_to_ubyte(v->data[idx][0]), float_to_ubyte(v->data[idx][1]),
+					    float_to_ubyte(v->data[idx][2]), float_to_ubyte(v->data[idx][3])));
+			break; /* must not fall into the BGRA case and emit the attribute twice */
+		case EMIT_4UB_BGRA:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4UB(hw), 1);
+			OUT_RING  (chan, pack_ub4(float_to_ubyte(v->data[idx][2]), float_to_ubyte(v->data[idx][1]),
+					    float_to_ubyte(v->data[idx][0]), float_to_ubyte(v->data[idx][3])));
+			break;
+		default:
+			assert(0);
+			break;
+		}
+	}
+}
+
+/* Submit one primitive of 'count' vertices in hardware primitive 'mode',
+ * opening/closing BEGIN_END brackets as needed and tracking the open one
+ * in rs->prim. */
+static INLINE void
+nvfx_render_prim(struct draw_stage *stage, struct prim_header *prim,
+	       unsigned mode, unsigned count)
+{
+	struct nvfx_render_stage *rs = nvfx_render_stage(stage);
+	struct nvfx_context *nvfx = rs->nvfx;
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
+	/* room for 4xfloat32 per vertex + potentially 3 begin/end */
+	const unsigned reserve = (count * 20) + 6;
+	unsigned v;
+
+	if (AVAIL_RING(chan) < reserve) {
+		if (rs->prim != NV34TCL_VERTEX_BEGIN_END_STOP) {
+			NOUVEAU_ERR("AIII, missed flush\n");
+			assert(0);
+		}
+		FIRE_RING(chan);
+		nvfx_state_emit(nvfx);
+	}
+
+	/* A different mode closes any open bracket and opens a new one. */
+	if (rs->prim != mode) {
+		if (rs->prim != NV34TCL_VERTEX_BEGIN_END_STOP) {
+			BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+			OUT_RING  (chan, NV34TCL_VERTEX_BEGIN_END_STOP);
+		}
+
+		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, mode);
+		rs->prim = mode;
+	}
+
+	for (v = 0; v < count; v++)
+		nvfx_render_vertex(nvfx, prim->v[v]);
+
+	/* Close the bracket early when the push buffer is nearly full, so
+	 * the next call can flush without tripping the check above. */
+	if (AVAIL_RING(chan) < reserve) {
+		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, NV34TCL_VERTEX_BEGIN_END_STOP);
+		rs->prim = NV34TCL_VERTEX_BEGIN_END_STOP;
+	}
+}
+
+static void
+nvfx_render_point(struct draw_stage *draw, struct prim_header *prim) /* installed as stage.point */
+{
+	nvfx_render_prim(draw, prim, NV34TCL_VERTEX_BEGIN_END_POINTS, 1); /* one vertex per point */
+}
+
+static void
+nvfx_render_line(struct draw_stage *draw, struct prim_header *prim) /* installed as stage.line */
+{
+	nvfx_render_prim(draw, prim, NV34TCL_VERTEX_BEGIN_END_LINES, 2); /* two vertices per line */
+}
+
+static void
+nvfx_render_tri(struct draw_stage *draw, struct prim_header *prim) /* installed as stage.tri */
+{
+	nvfx_render_prim(draw, prim, NV34TCL_VERTEX_BEGIN_END_TRIANGLES, 3); /* three vertices per triangle */
+}
+
+/* draw_stage flush hook: close the BEGIN_END bracket left open by
+ * nvfx_render_prim(), if there is one.  'flags' is not used. */
+static void
+nvfx_render_flush(struct draw_stage *draw, unsigned flags)
+{
+	struct nvfx_render_stage *rs = nvfx_render_stage(draw);
+	struct nouveau_channel *chan = rs->nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = rs->nvfx->screen->eng3d;
+
+	if (rs->prim == NV34TCL_VERTEX_BEGIN_END_STOP)
+		return;
+	BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+	OUT_RING  (chan, NV34TCL_VERTEX_BEGIN_END_STOP);
+	rs->prim = NV34TCL_VERTEX_BEGIN_END_STOP;
+}
+
+static void
+nvfx_render_reset_stipple_counter(struct draw_stage *draw) /* installed as stage.reset_stipple_counter */
+{ /* intentionally empty: struct nvfx_render_stage holds no stipple state to reset */
+}
+
+static void
+nvfx_render_destroy(struct draw_stage *draw) /* installed as stage.destroy */
+{
+	FREE(draw); /* same address as the nvfx_render_stage allocated in nvfx_draw_render_stage() */
+}
+
+static struct nvfx_vertex_program *
+nvfx_create_drawvp(struct nvfx_context *nvfx) /* builds the pass-through vertex program stored in nvfx->swtnl.vertprog */
+{
+	struct ureg_program *ureg;
+	uint i;
+
+	ureg = ureg_create( TGSI_PROCESSOR_VERTEX );
+	if (ureg == NULL)
+		return NULL;
+
+	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0), ureg_DECL_vs_input(ureg, 0)); /* input 0 is the hw attribute nvfx_vtxfmt_validate() uses for position */
+	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0), ureg_DECL_vs_input(ureg, 3)); /* inputs 3/4 match the colour attributes 3 + i */
+	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1), ureg_DECL_vs_input(ureg, 4));
+	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_BCOLOR, 0), ureg_DECL_vs_input(ureg, 3)); /* back colours copy the same inputs as the front colours */
+	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_BCOLOR, 1), ureg_DECL_vs_input(ureg, 4));
+	ureg_MOV(ureg,
+		   ureg_writemask(ureg_DECL_output(ureg, TGSI_SEMANTIC_FOG, 1), TGSI_WRITEMASK_X), /* NOTE(review): fog output uses semantic index 1 - confirm this is intended */
+		   ureg_DECL_vs_input(ureg, 5)); /* input 5 matches the fog attribute */
+	for (i = 0; i < 8; ++i)
+		ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, i), ureg_DECL_vs_input(ureg, 8 + i)); /* generic i comes from input 8 + i */
+
+	ureg_END( ureg );
+
+	return ureg_create_shader_and_destroy( ureg, &nvfx->pipe ); /* going by its name, this also destroys 'ureg' */
+}
+
+/* Create the draw stage that submits primitives to the hardware; returns NULL if allocation fails. */
+struct draw_stage *
+nvfx_draw_render_stage(struct nvfx_context *nvfx)
+{
+	struct nvfx_render_stage *render = CALLOC_STRUCT(nvfx_render_stage);
+	if (!render)
+		return NULL; /* previously dereferenced unconditionally */
+	if (!nvfx->swtnl.vertprog)
+		nvfx->swtnl.vertprog = nvfx_create_drawvp(nvfx);
+	render->nvfx = nvfx;
+	render->stage.draw = nvfx->draw;
+	render->stage.point = nvfx_render_point;
+	render->stage.line = nvfx_render_line;
+	render->stage.tri = nvfx_render_tri;
+	render->stage.flush = nvfx_render_flush;
+	render->stage.reset_stipple_counter = nvfx_render_reset_stipple_counter;
+	render->stage.destroy = nvfx_render_destroy;
+	return &render->stage;
+}
+
+/* Software-TNL draw entry point: map the vertex, index and vertex-shader
+ * constant buffers, run the draw module over them, then unmap and flush. */
+void
+nvfx_draw_elements_swtnl(struct pipe_context *pipe,
+			 struct pipe_resource *idxbuf, unsigned idxbuf_size,
+			 unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS];
+	struct pipe_transfer *ib_transfer = NULL;
+	struct pipe_transfer *cb_transfer = NULL;
+	void *indices = NULL;
+	unsigned vb;
+
+	if (!nvfx_state_validate_swtnl(nvfx))
+		return;
+	nvfx_state_emit(nvfx);
+
+	for (vb = 0; vb < nvfx->vtxbuf_nr; vb++) {
+		void *verts = pipe_buffer_map(pipe, nvfx->vtxbuf[vb].buffer,
+					      PIPE_TRANSFER_READ,
+					      &vb_transfer[vb]);
+		draw_set_mapped_vertex_buffer(nvfx->draw, vb, verts);
+	}
+
+	/* Without an index buffer the draw module is handed (0, NULL). */
+	if (idxbuf)
+		indices = pipe_buffer_map(pipe, idxbuf, PIPE_TRANSFER_READ,
+					  &ib_transfer);
+	draw_set_mapped_element_buffer(nvfx->draw, idxbuf ? idxbuf_size : 0,
+				       indices);
+
+	if (nvfx->constbuf[PIPE_SHADER_VERTEX]) {
+		const unsigned nr = nvfx->constbuf_nr[PIPE_SHADER_VERTEX];
+		void *consts = pipe_buffer_map(pipe,
+					       nvfx->constbuf[PIPE_SHADER_VERTEX],
+					       PIPE_TRANSFER_READ,
+					       &cb_transfer);
+
+		draw_set_mapped_constant_buffer(nvfx->draw, PIPE_SHADER_VERTEX, 0,
+						consts, nr);
+	}
+
+	draw_arrays(nvfx->draw, mode, start, count);
+
+	for (vb = 0; vb < nvfx->vtxbuf_nr; vb++)
+		pipe_buffer_unmap(pipe, nvfx->vtxbuf[vb].buffer, vb_transfer[vb]);
+
+	if (idxbuf)
+		pipe_buffer_unmap(pipe, idxbuf, ib_transfer);
+
+	if (nvfx->constbuf[PIPE_SHADER_VERTEX])
+		pipe_buffer_unmap(pipe, nvfx->constbuf[PIPE_SHADER_VERTEX],
+				  cb_transfer);
+
+	draw_flush(nvfx->draw);
+	pipe->flush(pipe, 0, NULL);
+}
+
+/* Append one swtnl attribute: hw slot, emit format and the draw shader output feeding it. */
+static INLINE void
+emit_attrib(struct nvfx_context *nvfx, unsigned hw, unsigned emit,
+	    unsigned semantic, unsigned index)
+{
+	const unsigned slot = nvfx->swtnl.nr_attribs++;
+
+	nvfx->swtnl.draw[slot] = draw_find_shader_output(nvfx->draw, semantic, index);
+	nvfx->swtnl.hw[slot] = hw;
+	nvfx->swtnl.emit[slot] = emit;
+}
+
+/* Rebuild the swtnl attribute list (nvfx->swtnl.{hw,emit,draw}[]) from the
+ * inputs read by the bound fragment program.
+ *
+ * Colours land in hw attributes 3/4, generics 0..7 in 8..15, fog in 5, and
+ * position is always appended last in hw attribute 0.
+ */
+void
+nvfx_vtxfmt_validate(struct nvfx_context *nvfx)
+{
+	struct nvfx_fragment_program *fp = nvfx->fragprog;
+	unsigned colour = 0, texcoords = 0, fog = 0, n;
+
+	/* Collect per-semantic bitmasks of the fragprog inputs in use. */
+	for (n = 0; n < fp->info.num_inputs; n++) {
+		const unsigned sem = fp->info.input_semantic_name[n];
+
+		if (sem == TGSI_SEMANTIC_COLOR)
+			colour |= (1 << fp->info.input_semantic_index[n]);
+		else if (sem == TGSI_SEMANTIC_GENERIC)
+			texcoords |= (1 << fp->info.input_semantic_index[n]);
+		else if (sem == TGSI_SEMANTIC_FOG)
+			fog = 1;
+		else if (sem != TGSI_SEMANTIC_POSITION) {
+			assert(0);
+		}
+	}
+
+	nvfx->swtnl.nr_attribs = 0;
+
+	/* Map draw vtxprog output to hw attribute IDs */
+	for (n = 0; n < 2; n++) {
+		if (colour & (1 << n))
+			emit_attrib(nvfx, 3 + n, EMIT_4F, TGSI_SEMANTIC_COLOR, n);
+	}
+
+	for (n = 0; n < 8; n++) {
+		if (texcoords & (1 << n))
+			emit_attrib(nvfx, 8 + n, EMIT_4F, TGSI_SEMANTIC_GENERIC, n);
+	}
+
+	if (fog)
+		emit_attrib(nvfx, 5, EMIT_1F, TGSI_SEMANTIC_FOG, 0);
+
+	/* Position goes last, using the 0xff pseudo-format. */
+	emit_attrib(nvfx, 0, 0xff, TGSI_SEMANTIC_POSITION, 0);
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
new file mode 100644
index 0000000000..6772d9bd51
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -0,0 +1,1004 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nvfx_context.h"
+#include "nvfx_shader.h"
+
+#define MAX_CONSTS 128
+#define MAX_IMM 32
+struct nvfx_fpc { /* per-compile scratch state, allocated and freed in nvfx_fragprog_translate() */
+	struct nvfx_fragment_program *fp; /* program being compiled */
+
+	uint attrib_map[PIPE_MAX_SHADER_INPUTS]; /* TGSI input index -> hw input source id */
+
+	unsigned r_temps; /* bitmask of hw temps in use (declared outputs reserve their bit too) */
+	unsigned r_temps_discard; /* temps handed out by temp() that release_temps() will free */
+	struct nvfx_sreg r_result[PIPE_MAX_SHADER_OUTPUTS]; /* TGSI output index -> hw result register */
+	struct nvfx_sreg *r_temp; /* TGSI temporary index -> hw temp, heap array (NULL when none declared) */
+
+	int num_regs; /* highest temp index written + 1, starts at 2 */
+
+	unsigned inst_offset; /* word offset in fp->insn of the instruction being emitted */
+	unsigned have_const; /* set once the current instruction has its 4 inline constant words */
+
+	struct {
+		int pipe; /* constant buffer index, or -1 when 'vals' holds an immediate */
+		float vals[4];
+	} consts[MAX_CONSTS];
+	int nr_consts;
+
+	struct nvfx_sreg imm[MAX_IMM]; /* TGSI immediate index -> constant register from constant() */
+	unsigned nr_imm;
+};
+
+/* Hand out the lowest free hw temp and mark it for release_temps(); temp 0 if none is left. */
+static INLINE struct nvfx_sreg
+temp(struct nvfx_fpc *fpc)
+{
+	const int free_idx = ffs(~fpc->r_temps) - 1;
+
+	if (free_idx >= 0) {
+		fpc->r_temps |= (1 << free_idx);
+		fpc->r_temps_discard |= (1 << free_idx);
+		return nvfx_sr(NVFXSR_TEMP, free_idx);
+	}
+	NOUVEAU_ERR("out of temps!!\n");
+	assert(0);
+	return nvfx_sr(NVFXSR_TEMP, 0);
+}
+
+static INLINE void
+release_temps(struct nvfx_fpc *fpc) /* frees every temp handed out by temp() since the last release */
+{
+	fpc->r_temps &= ~fpc->r_temps_discard; /* temps whose discard bit was cleared stay allocated */
+	fpc->r_temps_discard = 0;
+}
+
+/* Reserve a constant slot: pipe >= 0 is a constant buffer index, -1 an immediate copied from vals. */
+static INLINE struct nvfx_sreg
+constant(struct nvfx_fpc *fpc, int pipe, float vals[4])
+{
+	const int slot = fpc->nr_consts;
+
+	if (slot == MAX_CONSTS)
+		assert(0);
+	fpc->nr_consts = slot + 1;
+	fpc->consts[slot].pipe = pipe;
+	if (pipe == -1)
+		memcpy(fpc->consts[slot].vals, vals, 4 * sizeof(float));
+	return nvfx_sr(NVFXSR_CONST, slot);
+}
+
+#define arith(cc,s,o,d,m,s0,s1,s2) \
+	nvfx_fp_arith((cc), (s), NVFX_FP_OP_OPCODE_##o, \
+			(d), (m), (s0), (s1), (s2)) /* shorthand: 'o' is the opcode name suffix, e.g. MOV */
+#define tex(cc,s,o,u,d,m,s0,s1,s2) \
+	nvfx_fp_tex((cc), (s), NVFX_FP_OP_OPCODE_##o, (u), \
+		    (d), (m), (s0), none, none) /* s1/s2 are ignored; a variable 'none' must be in scope at the call site */
+
+static void
+grow_insns(struct nvfx_fpc *fpc, int size) /* extends fp->insn by 'size' 32-bit words; new words are not initialised */
+{
+	struct nvfx_fragment_program *fp = fpc->fp;
+
+	fp->insn_len += size;
+	fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len); /* NOTE(review): result unchecked; may move fp->insn, so callers must re-derive pointers into it */
+}
+
+static void
+emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_sreg src) /* encodes source operand 'pos' (0..2) into the current instruction */
+{
+	struct nvfx_fragment_program *fp = fpc->fp;
+	uint32_t *hw = &fp->insn[fpc->inst_offset];
+	uint32_t sr = 0; /* operand word, ORed into hw[pos + 1] at the end */
+
+	switch (src.type) {
+	case NVFXSR_INPUT:
+		sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
+		hw[0] |= (src.index << NVFX_FP_OP_INPUT_SRC_SHIFT); /* the input selector goes in word 0, not in the operand word */
+		break;
+	case NVFXSR_OUTPUT:
+		sr |= NVFX_FP_REG_SRC_HALF;
+		/* fall-through */
+	case NVFXSR_TEMP:
+		sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT);
+		sr |= (src.index << NVFX_FP_REG_SRC_SHIFT);
+		break;
+	case NVFXSR_CONST:
+		if (!fpc->have_const) {
+			grow_insns(fpc, 4); /* 4 extra words right after the instruction hold the constant */
+			fpc->have_const = 1;
+		}
+
+		hw = &fp->insn[fpc->inst_offset]; /* re-derive: grow_insns() may have moved fp->insn */
+		if (fpc->consts[src.index].pipe >= 0) {
+			struct nvfx_fragment_program_data *fpd;
+
+			fp->consts = realloc(fp->consts, ++fp->nr_consts *
+					     sizeof(*fpd)); /* NOTE(review): realloc result is not checked */
+			fpd = &fp->consts[fp->nr_consts - 1];
+			fpd->offset = fpc->inst_offset + 4; /* word offset of the zeroed slot; presumably patched with the buffer value later */
+			fpd->index = fpc->consts[src.index].pipe;
+			memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4);
+		} else {
+			memcpy(&fp->insn[fpc->inst_offset + 4],
+				fpc->consts[src.index].vals,
+				sizeof(uint32_t) * 4); /* immediates are stored inline straight away */
+		}
+
+		sr |= (NVFX_FP_REG_TYPE_CONST << NVFX_FP_REG_TYPE_SHIFT);
+		break;
+	case NVFXSR_NONE:
+		sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT); /* unused operands are encoded with the INPUT type */
+		break;
+	default:
+		assert(0);
+	}
+
+	if (src.negate)
+		sr |= NVFX_FP_REG_NEGATE;
+
+	if (src.abs)
+		hw[1] |= (1 << (29 + pos)); /* abs flags for all operands sit in word 1, one bit per operand from bit 29 */
+
+	sr |= ((src.swz[0] << NVFX_FP_REG_SWZ_X_SHIFT) |
+	       (src.swz[1] << NVFX_FP_REG_SWZ_Y_SHIFT) |
+	       (src.swz[2] << NVFX_FP_REG_SWZ_Z_SHIFT) |
+	       (src.swz[3] << NVFX_FP_REG_SWZ_W_SHIFT));
+
+	hw[pos + 1] |= sr;
+}
+
+static void
+emit_dst(struct nvfx_fpc *fpc, struct nvfx_sreg dst) /* encodes the destination register into word 0 of the current instruction */
+{
+	struct nvfx_fragment_program *fp = fpc->fp;
+	uint32_t *hw = &fp->insn[fpc->inst_offset];
+
+	switch (dst.type) {
+	case NVFXSR_TEMP:
+		if (fpc->num_regs < (dst.index + 1))
+			fpc->num_regs = dst.index + 1; /* track the highest temp written; feeds fp_control in nvfx_fragprog_translate() */
+		break;
+	case NVFXSR_OUTPUT:
+		if (dst.index == 1) { /* result register 1 is what nvfx_fragprog_parse_decl_output() assigns to POSITION */
+			fp->fp_control |= 0xe; /* NOTE(review): meaning of 0xe is not visible here - confirm against hw docs */
+		} else {
+			hw[0] |= NVFX_FP_OP_OUT_REG_HALF;
+		}
+		break;
+	case NVFXSR_NONE:
+		hw[0] |= (1 << 30); /* NOTE(review): presumably flags "no destination write" - confirm */
+		break;
+	default:
+		assert(0);
+	}
+
+	hw[0] |= (dst.index << NVFX_FP_OP_OUT_REG_SHIFT);
+}
+
+static void
+nvfx_fp_arith(struct nvfx_fpc *fpc, int sat, int op, /* appends one 4-word instruction to fp->insn */
+	      struct nvfx_sreg dst, int mask,
+	      struct nvfx_sreg s0, struct nvfx_sreg s1, struct nvfx_sreg s2)
+{
+	struct nvfx_fragment_program *fp = fpc->fp;
+	uint32_t *hw;
+
+	fpc->inst_offset = fp->insn_len; /* the new instruction starts at the current end of the program */
+	fpc->have_const = 0; /* no inline constant words appended for this instruction yet */
+	grow_insns(fpc, 4);
+	hw = &fp->insn[fpc->inst_offset];
+	memset(hw, 0, sizeof(uint32_t) * 4);
+
+	if (op == NVFX_FP_OP_OPCODE_KIL)
+		fp->fp_control |= NV34TCL_FP_CONTROL_USES_KIL;
+	hw[0] |= (op << NVFX_FP_OP_OPCODE_SHIFT);
+	hw[0] |= (mask << NVFX_FP_OP_OUTMASK_SHIFT);
+	hw[2] |= (dst.dst_scale << NVFX_FP_OP_DST_SCALE_SHIFT);
+
+	if (sat)
+		hw[0] |= NVFX_FP_OP_OUT_SAT;
+
+	if (dst.cc_update)
+		hw[0] |= NVFX_FP_OP_COND_WRITE_ENABLE;
+	hw[1] |= (dst.cc_test << NVFX_FP_OP_COND_SHIFT);
+	hw[1] |= ((dst.cc_swz[0] << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+		  (dst.cc_swz[1] << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
+		  (dst.cc_swz[2] << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
+		  (dst.cc_swz[3] << NVFX_FP_OP_COND_SWZ_W_SHIFT));
+
+	emit_dst(fpc, dst); /* 'hw' must not be used past this point: emit_src() may realloc fp->insn */
+	emit_src(fpc, 0, s0);
+	emit_src(fpc, 1, s1);
+	emit_src(fpc, 2, s2);
+}
+
+/* Emit a texture instruction: an arith-encoded op tagged with sampler 'unit'. */
+static void
+nvfx_fp_tex(struct nvfx_fpc *fpc, int sat, int op, int unit,
+	    struct nvfx_sreg dst, int mask,
+	    struct nvfx_sreg s0, struct nvfx_sreg s1, struct nvfx_sreg s2)
+{
+	nvfx_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2);
+
+	/* fpc->inst_offset now refers to the instruction just emitted */
+	fpc->fp->insn[fpc->inst_offset] |= (unit << NVFX_FP_OP_TEX_UNIT_SHIFT);
+	fpc->fp->samplers |= (1 << unit);
+}
+
+/* Translate a TGSI source operand (file, index, modifiers, swizzle) into an nvfx_sreg. */
+static INLINE struct nvfx_sreg
+tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
+{
+	const int index = fsrc->Register.Index;
+	struct nvfx_sreg reg = { 0 };
+
+	switch (fsrc->Register.File) {
+	case TGSI_FILE_INPUT:
+		reg = nvfx_sr(NVFXSR_INPUT, fpc->attrib_map[index]);
+		break;
+	case TGSI_FILE_CONSTANT:
+		reg = constant(fpc, index, NULL);
+		break;
+	case TGSI_FILE_IMMEDIATE:
+		assert(index < fpc->nr_imm);
+		reg = fpc->imm[index];
+		break;
+	case TGSI_FILE_TEMPORARY:
+		reg = fpc->r_temp[index];
+		break;
+	case TGSI_FILE_OUTPUT: /* NV40 fragprog result regs are just temps, so this is simple */
+		reg = fpc->r_result[index];
+		break;
+	default:
+		NOUVEAU_ERR("bad src file\n");
+		break;
+	}
+
+	reg.abs = fsrc->Register.Absolute;
+	reg.negate = fsrc->Register.Negate;
+	reg.swz[0] = fsrc->Register.SwizzleX;
+	reg.swz[1] = fsrc->Register.SwizzleY;
+	reg.swz[2] = fsrc->Register.SwizzleZ;
+	reg.swz[3] = fsrc->Register.SwizzleW;
+	return reg;
+}
+
+/* Map a TGSI destination to its nvfx register; unknown files are logged and map to NONE. */
+static INLINE struct nvfx_sreg
+tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst)
+{
+	if (fdst->Register.File == TGSI_FILE_OUTPUT)
+		return fpc->r_result[fdst->Register.Index];
+	if (fdst->Register.File == TGSI_FILE_TEMPORARY)
+		return fpc->r_temp[fdst->Register.Index];
+	if (fdst->Register.File != TGSI_FILE_NULL) {
+		NOUVEAU_ERR("bad dst file %d\n", fdst->Register.File);
+	}
+
+	return nvfx_sr(NVFXSR_NONE, 0);
+}
+
+/* Convert a TGSI writemask into the hardware output mask: each
+ * TGSI_WRITEMASK_* bit set in 'tgsi' selects the NVFX_FP_MASK_* bit of
+ * the same component. */
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	return ((tgsi & TGSI_WRITEMASK_X) ? NVFX_FP_MASK_X : 0) |
+	       ((tgsi & TGSI_WRITEMASK_Y) ? NVFX_FP_MASK_Y : 0) |
+	       ((tgsi & TGSI_WRITEMASK_Z) ? NVFX_FP_MASK_Z : 0) |
+	       ((tgsi & TGSI_WRITEMASK_W) ? NVFX_FP_MASK_W : 0);
+}
+
+/* Translate one TGSI instruction into hardware fragment program instruction(s),
+ * appending them to fpc->fp->insn.  Returns FALSE on a source register file or
+ * opcode this backend does not handle, TRUE otherwise (END emits nothing). */
+static boolean
+nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
+				const struct tgsi_full_instruction *finst)
+{
+	const struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0);
+	struct nvfx_sreg src[3], dst, tmp;
+	int mask, sat, unit = 0;
+	int ai = -1, ci = -1, ii = -1;
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
+		return TRUE;
+
+	/* Temporaries and outputs are plain registers: map them directly. */
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc = &finst->Src[i];
+
+		if (fsrc->Register.File == TGSI_FILE_TEMPORARY ||
+		    fsrc->Register.File == TGSI_FILE_OUTPUT)
+			src[i] = tgsi_src(fpc, fsrc);
+	}
+
+	/* An instruction may reference one input and one constant/immediate;
+	 * any further ones are first copied into temps with a MOV. */
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc = &finst->Src[i];
+
+		switch (fsrc->Register.File) {
+		case TGSI_FILE_INPUT:
+			if (ai == -1 || ai == fsrc->Register.Index) {
+				ai = fsrc->Register.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_CONSTANT:
+			if ((ci == -1 && ii == -1) ||
+			    ci == fsrc->Register.Index) {
+				ci = fsrc->Register.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_IMMEDIATE:
+			if ((ci == -1 && ii == -1) ||
+			    ii == fsrc->Register.Index) {
+				ii = fsrc->Register.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+		case TGSI_FILE_OUTPUT:
+			/* handled above */
+			break;
+		case TGSI_FILE_SAMPLER:
+			unit = fsrc->Register.Index;
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(fpc, &finst->Dst[0]);
+	mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
+	sat  = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
+
+	switch (finst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_CMP:
+		tmp = nvfx_sr(NVFXSR_NONE, 0);
+		tmp.cc_update = 1;
+		arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
+		dst.cc_test = NVFX_COND_GE;
+		arith(fpc, sat, MOV, dst, mask, src[2], none, none);
+		dst.cc_test = NVFX_COND_LT;
+		arith(fpc, sat, MOV, dst, mask, src[1], none, none);
+		break;
+	case TGSI_OPCODE_COS:
+		arith(fpc, sat, COS, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_DDX:
+		if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
+			tmp = temp(fpc);
+			arith(fpc, sat, DDX, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y,
+			      swz(src[0], Z, W, Z, W), none, none);
+			arith(fpc, 0, MOV, tmp, NVFX_FP_MASK_Z | NVFX_FP_MASK_W,
+			      swz(tmp, X, Y, X, Y), none, none);
+			arith(fpc, sat, DDX, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none);
+			arith(fpc, 0, MOV, dst, mask, tmp, none, none);
+		} else {
+			arith(fpc, sat, DDX, dst, mask, src[0], none, none);
+		}
+		break;
+	case TGSI_OPCODE_DDY:
+		if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
+			tmp = temp(fpc);
+			arith(fpc, sat, DDY, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y,
+			      swz(src[0], Z, W, Z, W), none, none);
+			arith(fpc, 0, MOV, tmp, NVFX_FP_MASK_Z | NVFX_FP_MASK_W,
+			      swz(tmp, X, Y, X, Y), none, none);
+			arith(fpc, sat, DDY, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none);
+			arith(fpc, 0, MOV, dst, mask, tmp, none, none);
+		} else {
+			arith(fpc, sat, DDY, dst, mask, src[0], none, none);
+		}
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(fpc, sat, DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(fpc, sat, DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		tmp = temp(fpc);
+		arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_X, src[0], src[1], none);
+		arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X),
+		      swz(src[1], W, W, W, W), none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(fpc, sat, DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(fpc, sat, EX2, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(fpc, sat, FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(fpc, sat, FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_KILP:
+		arith(fpc, 0, KIL, none, 0, none, none, none);
+		break;
+	case TGSI_OPCODE_KIL:
+		dst = nvfx_sr(NVFXSR_NONE, 0);
+		dst.cc_update = 1;
+		arith(fpc, 0, MOV, dst, NVFX_FP_MASK_ALL, src[0], none, none);
+		dst.cc_update = 0; dst.cc_test = NVFX_COND_LT;
+		arith(fpc, 0, KIL, dst, 0, none, none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(fpc, sat, LG2, dst, mask, src[0], none, none);
+		break;
+//	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_LRP:
+		if(!nvfx->is_nv4x)
+			arith(fpc, sat, LRP_NV30, dst, mask, src[0], src[1], src[2]);
+		else {
+			tmp = temp(fpc);
+			arith(fpc, 0, MAD, tmp, mask, neg(src[0]), src[2], src[2]);
+			arith(fpc, sat, MAD, dst, mask, src[0], src[1], tmp);
+		}
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(fpc, sat, MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(fpc, sat, MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(fpc, sat, MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(fpc, sat, MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_POW:
+		if(!nvfx->is_nv4x)
+			arith(fpc, sat, POW_NV30, dst, mask, src[0], src[1], none);
+		else {
+			tmp = temp(fpc);
+			arith(fpc, 0, LG2, tmp, NVFX_FP_MASK_X,
+			      swz(src[0], X, X, X, X), none, none);
+			arith(fpc, 0, MUL, tmp, NVFX_FP_MASK_X, swz(tmp, X, X, X, X),
+			      swz(src[1], X, X, X, X), none);
+			arith(fpc, sat, EX2, dst, mask,
+			      swz(tmp, X, X, X, X), none, none);
+		}
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(fpc, sat, RCP, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_RET:
+		assert(0);
+		break;
+	case TGSI_OPCODE_RFL:
+		if(!nvfx->is_nv4x)
+			arith(fpc, 0, RFL_NV30, dst, mask, src[0], src[1], none);
+		else {
+			tmp = temp(fpc);
+			arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_X, src[0], src[0], none);
+			arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_Y, src[0], src[1], none);
+			arith(fpc, 0, DIV, scale(tmp, 2X), NVFX_FP_MASK_Z,
+			      swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none);
+			arith(fpc, sat, MAD, dst, mask,
+			      swz(tmp, Z, Z, Z, Z), src[0], neg(src[1]));
+		}
+		break;
+	case TGSI_OPCODE_RSQ:
+		if(!nvfx->is_nv4x)
+			arith(fpc, sat, RSQ_NV30, dst, mask, abs(swz(src[0], X, X, X, X)), none, none);
+		else {
+			tmp = temp(fpc);
+			arith(fpc, 0, LG2, scale(tmp, INV_2X), NVFX_FP_MASK_X,
+			      abs(swz(src[0], X, X, X, X)), none, none);
+			arith(fpc, sat, EX2, dst, mask,
+			      neg(swz(tmp, X, X, X, X)), none, none);
+		}
+		break;
+	case TGSI_OPCODE_SCS:
+		/* avoid overwriting the source */
+		if(src[0].swz[NVFX_SWZ_X] != NVFX_SWZ_X)
+		{
+			if (mask & NVFX_FP_MASK_X) {
+				arith(fpc, sat, COS, dst, NVFX_FP_MASK_X,
+				      swz(src[0], X, X, X, X), none, none);
+			}
+			if (mask & NVFX_FP_MASK_Y) {
+				arith(fpc, sat, SIN, dst, NVFX_FP_MASK_Y,
+				      swz(src[0], X, X, X, X), none, none);
+			}
+		}
+		else
+		{
+			if (mask & NVFX_FP_MASK_Y) {
+				arith(fpc, sat, SIN, dst, NVFX_FP_MASK_Y,
+				      swz(src[0], X, X, X, X), none, none);
+			}
+			if (mask & NVFX_FP_MASK_X) {
+				arith(fpc, sat, COS, dst, NVFX_FP_MASK_X,
+				      swz(src[0], X, X, X, X), none, none);
+			}
+		}
+		break;
+	case TGSI_OPCODE_SEQ:
+		arith(fpc, sat, SEQ, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SFL:
+		arith(fpc, sat, SFL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(fpc, sat, SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGT:
+		arith(fpc, sat, SGT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SIN:
+		arith(fpc, sat, SIN, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_SLE:
+		arith(fpc, sat, SLE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(fpc, sat, SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SNE:
+		arith(fpc, sat, SNE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_STR:
+		arith(fpc, sat, STR, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB:
+		arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none);
+		break;
+	case TGSI_OPCODE_TEX:
+		tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_TXB:
+		tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_TXP:
+		tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_XPD:
+		tmp = temp(fpc);
+		arith(fpc, 0, MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(fpc, sat, MAD, dst, (mask & ~NVFX_FP_MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	release_temps(fpc);
+	return TRUE;
+}
+
+/* Record which hardware input source a declared fragment program input
+ * maps to.  Returns FALSE, after logging, for a semantic or semantic index
+ * that has no hardware input. */
+static boolean
+nvfx_fragprog_parse_decl_attrib(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	const unsigned sem_index = fdec->Semantic.Index;
+	int hw;
+
+	switch (fdec->Semantic.Name) {
+	case TGSI_SEMANTIC_POSITION:
+		hw = NVFX_FP_OP_INPUT_SRC_POSITION;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		if (sem_index > 1) {
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		hw = sem_index ? NVFX_FP_OP_INPUT_SRC_COL1
+			       : NVFX_FP_OP_INPUT_SRC_COL0;
+		break;
+	case TGSI_SEMANTIC_FOG:
+		hw = NVFX_FP_OP_INPUT_SRC_FOGC;
+		break;
+	case TGSI_SEMANTIC_GENERIC:
+		if (sem_index > 7) {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		hw = NVFX_FP_OP_INPUT_SRC_TC(sem_index);
+		break;
+	default:
+		NOUVEAU_ERR("bad input semantic\n");
+		return FALSE;
+	}
+
+	/* only the first register of the declared range is mapped */
+	fpc->attrib_map[fdec->Range.First] = hw;
+	return TRUE;
+}
+
+/* Bind a declared fragment program output to its hardware result register
+ * and reserve that register in r_temps so temp() never hands it out.
+ * Returns FALSE, after logging, for outputs the hardware cannot hold. */
+static boolean
+nvfx_fragprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	static const unsigned colour_hw[4] = { 0, 2, 3, 4 };
+	const unsigned max_colour_hw = nvfx->is_nv4x ? 4 : 2;
+	unsigned hw;
+
+	switch (fdec->Semantic.Name) {
+	case TGSI_SEMANTIC_POSITION:
+		hw = 1;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		/* colour indices past the table become ~0 and fail the check */
+		hw = (fdec->Semantic.Index < 4)
+			? colour_hw[fdec->Semantic.Index] : ~0u;
+		if (hw > max_colour_hw) {
+			NOUVEAU_ERR("bad rcol index\n");
+			return FALSE;
+		}
+		break;
+	default:
+		NOUVEAU_ERR("bad output semantic\n");
+		return FALSE;
+	}
+
+	fpc->r_result[fdec->Range.First] = nvfx_sr(NVFXSR_OUTPUT, hw);
+	fpc->r_temps |= (1 << hw);
+	return TRUE;
+}
+
+/* First pass over the tokens: map declared inputs/outputs, collect immediates
+ * and allocate a hw temp per TGSI temporary.  Returns FALSE on failure. */
+static boolean
+nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc)
+{
+	struct tgsi_parse_context p;
+	int high_temp = -1, i;
+
+	tgsi_parse_init(&p, fpc->fp->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&p)) {
+		const union tgsi_full_token *tok = &p.FullToken;
+
+		tgsi_parse_token(&p);
+		switch(tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec = &p.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_INPUT:
+				if (!nvfx_fragprog_parse_decl_attrib(nvfx, fpc, fdec))
+					goto out_err;
+				break;
+			case TGSI_FILE_OUTPUT:
+				if (!nvfx_fragprog_parse_decl_output(nvfx, fpc, fdec))
+					goto out_err;
+				break;
+			case TGSI_FILE_TEMPORARY:
+				if (fdec->Range.Last > high_temp)
+					high_temp = fdec->Range.Last;
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			struct tgsi_full_immediate *imm = &p.FullToken.FullImmediate;
+			float vals[4];
+
+			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+			assert(fpc->nr_imm < MAX_IMM);
+
+			vals[0] = imm->u[0].Float;
+			vals[1] = imm->u[1].Float;
+			vals[2] = imm->u[2].Float;
+			vals[3] = imm->u[3].Float;
+			fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals);
+		}
+			break;
+		default:
+			break;
+		}
+	}
+	tgsi_parse_free(&p);
+
+	if (++high_temp) {
+		fpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_sreg));
+		if (!fpc->r_temp)
+			return FALSE; /* out of memory: fail instead of writing through NULL */
+		for (i = 0; i < high_temp; i++)
+			fpc->r_temp[i] = temp(fpc);
+		fpc->r_temps_discard = 0; /* keep the declared temps across release_temps() */
+	}
+
+	return TRUE;
+
+out_err:
+	if (fpc->r_temp)
+		FREE(fpc->r_temp);
+	tgsi_parse_free(&p);
+	return FALSE;
+}
+
+/* Compile the TGSI tokens of 'fp' into hardware instructions (fp->insn).
+ *
+ * fp->translated is set to TRUE only on success.  On failure it is left
+ * unchanged, and fp->insn may hold a partially built program.
+ */
+static void
+nvfx_fragprog_translate(struct nvfx_context *nvfx,
+			struct nvfx_fragment_program *fp)
+{
+	struct tgsi_parse_context parse;
+	struct nvfx_fpc *fpc = CALLOC(1, sizeof(struct nvfx_fpc));
+
+	if (!fpc)
+		return;
+	fpc->fp = fp;
+	fpc->num_regs = 2;
+
+	if (!nvfx_fragprog_prepare(nvfx, fpc)) {
+		FREE(fpc);
+		return;
+	}
+
+	tgsi_parse_init(&parse, fp->pipe.tokens);
+
+	/* Second pass: only instruction tokens matter here. */
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
+			continue;
+		if (!nvfx_fragprog_parse_instruction(nvfx, fpc,
+				&parse.FullToken.FullInstruction))
+			goto out_err;
+	}
+
+	/* Record the temp register usage in fp_control; the encoding
+	 * differs between nv3x and nv4x. */
+	if (nvfx->is_nv4x)
+		fp->fp_control |= fpc->num_regs << NV40TCL_FP_CONTROL_TEMP_COUNT_SHIFT;
+	else
+		fp->fp_control |= (fpc->num_regs-1)/2;
+
+	/* Terminate final instruction */
+	if (fp->insn)
+		fp->insn[fpc->inst_offset] |= 0x00000001;
+
+	/* Append NOP + END instruction, may or may not be necessary. */
+	fpc->inst_offset = fp->insn_len;
+	grow_insns(fpc, 4);
+	fp->insn[fpc->inst_offset + 0] = 0x00000001;
+	fp->insn[fpc->inst_offset + 1] = 0x00000000;
+	fp->insn[fpc->inst_offset + 2] = 0x00000000;
+	fp->insn[fpc->inst_offset + 3] = 0x00000000;
+
+	fp->translated = TRUE;
+	/* success falls through: the cleanup below is shared with errors */
+out_err:
+	tgsi_parse_free(&parse);
+	if (fpc->r_temp)
+		FREE(fpc->r_temp);
+	FREE(fpc);
+}
+/* Copy len bytes of fragment program words; when WORDS_BIGENDIAN is defined, swap the 16-bit halves of each 32-bit word. */
+static inline void
+nvfx_fp_memcpy(void* dst, const void* src, size_t len)
+{
+#ifndef WORDS_BIGENDIAN
+	memcpy(dst, src, len);
+#else
+	size_t i;
+	for(i = 0; i < len; i += 4) {
+		uint32_t v = *(const uint32_t*)((const char*)src + i); /* load the word itself, not its address */
+		*(uint32_t*)((char*)dst + i) = (v >> 16) | (v << 16);
+	}
+#endif
+}
+/* Prepare the bound fragment program for drawing: translate it on first use, refresh its constants, and emit its address and control words. */
+void
+nvfx_fragprog_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nvfx_fragment_program *fp = nvfx->fragprog;
+	int update = 0;
+	int i;
+	/* first use of this program: translate it and work out how many copies fit in one buffer object */
+	if (!fp->translated)
+	{
+		const int min_size = 4096;
+
+		nvfx_fragprog_translate(nvfx, fp);
+		if (!fp->translated) {
+			static unsigned dummy[8] = {1, 0, 0, 0, 1, 0, 0, 0};
+			static int warned = 0;
+			if(!warned)
+			{
+				fprintf(stderr, "nvfx: failed to translate fragment program!\n");
+				warned = 1;
+			}
+
+			/* use dummy program: we cannot fail here */
+			fp->translated = TRUE;
+			fp->insn = malloc(sizeof(dummy)); /* NOTE(review): result is not checked before the memcpy below */
+			memcpy(fp->insn, dummy, sizeof(dummy));
+			fp->insn_len = sizeof(dummy) / sizeof(dummy[0]);
+		}
+		update = TRUE;
+
+		fp->prog_size = (fp->insn_len * 4 + 63) & ~63;
+		/* prog_size is insn_len * 4 rounded up to a multiple of 64; small programs share one bo of min_size bytes */
+		if(fp->prog_size >= min_size)
+			fp->progs_per_bo = 1;
+		else
+			fp->progs_per_bo = min_size / fp->prog_size;
+		fp->bo_prog_idx = fp->progs_per_bo - 1; /* last slot, so the next update wraps around to slot 0 */
+	}
+
+	/* we must update constants even on "just" fragprog changes, because
+	   we don't check whether the current constant buffer matches the latest
+	   one bound to this fragment program */
+	if (nvfx->dirty & (NVFX_NEW_FRAGCONST | NVFX_NEW_FRAGPROG))
+		update = TRUE;
+
+	if(update) {
+		int offset;
+		/* every update moves to the next program copy; once the slots run out, switch to another buffer */
+		++fp->bo_prog_idx;
+		if(fp->bo_prog_idx >= fp->progs_per_bo)
+		{
+			if(fp->fpbo && !nouveau_bo_busy(fp->fpbo->next->bo, NOUVEAU_BO_WR))
+			{
+				fp->fpbo = fp->fpbo->next; /* the next buffer in the list is not busy: reuse it */
+			}
+			else
+			{
+				struct nvfx_fragment_program_bo* fpbo = os_malloc_aligned(sizeof(struct nvfx_fragment_program) + fp->prog_size * fp->progs_per_bo, 16);
+				char *map, *buf;
+				/* NOTE(review): fpbo is used unchecked; link it into the circular list after the current entry */
+				if(fp->fpbo)
+				{
+					fpbo->next = fp->fpbo->next;
+					fp->fpbo->next = fpbo;
+				}
+				else
+					fpbo->next = fpbo;
+				fp->fpbo = fpbo;
+				fpbo->bo = 0;
+				nouveau_bo_new(nvfx->screen->base.device, NOUVEAU_BO_VRAM | NOUVEAU_BO_MAP, 64, fp->prog_size * fp->progs_per_bo, &fpbo->bo);
+				nouveau_bo_map(fpbo->bo, NOUVEAU_BO_NOSYNC);
+				/* fill every slot with the program, both in the mapped bo and in the fpbo->insn copy */
+				map = fpbo->bo->map;
+				buf = fpbo->insn;
+				for(int i = 0; i < fp->progs_per_bo; ++i)
+				{
+					memcpy(buf, fp->insn, fp->insn_len * 4);
+					nvfx_fp_memcpy(map, fp->insn, fp->insn_len * 4);
+					map += fp->prog_size;
+					buf += fp->prog_size;
+				}
+			}
+			fp->bo_prog_idx = 0;
+		}
+
+		offset = fp->bo_prog_idx * fp->prog_size;
+		/* patch the constants referenced by the program into the selected copy */
+		if(nvfx->constbuf[PIPE_SHADER_FRAGMENT]) {
+			struct pipe_resource* constbuf = nvfx->constbuf[PIPE_SHADER_FRAGMENT];
+			// TODO: avoid using transfers, just directly the buffer
+			struct pipe_transfer* transfer;
+			// TODO: does this check make any sense, or should we do this unconditionally?
+			uint32_t* map = pipe_buffer_map(&nvfx->pipe, constbuf, PIPE_TRANSFER_READ, &transfer);
+			uint32_t* fpmap = (uint32_t*)((char*)fp->fpbo->bo->map + offset);
+			uint32_t* buf = (uint32_t*)((char*)fp->fpbo->insn + offset);
+			for (i = 0; i < fp->nr_consts; ++i) {
+				unsigned off = fp->consts[i].offset;
+				unsigned idx = fp->consts[i].index * 4;
+
+				/* TODO: is checking a good idea? */
+				if(memcmp(&buf[off], &map[idx], 4 * sizeof(uint32_t))) {
+					memcpy(&buf[off], &map[idx], 4 * sizeof(uint32_t));
+					nvfx_fp_memcpy(&fpmap[off], &map[idx], 4 * sizeof(uint32_t));
+				}
+			}
+			pipe_buffer_unmap(&nvfx->pipe, constbuf, transfer);
+		}
+	}
+
+	if(update || (nvfx->dirty & NVFX_NEW_FRAGPROG)) {
+		int offset = fp->bo_prog_idx * fp->prog_size;
+		MARK_RING(chan, 8, 1);
+		OUT_RING(chan, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1));
+		OUT_RELOC(chan, fp->fpbo->bo, offset, NOUVEAU_BO_VRAM |
+			      NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
+			      NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0,
+			      NV34TCL_FP_ACTIVE_PROGRAM_DMA1);
+		OUT_RING(chan, RING_3D(NV34TCL_FP_CONTROL, 1));
+		OUT_RING(chan, fp->fp_control);
+		if(!nvfx->is_nv4x) { /* these two methods are only emitted when is_nv4x is not set */
+			OUT_RING(chan, RING_3D(NV34TCL_FP_REG_CONTROL, 1));
+			OUT_RING(chan, (1<<16)|0x4);
+			OUT_RING(chan, RING_3D(NV34TCL_TX_UNITS_ENABLE, 1));
+			OUT_RING(chan, fp->samplers);
+		}
+	}
+}
+/* Emit relocation entries for the bo that holds the active fragment program. */
+void
+nvfx_fragprog_relocate(struct nvfx_context *nvfx)
+{
+	struct nvfx_fragment_program *prog = nvfx->fragprog;
+	struct nouveau_channel *channel = nvfx->screen->base.channel;
+	struct nouveau_bo *prog_bo = prog->fpbo->bo;
+	/* byte offset of the program copy currently selected inside the bo */
+	int prog_offset = prog->bo_prog_idx * prog->prog_size;
+	// TODO: GART?
+	unsigned reloc_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY;
+	MARK_RING(channel, 2, 2);
+	OUT_RELOC(channel, prog_bo, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1), reloc_flags, 0, 0);
+	OUT_RELOC(channel, prog_bo, prog_offset, reloc_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+		  NV34TCL_FP_ACTIVE_PROGRAM_DMA0, NV34TCL_FP_ACTIVE_PROGRAM_DMA1);
+}
+/* Release every buffer object in fp's circular fpbo list, then the translated instruction array. */
+void
+nvfx_fragprog_destroy(struct nvfx_context *nvfx,
+		      struct nvfx_fragment_program *fp)
+{
+	struct nvfx_fragment_program_bo* fpbo = fp->fpbo;
+	if(fpbo)
+	{
+		do
+		{
+			struct nvfx_fragment_program_bo* next = fpbo->next;
+			nouveau_bo_unmap(fpbo->bo);
+			nouveau_bo_ref(0, &fpbo->bo);
+			/* entries come from os_malloc_aligned, so they need the matching release call */
+			os_free_aligned(fpbo);
+			fpbo = next;
+		}
+		while(fpbo != fp->fpbo);
+	}
+	if (fp->insn_len)
+		FREE(fp->insn);
+}
+
diff --git a/src/gallium/drivers/nvfx/nvfx_fragtex.c b/src/gallium/drivers/nvfx/nvfx_fragtex.c
new file mode 100644
index 0000000000..0b4a434fec
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_fragtex.c
@@ -0,0 +1,58 @@
+#include "nvfx_context.h"
+#include "nvfx_resource.h"
+/* Bring hardware texture state up to date for every unit flagged in nvfx->dirty_samplers. */
+void
+nvfx_fragtex_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	unsigned pending = nvfx->dirty_samplers;
+	unsigned unit;
+
+	/* nothing flagged: leave without touching any state */
+	if(pending == 0)
+		return;
+
+	/* visit the flagged units from the lowest bit upwards */
+	for (; pending != 0; pending &= ~(1 << unit)) {
+		unit = ffs(pending) - 1;
+		if(!nvfx->fragment_sampler_views[unit] || !nvfx->tex_sampler[unit]) {
+			/* view or sampler state missing: switch the unit off */
+			WAIT_RING(chan, 2);
+			/* this is OK for nv40 too */
+			OUT_RING(chan, RING_3D(NV34TCL_TX_ENABLE(unit), 1));
+			OUT_RING(chan, 0);
+			nvfx->hw_samplers &= ~(1 << unit);
+		} else if(nvfx->is_nv4x) {
+			nv40_fragtex_set(nvfx, unit);
+		} else {
+			nv30_fragtex_set(nvfx, unit);
+		}
+	}
+	nvfx->dirty_samplers = 0;
+}
+/* Emit relocation entries for the texture bo of every unit set in nvfx->hw_samplers. */
+void
+nvfx_fragtex_relocate(struct nvfx_context *nvfx)
+{
+	const unsigned tex_flags =
+		NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	unsigned remaining;
+
+	/* peel the lowest set bit off the mask on each pass */
+	for (remaining = nvfx->hw_samplers; remaining != 0; ) {
+		const unsigned unit = ffs(remaining) - 1;
+		struct nvfx_miptree* mt =
+			(struct nvfx_miptree*)nvfx->fragment_sampler_views[unit]->texture;
+		struct nouveau_bo *bo = mt->base.bo;
+
+		remaining &= ~(1 << unit);
+
+		/* three relocations per unit, all against the same bo */
+		MARK_RING(chan, 3, 3);
+		OUT_RELOC(chan, bo, RING_3D(NV34TCL_TX_OFFSET(unit), 2), tex_flags | NOUVEAU_BO_DUMMY, 0, 0);
+		OUT_RELOC(chan, bo, 0, tex_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_DUMMY, 0, 0);
+		OUT_RELOC(chan, bo, nvfx->hw_txf[unit], tex_flags | NOUVEAU_BO_OR | NOUVEAU_BO_DUMMY,
+				NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
+	}
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_miptree.c b/src/gallium/drivers/nvfx/nvfx_miptree.c
new file mode 100644
index 0000000000..97b2e5e8b6
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_miptree.c
@@ -0,0 +1,310 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+
+#include "nvfx_context.h"
+#include "nvfx_resource.h"
+#include "nvfx_transfer.h"
+#include "nv04_surface_2d.h"
+
+/* Currently using separate implementations for buffers and textures,
+ * even though gallium has a unified abstraction of these objects.
+ * Eventually these should be combined, and mechanisms like transfers
+ * be adapted to work for both buffer and texture uploads.
+ */
+/* Compute the pitch of every mip level, the offset of every face/slice image, and mt->total_size. */
+static void
+nvfx_miptree_layout(struct nvfx_miptree *mt)
+{
+	struct pipe_resource *pt = &mt->base.base;
+	uint width = pt->width0;
+	uint offset = 0;
+	int nr_faces, l, f;
+	uint wide_pitch = pt->bind & (PIPE_BIND_SAMPLER_VIEW |
+				      PIPE_BIND_DEPTH_STENCIL |
+				      PIPE_BIND_RENDER_TARGET |
+				      PIPE_BIND_DISPLAY_TARGET |
+				      PIPE_BIND_SCANOUT);
+	/* number of images per mip level: six cube faces, depth0 3D slices, or one */
+	if (pt->target == PIPE_TEXTURE_CUBE) {
+		nr_faces = 6;
+	} else
+	if (pt->target == PIPE_TEXTURE_3D) {
+		nr_faces = pt->depth0;
+	} else {
+		nr_faces = 1;
+	}
+
+	for (l = 0; l <= pt->last_level; l++) {
+		if (wide_pitch && (pt->flags & NVFX_RESOURCE_FLAG_LINEAR)) /* linear case: every level uses the level-0 stride, aligned to 64 */
+			mt->level[l].pitch = align(util_format_get_stride(pt->format, pt->width0), 64);
+		else
+			mt->level[l].pitch = util_format_get_stride(pt->format, width);
+		/* one offset slot per face/slice, filled in by the loop below; NOTE(review): allocation is not checked */
+		mt->level[l].image_offset =
+			CALLOC(nr_faces, sizeof(unsigned));
+
+		width  = u_minify(width, 1);
+	}
+	/* offsets are assigned face by face, walking all levels of one face before the next */
+	for (f = 0; f < nr_faces; f++) {
+		for (l = 0; l < pt->last_level; l++) {
+			mt->level[l].image_offset[f] = offset;
+
+			if (!(pt->flags & NVFX_RESOURCE_FLAG_LINEAR) &&
+			    u_minify(pt->width0, l + 1) > 1 && u_minify(pt->height0, l + 1) > 1)
+				offset += align(mt->level[l].pitch * u_minify(pt->height0, l), 64);
+			else
+				offset += mt->level[l].pitch * u_minify(pt->height0, l);
+		}
+		/* l == last_level here: the final level is added without the align(..., 64) used above */
+		mt->level[l].image_offset[f] = offset;
+		offset += mt->level[l].pitch * u_minify(pt->height0, l);
+	}
+
+	mt->total_size = offset;
+}
+/* Export the miptree's bo through whandle with the level-0 pitch; FALSE when there is no miptree or no bo. */
+static boolean
+nvfx_miptree_get_handle(struct pipe_screen *pscreen,
+			struct pipe_resource *ptexture,
+			struct winsys_handle *whandle)
+{
+	struct nvfx_miptree* tree = (struct nvfx_miptree*)ptexture;
+	struct nouveau_bo *bo;
+
+	if (tree == NULL)
+		return FALSE;
+	bo = tree->base.bo;
+	if (bo == NULL)
+		return FALSE;
+	return nouveau_screen_bo_get_handle(pscreen, bo, tree->level[0].pitch, whandle);
+}
+
+/* Release the miptree's bo, free the per-level offset tables, then free the miptree itself. */
+static void
+nvfx_miptree_destroy(struct pipe_screen *screen, struct pipe_resource *pt)
+{
+	struct nvfx_miptree *tree = (struct nvfx_miptree *)pt;
+	uint *offsets;
+	int level = 0;
+
+	nouveau_screen_bo_release(screen, tree->base.bo);
+	while (level <= pt->last_level) {
+		offsets = tree->level[level++].image_offset;
+		if (offsets != NULL)
+			FREE(offsets);
+	}
+	FREE(tree);
+}
+
+
+
+/* Resource vtbl for miptree (texture) resources; assigned to mt->base.vtbl when a miptree is created. */
+struct u_resource_vtbl nvfx_miptree_vtbl = 
+{
+   nvfx_miptree_get_handle,	      /* get_handle */
+   nvfx_miptree_destroy,	      /* resource_destroy */
+   NULL,			      /* is_resource_referenced */
+   nvfx_miptree_transfer_new,	      /* get_transfer */
+   nvfx_miptree_transfer_del,     /* transfer_destroy */
+   nvfx_miptree_transfer_map,	      /* transfer_map */
+   u_default_transfer_flush_region,   /* transfer_flush_region */
+   nvfx_miptree_transfer_unmap,	      /* transfer_unmap */
+   u_default_transfer_inline_write    /* transfer_inline_write */
+};
+
+
+/* Create a texture from template pt: pick a linear or swizzled layout, lay out the mip levels and allocate the bo. Returns NULL on failure. */
+struct pipe_resource *
+nvfx_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *pt)
+{
+	struct nvfx_miptree *mt;
+	static int no_swizzle = -1;
+	if(no_swizzle < 0)
+		no_swizzle = debug_get_bool_option("NOUVEAU_NO_SWIZZLE", FALSE);
+	mt = CALLOC_STRUCT(nvfx_miptree);
+	if (!mt)
+		return NULL;
+
+	mt->base.base = *pt;
+	mt->base.vtbl = &nvfx_miptree_vtbl;
+	pipe_reference_init(&mt->base.base.reference, 1);
+	mt->base.base.screen = pscreen;
+
+	/* Swizzled textures must be POT */
+	if (pt->width0 & (pt->width0 - 1) ||
+	    pt->height0 & (pt->height0 - 1))
+		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+	else
+	if (pt->bind & (PIPE_BIND_SCANOUT |
+			PIPE_BIND_DISPLAY_TARGET |
+			PIPE_BIND_DEPTH_STENCIL))
+		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+	else
+	if (pt->_usage == PIPE_USAGE_DYNAMIC)
+		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+	else {
+		switch (pt->format) {
+		case PIPE_FORMAT_B5G6R5_UNORM:
+		case PIPE_FORMAT_L8A8_UNORM:
+		case PIPE_FORMAT_A8_UNORM:
+		case PIPE_FORMAT_L8_UNORM:
+		case PIPE_FORMAT_I8_UNORM:
+			/* TODO: we can actually swizzle these formats on nv40, we
+				are just preserving the pre-unification behavior.
+				The whole 2D code is going to be rewritten anyway. */
+			if(nvfx_screen(pscreen)->is_nv4x) {
+				mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+				break;
+			}
+		/* fallthrough when not nv4x. TODO: Figure out which formats can be swizzled */
+		case PIPE_FORMAT_B8G8R8A8_UNORM:
+		case PIPE_FORMAT_B8G8R8X8_UNORM:
+		case PIPE_FORMAT_R16_SNORM:
+		{
+			if (no_swizzle)
+				mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+			break;
+		}
+		default:
+			mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+		}
+	}
+
+	/* apparently we can't render to swizzled surfaces smaller than 64 bytes, so make them linear.
+	 * If the user did not ask for a render target, they can still render to it, but it will cost them an extra copy.
+	 * This also happens for small mipmaps of large textures. */
+	if (pt->bind & PIPE_BIND_RENDER_TARGET &&
+	    util_format_get_stride(pt->format, pt->width0) < 64)
+		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+	nvfx_miptree_layout(mt);
+	mt->base.bo = nouveau_screen_bo_new(pscreen, 256, pt->_usage, pt->bind, mt->total_size);
+	if (!mt->base.bo) {
+		/* the layout pass allocated one offset table per level; release those as well */
+		for (int l = 0; l <= pt->last_level; l++)
+			if (mt->level[l].image_offset)
+				FREE(mt->level[l].image_offset);
+		FREE(mt);
+		return NULL;
+	}
+	return &mt->base.base;
+}
+
+
+
+/* Wrap an existing winsys buffer as a linear, single-level 2D texture. Returns NULL for unsupported templates or on failure. */
+struct pipe_resource *
+nvfx_miptree_from_handle(struct pipe_screen *pscreen,
+			 const struct pipe_resource *template,
+			 struct winsys_handle *whandle)
+{
+	struct nvfx_miptree *mt;
+	unsigned stride;
+	/* Only supports 2D, non-mipmapped textures for the moment */
+	if (template->target != PIPE_TEXTURE_2D ||
+	    template->last_level != 0 ||
+	    template->depth0 != 1)
+		return NULL;
+
+	mt = CALLOC_STRUCT(nvfx_miptree);
+	if (!mt)
+		return NULL;
+	/* allocate the offset table first so the resource is never handed out without one */
+	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned));
+	if (!mt->level[0].image_offset) {
+		FREE(mt);
+		return NULL;
+	}
+	mt->base.bo = nouveau_screen_bo_from_handle(pscreen, whandle, &stride);
+	if (mt->base.bo == NULL) {
+		FREE(mt->level[0].image_offset);
+		FREE(mt);
+		return NULL;
+	}
+	mt->base.base = *template;
+	mt->base.vtbl = &nvfx_miptree_vtbl;
+	pipe_reference_init(&mt->base.base.reference, 1);
+	mt->base.base.screen = pscreen;
+	mt->level[0].pitch = stride;
+	/* Assume whoever created this buffer expects it to be linear for now */
+	mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+	/* XXX: Need to adjust bo refcount?? */
+	/* nouveau_bo_ref(bo, &mt->base.bo); */
+	return &mt->base.base;
+}
+
+
+
+
+/* Surface helpers, not strictly required to implement the resource vtbl:
+ */
+/* Create a surface for one face/level/zslice of texture pt, taking a reference on pt. Returns NULL if the allocation fails. */
+struct pipe_surface *
+nvfx_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_resource *pt,
+			 unsigned face, unsigned level, unsigned zslice,
+			 unsigned flags)
+{
+	struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
+	struct nv04_surface *ns;
+
+	ns = CALLOC_STRUCT(nv04_surface);
+	if (!ns)
+		return NULL;
+	pipe_resource_reference(&ns->base.texture, pt);
+	ns->base.format = pt->format;
+	ns->base.width = u_minify(pt->width0, level);
+	ns->base.height = u_minify(pt->height0, level);
+	ns->base.usage = flags;
+	pipe_reference_init(&ns->base.reference, 1);
+	ns->base.face = face;
+	ns->base.level = level;
+	ns->base.zslice = zslice;
+	ns->pitch = mt->level[level].pitch;
+	/* pick the image within the level: by cube face, by 3D slice, or the only image */
+	if (pt->target == PIPE_TEXTURE_CUBE) {
+		ns->base.offset = mt->level[level].image_offset[face];
+	} else
+	if (pt->target == PIPE_TEXTURE_3D) {
+		ns->base.offset = mt->level[level].image_offset[zslice];
+	} else {
+		ns->base.offset = mt->level[level].image_offset[0];
+	}
+
+	/* create a linear temporary that we can render into if
+	 * necessary.
+	 *
+	 * Note that ns->pitch is always a multiple of 64 for linear
+	 * surfaces and swizzled surfaces are POT, so ns->pitch & 63
+	 * is equivalent to (ns->pitch < 64 && swizzled)
+	 */
+
+	if ((ns->pitch & 63) && 
+	    (ns->base.usage & PIPE_BIND_RENDER_TARGET))
+	{
+		struct nv04_surface_2d* eng2d  =
+			((struct nvfx_screen*)pscreen)->eng2d;
+
+		ns = nv04_surface_wrap_for_render(pscreen, eng2d, ns); /* the surface returned here replaces ns as the result */
+	}
+
+	return &ns->base;
+}
+/* Destroy surface ps; a backing surface, if present, gets an eng2d copy when flagged as blit destination and is destroyed first. */
+void
+nvfx_miptree_surface_del(struct pipe_surface *ps)
+{
+	struct nv04_surface* wrapper = (struct nv04_surface*)ps;
+	struct nv04_surface* backing = wrapper->backing;
+	if(backing != NULL) {
+		struct nvfx_screen* screen = (struct nvfx_screen*)ps->texture->screen;
+		if((backing->base.usage & PIPE_BIND_BLIT_DESTINATION) != 0)
+			screen->eng2d->copy(screen->eng2d, &backing->base, 0, 0, ps, 0, 0, wrapper->base.width, wrapper->base.height);
+		nvfx_miptree_surface_del(&backing->base);
+	}
+
+	pipe_resource_reference(&ps->texture, NULL);
+	FREE(ps);
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_query.c b/src/gallium/drivers/nvfx/nvfx_query.c
new file mode 100644
index 0000000000..1b20b5245d
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_query.c
@@ -0,0 +1,137 @@
+#include "pipe/p_context.h"
+
+#include "nvfx_context.h"
+/* Driver-side state of one query; pipe_query handles are cast to and from this type. */
+struct nvfx_query {
+	struct list_head list; /* link in the screen's query_list while the query holds an object */
+	struct nouveau_resource *object; /* allocation from the screen's query_heap; object->start is the notifier index */
+	unsigned type; /* pipe query type given at creation */
+	boolean ready; /* TRUE once result has been read back */
+	uint64_t result; /* cached value returned by get_query_result */
+};
+/* Cast a generic pipe_query handle to the driver's struct nvfx_query. */
+static INLINE struct nvfx_query *
+nvfx_query(struct pipe_query *pipe)
+{
+	return (struct nvfx_query *)pipe;
+}
+/* Allocate a query object; only PIPE_QUERY_OCCLUSION_COUNTER is supported. Returns NULL when out of memory. */
+static struct pipe_query *
+nvfx_query_create(struct pipe_context *pipe, unsigned query_type)
+{
+	struct nvfx_query *q;
+
+	assert(query_type == PIPE_QUERY_OCCLUSION_COUNTER);
+	q = CALLOC(1, sizeof(struct nvfx_query));
+	if (!q)
+		return NULL; /* never dereference a failed allocation */
+	q->type = query_type;
+	return (struct pipe_query *)q;
+}
+/* Free a query; if it still holds a nouveau_resource, release that and unlink the query from its list first. */
+static void
+nvfx_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nvfx_query *query = (struct nvfx_query *)pq;
+
+	if (query->object != NULL) {
+		nouveau_resource_free(&query->object);
+		LIST_DEL(&query->list);
+	}
+
+	FREE(query);
+}
+/* Start a query: obtain an object from the query heap, reset its notifier and emit the query reset methods. */
+static void
+nvfx_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_query *q = nvfx_query(pq);
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	uint64_t tmp;
+
+	/* Happens when end_query() is called, then another begin_query()
+	 * without querying the result in-between.  For now we'll wait for
+	 * the existing query to notify completion, but it could be better.
+	 */
+	if (q->object)
+		pipe->get_query_result(pipe, pq, 1, &tmp);
+	/* while the allocation call returns non-zero, wait on the query at the head of query_list and retry */
+	while (nouveau_resource_alloc(nvfx->screen->query_heap, 1, NULL, &q->object))
+	{
+		struct nvfx_query* oldestq;
+		assert(!LIST_IS_EMPTY(&nvfx->screen->query_list));
+		oldestq = LIST_ENTRY(struct nvfx_query, nvfx->screen->query_list.next, list);
+		pipe->get_query_result(pipe, (struct pipe_query*)oldestq, 1, &tmp);
+	}
+
+	LIST_ADDTAIL(&q->list, &nvfx->screen->query_list);
+
+	nouveau_notifier_reset(nvfx->screen->query, q->object->start);
+
+	BEGIN_RING(chan, eng3d, NV34TCL_QUERY_RESET, 1);
+	OUT_RING  (chan, 1);
+	BEGIN_RING(chan, eng3d, NV34TCL_QUERY_UNK17CC, 1);
+	OUT_RING  (chan, 1);
+
+	q->ready = FALSE;
+}
+/* End a query: emit QUERY_GET for the query's notifier slot and fire the ring. */
+static void
+nvfx_query_end(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	struct nvfx_query *q = nvfx_query(pq);
+	/* the offset field is the object's start index times 32 */
+	BEGIN_RING(chan, eng3d, NV34TCL_QUERY_GET, 1);
+	OUT_RING  (chan, (0x01 << NV34TCL_QUERY_GET_UNK24_SHIFT) |
+		   ((q->object->start * 32) << NV34TCL_QUERY_GET_OFFSET_SHIFT));
+	FIRE_RING(chan);
+}
+/* Return a query's value in *result; FALSE only if it is not available yet and wait is FALSE. */
+static boolean
+nvfx_query_result(struct pipe_context *pipe, struct pipe_query *pq,
+		  boolean wait, uint64_t *result)
+{
+	struct nvfx_screen *screen = nvfx_context(pipe)->screen;
+	struct nvfx_query *query = (struct nvfx_query *)pq;
+
+	/* a query that already completed just hands back its cached value */
+	if (query->ready) {
+		*result = query->result;
+		return TRUE;
+	}
+
+	if (nouveau_notifier_status(screen->query, query->object->start) !=
+	    NV_NOTIFY_STATE_STATUS_COMPLETED) {
+		if (!wait)
+			return FALSE;
+
+		nouveau_notifier_wait_status(screen->query, query->object->start,
+					     NV_NOTIFY_STATE_STATUS_COMPLETED, 0);
+	}
+
+	/* latch the value, then release the object and leave the query list */
+	query->result = nouveau_notifier_return_val(screen->query, query->object->start);
+	query->ready = TRUE;
+	nouveau_resource_free(&query->object);
+	LIST_DEL(&query->list);
+
+	*result = query->result;
+	return TRUE;
+}
+/* Hook the query entry points into the context, listed in the order a query goes through them. */
+void
+nvfx_init_query_functions(struct nvfx_context *nvfx)
+{
+	nvfx->pipe.create_query = nvfx_query_create;
+	nvfx->pipe.begin_query = nvfx_query_begin;
+	nvfx->pipe.end_query = nvfx_query_end;
+	nvfx->pipe.get_query_result = nvfx_query_result;
+	nvfx->pipe.destroy_query = nvfx_query_destroy;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_resource.c b/src/gallium/drivers/nvfx/nvfx_resource.c
new file mode 100644
index 0000000000..10cdeed2a3
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_resource.c
@@ -0,0 +1,67 @@
+
+#include "pipe/p_context.h"
+#include "nvfx_resource.h"
+#include "nouveau/nouveau_screen.h"
+
+/* This doesn't look quite right - this query is supposed to ask
+ * whether the particular context has references to the resource in
+ * any unflushed rendering command buffer, and hence requires a
+ * pipe->flush() for serializing some modification to that resource.
+ *
+ * This seems to be answering the question of whether the resource is
+ * currently on hardware.
+ */
+static unsigned int
+nvfx_resource_is_referenced(struct pipe_context *pipe,
+			    struct pipe_resource *resource,
+			    unsigned face, unsigned level)
+{
+	struct nouveau_bo *bo = ((struct nvfx_resource *)resource)->bo;
+	return nouveau_reference_flags(bo);
+}
+/* Create a resource from a template, dispatching on whether it describes a buffer or a texture. */
+static struct pipe_resource *
+nvfx_resource_create(struct pipe_screen *screen,
+		     const struct pipe_resource *template)
+{
+	/* every target other than PIPE_BUFFER is created as a miptree */
+	if (template->target != PIPE_BUFFER)
+		return nvfx_miptree_create(screen, template);
+	return nvfx_buffer_create(screen, template);
+}
+/* Import a resource from a winsys handle; PIPE_BUFFER templates are rejected with NULL. */
+static struct pipe_resource *
+nvfx_resource_from_handle(struct pipe_screen * screen,
+			  const struct pipe_resource *template,
+			  struct winsys_handle *whandle)
+{
+	if (template->target != PIPE_BUFFER)
+		return nvfx_miptree_from_handle(screen, template, whandle);
+	/* buffers cannot be imported through this path */
+	return NULL;
+}
+/* Install the context-level entry points: the u_transfer vtbl trampolines plus is_resource_referenced. */
+void
+nvfx_init_resource_functions(struct pipe_context *pipe)
+{
+	pipe->is_resource_referenced = nvfx_resource_is_referenced;
+	pipe->get_transfer = u_get_transfer_vtbl;
+	pipe->transfer_destroy = u_transfer_destroy_vtbl;
+	pipe->transfer_map = u_transfer_map_vtbl;
+	pipe->transfer_unmap = u_transfer_unmap_vtbl;
+	pipe->transfer_flush_region = u_transfer_flush_region_vtbl;
+	pipe->transfer_inline_write = u_transfer_inline_write_vtbl;
+}
+/* Install the screen-level resource and texture-surface entry points. */
+void
+nvfx_screen_init_resource_functions(struct pipe_screen *pscreen)
+{
+	pscreen->resource_create = nvfx_resource_create;
+	pscreen->user_buffer_create = nvfx_user_buffer_create;
+	pscreen->resource_from_handle = nvfx_resource_from_handle;
+	pscreen->resource_get_handle = u_resource_get_handle_vtbl;
+	pscreen->resource_destroy = u_resource_destroy_vtbl;
+
+	pscreen->tex_surface_destroy = nvfx_miptree_surface_del;
+	pscreen->get_tex_surface = nvfx_miptree_surface_new;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_resource.h b/src/gallium/drivers/nvfx/nvfx_resource.h
new file mode 100644
index 0000000000..a68c14cf3f
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_resource.h
@@ -0,0 +1,91 @@
+
+#ifndef NVFX_RESOURCE_H
+#define NVFX_RESOURCE_H
+
+#include "util/u_transfer.h"
+
+struct pipe_resource;
+struct nouveau_bo;
+
+
+/* This gets further specialized into either buffer or texture
+ * structures.  In the future we'll want to remove much of that
+ * distinction, but for now try to keep as close to the existing code
+ * as possible and use the vtbl struct to choose between the two
+ * underlying implementations.
+ */
+struct nvfx_resource {
+	struct pipe_resource base; /* first member, so a pipe_resource pointer can be cast to nvfx_resource */
+	struct u_resource_vtbl *vtbl; /* selects the buffer or texture implementation */
+	struct nouveau_bo *bo; /* buffer object holding the resource's storage */
+};
+
+#define NVFX_MAX_TEXTURE_LEVELS  16
+/* Texture resource: an nvfx_resource plus the layout of its mip levels. */
+struct nvfx_miptree {
+	struct nvfx_resource base;
+	uint total_size; /* final running offset after laying out all images */
+	/* per mip level: the pitch, and an array of offsets indexed by cube face or 3D slice */
+	struct {
+		uint pitch;
+		uint *image_offset;
+	} level[NVFX_MAX_TEXTURE_LEVELS];
+
+	unsigned image_nr; /* NOTE(review): not referenced by the miptree code visible in this patch section */
+};
+/* Cast a generic pipe_resource pointer to the driver's struct nvfx_resource. */
+static INLINE 
+struct nvfx_resource *nvfx_resource(struct pipe_resource *resource)
+{
+	return (struct nvfx_resource *)resource;
+}
+/* Return the buffer object of the texture that a surface refers to. */
+static INLINE struct nouveau_bo *
+nvfx_surface_buffer(struct pipe_surface *surf)
+{
+	/* surf->texture is treated as a struct nvfx_resource */
+	struct nvfx_resource *res = (struct nvfx_resource *)surf->texture;
+	return res->bo;
+}
+
+/* Install the context-level transfer and is_resource_referenced entry points. */
+void
+nvfx_init_resource_functions(struct pipe_context *pipe);
+/* Install the screen-level resource and texture-surface entry points. */
+void
+nvfx_screen_init_resource_functions(struct pipe_screen *pscreen);
+
+
+/* Internal:
+ */
+/* Texture (miptree) creation, either from a template or from an existing winsys handle. */
+struct pipe_resource *
+nvfx_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *pt);
+
+struct pipe_resource *
+nvfx_miptree_from_handle(struct pipe_screen *pscreen,
+			 const struct pipe_resource *template,
+			 struct winsys_handle *whandle);
+/* Buffer creation; these two are defined outside the files shown in this section. */
+struct pipe_resource *
+nvfx_buffer_create(struct pipe_screen *pscreen,
+		   const struct pipe_resource *template);
+
+struct pipe_resource *
+nvfx_user_buffer_create(struct pipe_screen *screen,
+			void *ptr,
+			unsigned bytes,
+			unsigned usage);
+
+
+/* Texture surface helpers, installed as tex_surface_destroy and get_tex_surface. */
+void
+nvfx_miptree_surface_del(struct pipe_surface *ps);
+
+struct pipe_surface *
+nvfx_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_resource *pt,
+			 unsigned face, unsigned level, unsigned zslice,
+			 unsigned flags);
+
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_screen.c b/src/gallium/drivers/nvfx/nvfx_screen.c
new file mode 100644
index 0000000000..9f03ab1833
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_screen.c
@@ -0,0 +1,466 @@
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "util/u_simple_screen.h"
+
+#include "nouveau/nouveau_screen.h"
+
+#include "nvfx_context.h"
+#include "nvfx_screen.h"
+#include "nvfx_resource.h"
+
+#define NV30TCL_CHIPSET_3X_MASK 0x00000003
+#define NV34TCL_CHIPSET_3X_MASK 0x00000010
+#define NV35TCL_CHIPSET_3X_MASK 0x000001e0
+
+/* FIXME: It seems I should not include directly ../../winsys/drm/nouveau/drm/nouveau_drm_api.h
+* to get the pointer to the context front buffer, so I copied nouveau_winsys here.
+* nvfx_screen_surface_format_supported() can then use it to enforce creating fbo
+* with same number of bits everywhere.
+*/
+struct nouveau_winsys {
+	struct pipe_winsys base;
+	/* NOTE(review): this is a local copy of the winsys struct; its layout must match the real definition */
+	struct pipe_screen *pscreen;
+	/* front buffer surface; only its format is read in this file, and it may be NULL */
+	struct pipe_surface *front;
+};
+#define NV4X_GRCLASS4097_CHIPSETS 0x00000baf
+#define NV4X_GRCLASS4497_CHIPSETS 0x00005450
+#define NV6X_GRCLASS4497_CHIPSETS 0x00000088
+/* Report integer capabilities; several depend on is_nv4x. Unknown caps are logged and reported as 0. */
+static int
+nvfx_screen_get_param(struct pipe_screen *pscreen, int param)
+{
+	struct nvfx_screen *screen = nvfx_screen(pscreen);
+
+	switch (param) {
+	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+		/* TODO: check this */
+		return screen->is_nv4x ? 16 : 8;
+	case PIPE_CAP_NPOT_TEXTURES:
+		return !!screen->is_nv4x;
+	case PIPE_CAP_TWO_SIDED_STENCIL:
+		return 1;
+	case PIPE_CAP_GLSL:
+		return 0;
+	case PIPE_CAP_ANISOTROPIC_FILTER:
+		return 1;
+	case PIPE_CAP_POINT_SPRITE:
+		return 1;
+	case PIPE_CAP_MAX_RENDER_TARGETS:
+		return screen->is_nv4x ? 4 : 2;
+	case PIPE_CAP_OCCLUSION_QUERY:
+		return 1;
+	case PIPE_CAP_TEXTURE_SHADOW_MAP:
+		return 1;
+	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+		return 13;
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+		return 10;
+	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+		return 13;
+	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+		return !!screen->is_nv4x;
+	case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
+		return 1;
+	case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
+		return 0; /* We have 4 on nv40 - but unsupported currently */
+	case PIPE_CAP_TGSI_CONT_SUPPORTED:
+		return 0;
+	case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+		return !!screen->is_nv4x;
+	case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+		return 16;
+	case PIPE_CAP_INDEP_BLEND_ENABLE:
+		/* TODO: on nv40 we have separate color masks */
+		/* TODO: nv40 mrt blending is probably broken */
+		return 0;
+	case PIPE_CAP_INDEP_BLEND_FUNC:
+		return 0;
+	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+		return 1;
+	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+		return 0;
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0;
+	}
+}
+/* Report floating-point capability limits; unknown caps are logged and reported as 0.0. */
+static float
+nvfx_screen_get_paramf(struct pipe_screen *pscreen, int param)
+{
+	const int is_nv4x = nvfx_screen(pscreen)->is_nv4x != 0;
+	switch (param) {
+	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+		return is_nv4x ? 16.0 : 4.0;
+	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+		return is_nv4x ? 16.0 : 8.0;
+	case PIPE_CAP_MAX_POINT_WIDTH:
+	case PIPE_CAP_MAX_POINT_WIDTH_AA:
+		return 64.0;
+	case PIPE_CAP_MAX_LINE_WIDTH:
+	case PIPE_CAP_MAX_LINE_WIDTH_AA:
+		return 10.0;
+	default:
+		break;
+	}
+	NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+	return 0.0;
+}
+/* Report whether format is usable for the requested binding: render target, depth/stencil, or any other use. target and geom_flags are not consulted. */
+static boolean
+nvfx_screen_surface_format_supported(struct pipe_screen *pscreen,
+				     enum pipe_format format,
+				     enum pipe_texture_target target,
+				     unsigned tex_usage, unsigned geom_flags)
+{
+	struct nvfx_screen *screen = nvfx_screen(pscreen);
+	struct pipe_surface *front = ((struct nouveau_winsys *) pscreen->winsys)->front;
+
+	if (tex_usage & PIPE_BIND_RENDER_TARGET) {
+		switch (format) {
+		case PIPE_FORMAT_B8G8R8A8_UNORM:
+		case PIPE_FORMAT_B8G8R8X8_UNORM:
+		case PIPE_FORMAT_B5G6R5_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+	} else
+	if (tex_usage & PIPE_BIND_DEPTH_STENCIL) {
+		switch (format) {
+		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+		case PIPE_FORMAT_X8Z24_UNORM:
+			return TRUE;
+		case PIPE_FORMAT_Z16_UNORM:
+			/* TODO: this nv30 limitation probably does not exist */
+			if (!screen->is_nv4x && front)
+				return (front->format == PIPE_FORMAT_B5G6R5_UNORM);
+			return TRUE;
+		default:
+			break;
+		}
+	} else {
+		switch (format) {
+		case PIPE_FORMAT_B8G8R8A8_UNORM:
+		case PIPE_FORMAT_B8G8R8X8_UNORM:
+		case PIPE_FORMAT_B5G5R5A1_UNORM:
+		case PIPE_FORMAT_B4G4R4A4_UNORM:
+		case PIPE_FORMAT_B5G6R5_UNORM:
+		case PIPE_FORMAT_L8_UNORM:
+		case PIPE_FORMAT_A8_UNORM:
+		case PIPE_FORMAT_I8_UNORM:
+		case PIPE_FORMAT_L8A8_UNORM:
+		case PIPE_FORMAT_Z16_UNORM:
+		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+		case PIPE_FORMAT_DXT1_RGB:
+		case PIPE_FORMAT_DXT1_RGBA:
+		case PIPE_FORMAT_DXT3_RGBA:
+		case PIPE_FORMAT_DXT5_RGBA:
+			return TRUE;
+		/* TODO: does nv30 support this? */
+		case PIPE_FORMAT_R16_SNORM:
+			return !!screen->is_nv4x;
+		default:
+			break;
+		}
+	}
+
+	return FALSE; /* every format not accepted above is unsupported */
+}
+
+/* Tear down the screen: heaps, notifiers, the 3D object and the 2D engine, then the base screen, then the struct itself. */
+static void
+nvfx_screen_destroy(struct pipe_screen *pscreen)
+{
+	struct nvfx_screen *screen = nvfx_screen(pscreen);
+
+	nouveau_resource_destroy(&screen->vp_exec_heap);
+	nouveau_resource_destroy(&screen->vp_data_heap);
+	nouveau_resource_destroy(&screen->query_heap);
+	nouveau_notifier_free(&screen->query);
+	nouveau_notifier_free(&screen->sync);
+	nouveau_grobj_free(&screen->eng3d);
+	nv04_surface_2d_takedown(&screen->eng2d);
+
+	nouveau_screen_fini(&screen->base);
+
+	FREE(pscreen);
+}
+/* Push a fixed sequence of initial 3D state onto the screen's channel; most methods are raw offsets with no documented meaning here. */
+static void nv30_screen_init(struct nvfx_screen *screen)
+{
+	struct nouveau_channel *chan = screen->base.channel;
+	int i;
+
+	/* TODO: perhaps we should do some of this on nv40 too? */
+	for (i=1; i<8; i++) {
+		OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_CLIP_HORIZ(i), 1));
+		OUT_RING(chan, 0);
+		OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_CLIP_VERT(i), 1));
+		OUT_RING(chan, 0);
+	}
+
+	OUT_RING(chan, RING_3D(0x220, 1));
+	OUT_RING(chan, 1);
+
+	OUT_RING(chan, RING_3D(0x03b0, 1));
+	OUT_RING(chan, 0x00100000);
+	OUT_RING(chan, RING_3D(0x1454, 1));
+	OUT_RING(chan, 0);
+	OUT_RING(chan, RING_3D(0x1d80, 1));
+	OUT_RING(chan, 3);
+	OUT_RING(chan, RING_3D(0x1450, 1));
+	OUT_RING(chan, 0x00030004);
+
+	/* NEW */
+	OUT_RING(chan, RING_3D(0x1e98, 1));
+	OUT_RING(chan, 0);
+	OUT_RING(chan, RING_3D(0x17e0, 3));
+	OUT_RING(chan, fui(0.0));
+	OUT_RING(chan, fui(0.0));
+	OUT_RING(chan, fui(1.0));
+	OUT_RING(chan, RING_3D(0x1f80, 16));
+	for (i=0; i<16; i++) { /* sixteen words: 0x0000ffff at index 8, zero elsewhere */
+		OUT_RING(chan, (i==8) ? 0x0000ffff : 0);
+	}
+
+	OUT_RING(chan, RING_3D(0x120, 3));
+	OUT_RING(chan, 0);
+	OUT_RING(chan, 1);
+	OUT_RING(chan, 2);
+
+	OUT_RING(chan, RING_3D(0x1d88, 1));
+	OUT_RING(chan, 0x00001200);
+
+	OUT_RING(chan, RING_3D(NV34TCL_RC_ENABLE, 1));
+	OUT_RING(chan, 0);
+
+	OUT_RING(chan, RING_3D(NV34TCL_DEPTH_RANGE_NEAR, 2));
+	OUT_RING(chan, fui(0.0));
+	OUT_RING(chan, fui(1.0));
+
+	OUT_RING(chan, RING_3D(NV34TCL_MULTISAMPLE_CONTROL, 1));
+	OUT_RING(chan, 0xffff0000);
+
+	/* enables use of vp rather than fixed-function somehow */
+	OUT_RING(chan, RING_3D(0x1e94, 1));
+	OUT_RING(chan, 0x13);
+}
+
+static void nv40_screen_init(struct nvfx_screen *screen)
+{
+	struct nouveau_channel *chan = screen->base.channel;
+	/* Static 3D state for the NV4x path; nvfx_screen_create() calls this when is_nv4x is set. */
+	OUT_RING(chan, RING_3D(NV40TCL_DMA_COLOR2, 2));
+	OUT_RING(chan, screen->base.channel->vram->handle); /* both words carry the VRAM handle */
+	OUT_RING(chan, screen->base.channel->vram->handle);
+
+	OUT_RING(chan, RING_3D(0x1ea4, 3)); /* NOTE(review): unnamed method; values kept verbatim */
+	OUT_RING(chan, 0x00000010);
+	OUT_RING(chan, 0x01000100);
+	OUT_RING(chan, 0xff800006);
+
+	/* vtxprog output routing */
+	OUT_RING(chan, RING_3D(0x1fc4, 1));
+	OUT_RING(chan, 0x06144321);
+	OUT_RING(chan, RING_3D(0x1fc8, 2));
+	OUT_RING(chan, 0xedcba987);
+	OUT_RING(chan, 0x00000021);
+	OUT_RING(chan, RING_3D(0x1fd0, 1));
+	OUT_RING(chan, 0x00171615);
+	OUT_RING(chan, RING_3D(0x1fd4, 1));
+	OUT_RING(chan, 0x001b1a19);
+
+	OUT_RING(chan, RING_3D(0x1ef8, 1));
+	OUT_RING(chan, 0x0020ffff);
+	OUT_RING(chan, RING_3D(0x1d64, 1));
+	OUT_RING(chan, 0x00d30000);
+	OUT_RING(chan, RING_3D(0x1e94, 1)); /* same method offset nv30_screen_init() writes 0x13 to */
+	OUT_RING(chan, 0x00000001);
+}
+
+static unsigned
+nvfx_screen_get_vertex_buffer_flags(struct nvfx_screen* screen)
+{
+	/* Returns NOUVEAU_BO_VRAM when the placement workaround is active and
+	 * NOUVEAU_BO_GART otherwise; NOUVEAU_VTXIDX_IN_VRAM overrides the default. */
+	int dflt = 0;
+	int use_vram;
+	// TODO: this is a bit of a guess; also add other cards that may need this hack.
+	// It may also depend on the specific card or the AGP/PCIe chipset.
+	switch (screen->base.device->chipset) {
+	case 0x47: /* G70 */
+	case 0x49: /* G71 */
+	case 0x46: /* G72 */
+		dflt = 1;
+		break;
+	default:
+		break;
+	}
+	use_vram = debug_get_bool_option("NOUVEAU_VTXIDX_IN_VRAM", dflt);
+#ifdef DEBUG
+	if (use_vram)
+		fprintf(stderr, "A performance reducing hack is being used to help avoid graphics corruption.\n"
+			"You can try export NOUVEAU_VTXIDX_IN_VRAM=0 to disable it.\n");
+	else
+		fprintf(stderr, "Some systems may experience graphics corruption due to randomly misplaced vertices.\n"
+			"If this is happening, export NOUVEAU_VTXIDX_IN_VRAM=1 may reduce or eliminate the problem\n");
+#endif
+	return use_vram ? NOUVEAU_BO_VRAM : NOUVEAU_BO_GART;
+}
+
+/* Create the nvfx screen, or return NULL; every failure goes through "fail" so a partly built screen is destroyed. */
+struct pipe_screen *
+nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
+{
+	static const unsigned query_sizes[] = {(4096 - 4 * 32) / 32, 3 * 1024 / 32, 2 * 1024 / 32, 1024 / 32};
+	struct nvfx_screen *screen = CALLOC_STRUCT(nvfx_screen);
+	struct nouveau_channel *chan;
+	struct pipe_screen *pscreen;
+	unsigned eng3d_class = 0;
+	int ret, i;
+
+	if (!screen)
+		return NULL;
+
+	pscreen = &screen->base.base;
+	ret = nouveau_screen_init(&screen->base, dev);
+	if (ret)
+		goto fail;
+	chan = screen->base.channel;
+
+	pscreen->winsys = ws;
+	pscreen->destroy = nvfx_screen_destroy;
+	pscreen->get_param = nvfx_screen_get_param;
+	pscreen->get_paramf = nvfx_screen_get_paramf;
+	pscreen->is_format_supported = nvfx_screen_surface_format_supported;
+	pscreen->context_create = nvfx_create;
+
+	switch (dev->chipset & 0xf0) {
+	case 0x30:
+		if (NV30TCL_CHIPSET_3X_MASK & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = 0x0397;
+		else if (NV34TCL_CHIPSET_3X_MASK & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = 0x0697;
+		else if (NV35TCL_CHIPSET_3X_MASK & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = 0x0497;
+		break;
+	case 0x40:
+		if (NV4X_GRCLASS4097_CHIPSETS & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = NV40TCL;
+		else if (NV4X_GRCLASS4497_CHIPSETS & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = NV44TCL;
+		screen->is_nv4x = ~0;
+		break;
+	case 0x60:
+		if (NV6X_GRCLASS4497_CHIPSETS & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = NV44TCL;
+		screen->is_nv4x = ~0;
+		break;
+	}
+
+	if (!eng3d_class) {
+		NOUVEAU_ERR("Unknown nv3x/nv4x chipset: nv%02x\n", dev->chipset);
+		goto fail;
+	}
+
+	screen->force_swtnl = debug_get_bool_option("NOUVEAU_SWTNL", FALSE);
+	screen->vertex_buffer_reloc_flags = nvfx_screen_get_vertex_buffer_flags(screen);
+
+	/* surely both nv3x and nv44 support index buffers too: find out how and test that */
+	if(eng3d_class == NV40TCL)
+		screen->index_buffer_reloc_flags = screen->vertex_buffer_reloc_flags;
+
+	if(!screen->force_swtnl && screen->vertex_buffer_reloc_flags == screen->index_buffer_reloc_flags)
+		screen->base.vertex_buffer_flags = screen->base.index_buffer_flags = screen->vertex_buffer_reloc_flags;
+
+	nvfx_screen_init_resource_functions(pscreen);
+
+	ret = nouveau_grobj_alloc(chan, 0xbeef3097, eng3d_class, &screen->eng3d);
+	if (ret) {
+		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
+		goto fail;
+	}
+
+	/* 2D engine setup; the result is checked before it is dereferenced */
+	screen->eng2d = nv04_surface_2d_init(&screen->base);
+	if (!screen->eng2d) {
+		NOUVEAU_ERR("Error creating 2D engine: nv%02x\n", dev->chipset);
+		goto fail;
+	}
+	screen->eng2d->buf = nvfx_surface_buffer;
+
+	/* Notifier for sync purposes */
+	ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync);
+	if (ret) {
+		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
+		goto fail;
+	}
+
+	/* Query objects: try the sizes in query_sizes, largest first, until one allocates */
+	for (i = 0; i < sizeof(query_sizes) / sizeof(query_sizes[0]); ++i) {
+		ret = nouveau_notifier_alloc(chan, 0xbeef0302, query_sizes[i], &screen->query);
+		if (!ret)
+			break;
+	}
+
+	if (ret) {
+		NOUVEAU_ERR("Error initialising query objects: %d\n", ret);
+		goto fail;
+	}
+
+	ret = nouveau_resource_init(&screen->query_heap, 0, query_sizes[i]);
+	if (ret) {
+		NOUVEAU_ERR("Error initialising query object heap: %d\n", ret);
+		goto fail;
+	}
+
+	LIST_INITHEAD(&screen->query_list);
+
+	/* Vtxprog resources */
+	if (nouveau_resource_init(&screen->vp_exec_heap, 0, screen->is_nv4x ? 512 : 256) ||
+	    nouveau_resource_init(&screen->vp_data_heap, 0, 256)) {
+		goto fail;
+	}
+
+	BIND_RING(chan, screen->eng3d, 7);
+
+	/* Static eng3d initialisation */
+	/* note that we just started using the channel, so we must have space in the pushbuffer */
+	OUT_RING(chan, RING_3D(NV34TCL_DMA_NOTIFY, 1));
+	OUT_RING(chan, screen->sync->handle);
+	OUT_RING(chan, RING_3D(NV34TCL_DMA_TEXTURE0, 2));
+	OUT_RING(chan, chan->vram->handle);
+	OUT_RING(chan, chan->gart->handle);
+	OUT_RING(chan, RING_3D(NV34TCL_DMA_COLOR1, 1));
+	OUT_RING(chan, chan->vram->handle);
+	OUT_RING(chan, RING_3D(NV34TCL_DMA_COLOR0, 2));
+	OUT_RING(chan, chan->vram->handle);
+	OUT_RING(chan, chan->vram->handle);
+	OUT_RING(chan, RING_3D(NV34TCL_DMA_VTXBUF0, 2));
+	OUT_RING(chan, chan->vram->handle);
+	OUT_RING(chan, chan->gart->handle);
+
+	OUT_RING(chan, RING_3D(NV34TCL_DMA_FENCE, 2));
+	OUT_RING(chan, 0);
+	OUT_RING(chan, screen->query->handle);
+
+	OUT_RING(chan, RING_3D(NV34TCL_DMA_IN_MEMORY7, 2));
+	OUT_RING(chan, chan->vram->handle);
+	OUT_RING(chan, chan->vram->handle);
+
+	if(!screen->is_nv4x)
+		nv30_screen_init(screen);
+	else
+		nv40_screen_init(screen);
+
+	return pscreen;
+
+fail:	/* nvfx_screen_destroy() is the same cleanup the original error paths already relied on */
+	nvfx_screen_destroy(pscreen);
+	return NULL;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_screen.h b/src/gallium/drivers/nvfx/nvfx_screen.h
new file mode 100644
index 0000000000..5e1c3945ae
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_screen.h
@@ -0,0 +1,43 @@
+#ifndef __NVFX_SCREEN_H__
+#define __NVFX_SCREEN_H__
+
+#include "util/u_double_list.h"
+#include "nouveau/nouveau_screen.h"
+#include "nv04_surface_2d.h"
+
+struct nvfx_context;
+
+struct nvfx_screen {
+	struct nouveau_screen base; /* first member; nvfx_screen() casts a pipe_screen pointer to this struct */
+
+	struct nouveau_winsys *nvws;
+
+	struct nvfx_context *cur_ctx; /* NOTE(review): presumably the most recently active context - confirm */
+
+	unsigned is_nv4x; /* either 0 or ~0 */
+	boolean force_swtnl; /* read from the NOUVEAU_SWTNL debug option at screen creation */
+	unsigned vertex_buffer_reloc_flags; /* NOUVEAU_BO_VRAM or NOUVEAU_BO_GART */
+	unsigned index_buffer_reloc_flags; /* copied from the vertex flags only for NV40TCL */
+
+	/* HW graphics objects */
+	struct nv04_surface_2d *eng2d;
+	struct nouveau_grobj *eng3d;
+	struct nouveau_notifier *sync;
+
+	/* Query object resources */
+	struct nouveau_notifier *query;
+	struct nouveau_resource *query_heap;
+	struct list_head query_list;
+
+	/* Vtxprog resources */
+	struct nouveau_resource *vp_exec_heap; /* 512 entries when is_nv4x is set, else 256 */
+	struct nouveau_resource *vp_data_heap; /* 256 entries */
+};
+
+static INLINE struct nvfx_screen *
+nvfx_screen(struct pipe_screen *screen) /* downcast helper: pipe_screen -> nvfx_screen */
+{
+	return (struct nvfx_screen *)screen; /* plain pointer cast, no run-time check */
+}
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_shader.h b/src/gallium/drivers/nvfx/nvfx_shader.h
new file mode 100644
index 0000000000..50830b3916
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_shader.h
@@ -0,0 +1,429 @@
+#ifndef __NVFX_SHADER_H__
+#define __NVFX_SHADER_H__
+
+/* this will resolve to either the NV30 or the NV40 version
+ * depending on the current hardware */
+/* unusual, but very fast and compact method */
+#define NVFX_VP(c) ((NV30_VP_##c) + (nvfx->is_nv4x & ((NV40_VP_##c) - (NV30_VP_##c))))
+
+#define NVFX_VP_INST_SLOT_VEC 0
+#define NVFX_VP_INST_SLOT_SCA 1
+
+#define NVFX_VP_INST_IN_POS  0    /* These seem to match the bindings specified in */
+#define NVFX_VP_INST_IN_WEIGHT  1    /* the ARB_v_p spec (2.14.3.1) */
+#define NVFX_VP_INST_IN_NORMAL  2
+#define NVFX_VP_INST_IN_COL0  3    /* Should probably confirm them all though */
+#define NVFX_VP_INST_IN_COL1  4
+#define NVFX_VP_INST_IN_FOGC  5
+#define NVFX_VP_INST_IN_TC0  8
+#define NVFX_VP_INST_IN_TC(n)  (8+(n)) /* n is parenthesised so expression arguments expand correctly */
+
+#define NVFX_VP_INST_SCA_OP_NOP 0x00
+#define NVFX_VP_INST_SCA_OP_MOV 0x01
+#define NVFX_VP_INST_SCA_OP_RCP 0x02
+#define NVFX_VP_INST_SCA_OP_RCC 0x03
+#define NVFX_VP_INST_SCA_OP_RSQ 0x04
+#define NVFX_VP_INST_SCA_OP_EXP 0x05
+#define NVFX_VP_INST_SCA_OP_LOG 0x06
+#define NVFX_VP_INST_SCA_OP_LIT 0x07
+#define NVFX_VP_INST_SCA_OP_BRA 0x09
+#define NVFX_VP_INST_SCA_OP_CAL 0x0B
+#define NVFX_VP_INST_SCA_OP_RET 0x0C
+#define NVFX_VP_INST_SCA_OP_LG2 0x0D
+#define NVFX_VP_INST_SCA_OP_EX2 0x0E
+#define NVFX_VP_INST_SCA_OP_SIN 0x0F
+#define NVFX_VP_INST_SCA_OP_COS 0x10
+
+#define NV40_VP_INST_SCA_OP_PUSHA 0x13
+#define NV40_VP_INST_SCA_OP_POPA 0x14
+
+#define NVFX_VP_INST_VEC_OP_NOP 0x00
+#define NVFX_VP_INST_VEC_OP_MOV 0x01
+#define NVFX_VP_INST_VEC_OP_MUL 0x02
+#define NVFX_VP_INST_VEC_OP_ADD 0x03
+#define NVFX_VP_INST_VEC_OP_MAD 0x04
+#define NVFX_VP_INST_VEC_OP_DP3 0x05
+#define NVFX_VP_INST_VEC_OP_DPH 0x06
+#define NVFX_VP_INST_VEC_OP_DP4 0x07
+#define NVFX_VP_INST_VEC_OP_DST 0x08
+#define NVFX_VP_INST_VEC_OP_MIN 0x09
+#define NVFX_VP_INST_VEC_OP_MAX 0x0A
+#define NVFX_VP_INST_VEC_OP_SLT 0x0B
+#define NVFX_VP_INST_VEC_OP_SGE 0x0C
+#define NVFX_VP_INST_VEC_OP_ARL 0x0D
+#define NVFX_VP_INST_VEC_OP_FRC 0x0E
+#define NVFX_VP_INST_VEC_OP_FLR 0x0F
+#define NVFX_VP_INST_VEC_OP_SEQ 0x10
+#define NVFX_VP_INST_VEC_OP_SFL 0x11
+#define NVFX_VP_INST_VEC_OP_SGT 0x12
+#define NVFX_VP_INST_VEC_OP_SLE 0x13
+#define NVFX_VP_INST_VEC_OP_SNE 0x14
+#define NVFX_VP_INST_VEC_OP_STR 0x15
+#define NVFX_VP_INST_VEC_OP_SSG 0x16
+#define NVFX_VP_INST_VEC_OP_ARR 0x17
+#define NVFX_VP_INST_VEC_OP_ARA 0x18
+
+#define NV40_VP_INST_VEC_OP_TXL 0x19
+
+/* DWORD 3 */
+#define NVFX_VP_INST_LAST                           (1 << 0)
+
+/*
+ * Each fragment program opcode appears to be comprised of 4 32-bit values.
+ *
+ *   0 - Opcode, output reg/mask, ATTRIB source
+ *   1 - Source 0
+ *   2 - Source 1
+ *   3 - Source 2
+ *
+ * There appears to be no special difference between result regs and temp regs.
+ *     result.color == R0.xyzw
+ *     result.depth == R1.z
+ * When the fragprog contains instructions to write depth, NV30_TCL_PRIMITIVE_3D_UNK1D78=0
+ * otherwise it is set to 1.
+ *
+ * Constants are inserted directly after the instruction that uses them.
+ *
+ * It appears that it's not possible to use two input registers in one
+ * instruction as the input sourcing is done in the instruction dword
+ * and not the source selection dwords.  As such instructions such as:
+ *
+ *     ADD result.color, fragment.color, fragment.texcoord[0];
+ *
+ * must be split into two MOV's and then an ADD (nvidia does this) but
+ * I'm not sure why it's not just one MOV and then source the second input
+ * in the ADD instruction..
+ *
+ * Negation of the full source is done with NV30_FP_REG_NEGATE, arbitrary
+ * negation requires multiplication with a const.
+ *
+ * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO/SWIZZLE_ONE
+ * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as SWIZZLE_ZERO
+ * is implemented simply by not writing to the relevant components of the destination.
+ *
+ * Conditional execution
+ *   TODO
+ *
+ * Non-native instructions:
+ *   LIT
+ *   LRP - MAD+MAD
+ *   SUB - ADD, negate second source
+ *   RSQ - LG2 + EX2
+ *   POW - LG2 + MUL + EX2
+ *   SCS - COS + SIN
+ *   XPD
+ *
+ * NV40 Looping
+ *   Loops appear to be fairly expensive on NV40 at least, the proprietary
+ *   driver goes to a lot of effort to avoid using the native looping
+ *   instructions.  If the total number of *executed* instructions between
+ *   REP/ENDREP or LOOP/ENDLOOP is <=500, the driver will unroll the loop.
+ *   The maximum loop count is 255.
+ *
+ */
+
+//== Opcode / Destination selection ==
+#define NVFX_FP_OP_PROGRAM_END          (1 << 0)
+#define NVFX_FP_OP_OUT_REG_SHIFT        1
+#define NV30_FP_OP_OUT_REG_MASK          (31 << 1)  /* uncertain */
+#define NV40_FP_OP_OUT_REG_MASK          (63 << 1)
+/* Needs to be set when writing outputs to get expected result.. */
+#define NVFX_FP_OP_OUT_REG_HALF          (1 << 7)
+#define NVFX_FP_OP_COND_WRITE_ENABLE        (1 << 8)
+#define NVFX_FP_OP_OUTMASK_SHIFT        9
+#define NVFX_FP_OP_OUTMASK_MASK          (0xF << 9)
+#  define NVFX_FP_OP_OUT_X  (1<<9)
+#  define NVFX_FP_OP_OUT_Y  (1<<10)
+#  define NVFX_FP_OP_OUT_Z  (1<<11)
+#  define NVFX_FP_OP_OUT_W  (1<<12)
+/* Uncertain about these, especially the input_src values.. it's possible that
+ * they can be dynamically changed.
+ */
+#define NVFX_FP_OP_INPUT_SRC_SHIFT        13
+#define NVFX_FP_OP_INPUT_SRC_MASK        (15 << 13)
+#  define NVFX_FP_OP_INPUT_SRC_POSITION  0x0
+#  define NVFX_FP_OP_INPUT_SRC_COL0  0x1
+#  define NVFX_FP_OP_INPUT_SRC_COL1  0x2
+#  define NVFX_FP_OP_INPUT_SRC_FOGC  0x3
+#  define NVFX_FP_OP_INPUT_SRC_TC0    0x4
+#  define NVFX_FP_OP_INPUT_SRC_TC(n)  (0x4 + (n)) /* n is parenthesised so expression arguments expand correctly */
+#  define NV40_FP_OP_INPUT_SRC_FACING  0xE
+#define NVFX_FP_OP_TEX_UNIT_SHIFT        17
+#define NVFX_FP_OP_TEX_UNIT_MASK        (0xF << 17) /* guess */
+#define NVFX_FP_OP_PRECISION_SHIFT        22
+#define NVFX_FP_OP_PRECISION_MASK        (3 << 22)
+#   define NVFX_FP_PRECISION_FP32  0
+#   define NVFX_FP_PRECISION_FP16  1
+#   define NVFX_FP_PRECISION_FX12  2
+#define NVFX_FP_OP_OPCODE_SHIFT          24
+#define NVFX_FP_OP_OPCODE_MASK          (0x3F << 24)
+/* NV30/NV40 fragment program opcodes */
+#define NVFX_FP_OP_OPCODE_NOP 0x00
+#define NVFX_FP_OP_OPCODE_MOV 0x01
+#define NVFX_FP_OP_OPCODE_MUL 0x02
+#define NVFX_FP_OP_OPCODE_ADD 0x03
+#define NVFX_FP_OP_OPCODE_MAD 0x04
+#define NVFX_FP_OP_OPCODE_DP3 0x05
+#define NVFX_FP_OP_OPCODE_DP4 0x06
+#define NVFX_FP_OP_OPCODE_DST 0x07
+#define NVFX_FP_OP_OPCODE_MIN 0x08
+#define NVFX_FP_OP_OPCODE_MAX 0x09
+#define NVFX_FP_OP_OPCODE_SLT 0x0A
+#define NVFX_FP_OP_OPCODE_SGE 0x0B
+#define NVFX_FP_OP_OPCODE_SLE 0x0C
+#define NVFX_FP_OP_OPCODE_SGT 0x0D
+#define NVFX_FP_OP_OPCODE_SNE 0x0E
+#define NVFX_FP_OP_OPCODE_SEQ 0x0F
+#define NVFX_FP_OP_OPCODE_FRC 0x10
+#define NVFX_FP_OP_OPCODE_FLR 0x11
+#define NVFX_FP_OP_OPCODE_KIL 0x12
+#define NVFX_FP_OP_OPCODE_PK4B 0x13
+#define NVFX_FP_OP_OPCODE_UP4B 0x14
+#define NVFX_FP_OP_OPCODE_DDX 0x15 /* can only write XY */
+#define NVFX_FP_OP_OPCODE_DDY 0x16 /* can only write XY */
+#define NVFX_FP_OP_OPCODE_TEX 0x17
+#define NVFX_FP_OP_OPCODE_TXP 0x18
+#define NVFX_FP_OP_OPCODE_TXD 0x19
+#define NVFX_FP_OP_OPCODE_RCP 0x1A
+#define NVFX_FP_OP_OPCODE_EX2 0x1C
+#define NVFX_FP_OP_OPCODE_LG2 0x1D
+#define NVFX_FP_OP_OPCODE_STR 0x20
+#define NVFX_FP_OP_OPCODE_SFL 0x21
+#define NVFX_FP_OP_OPCODE_COS 0x22
+#define NVFX_FP_OP_OPCODE_SIN 0x23
+#define NVFX_FP_OP_OPCODE_PK2H 0x24
+#define NVFX_FP_OP_OPCODE_UP2H 0x25
+#define NVFX_FP_OP_OPCODE_PK4UB 0x27
+#define NVFX_FP_OP_OPCODE_UP4UB 0x28
+#define NVFX_FP_OP_OPCODE_PK2US 0x29
+#define NVFX_FP_OP_OPCODE_UP2US 0x2A
+#define NVFX_FP_OP_OPCODE_DP2A 0x2E
+#define NVFX_FP_OP_OPCODE_TXB 0x31
+#define NVFX_FP_OP_OPCODE_DIV 0x3A
+
+/* NV30 only fragment program opcodes */
+#define NVFX_FP_OP_OPCODE_RSQ_NV30 0x1B
+#define NVFX_FP_OP_OPCODE_LIT_NV30 0x1E
+#define NVFX_FP_OP_OPCODE_LRP_NV30 0x1F
+#define NVFX_FP_OP_OPCODE_POW_NV30 0x26
+#define NVFX_FP_OP_OPCODE_RFL_NV30 0x36
+
+/* NV40 only fragment program opcodes */
+#define NVFX_FP_OP_OPCODE_TXL_NV40 0x2F
+/* The use of these instructions appears to be indicated by bit 31 of DWORD 2.*/
+#define NV40_FP_OP_BRA_OPCODE_BRK                                    0x0
+#define NV40_FP_OP_BRA_OPCODE_CAL                                    0x1
+#define NV40_FP_OP_BRA_OPCODE_IF                                     0x2
+#define NV40_FP_OP_BRA_OPCODE_LOOP                                   0x3
+#define NV40_FP_OP_BRA_OPCODE_REP                                    0x4
+#define NV40_FP_OP_BRA_OPCODE_RET                                    0x5
+
+#define NVFX_FP_OP_OUT_SAT          (1u << 31) /* unsigned: 1 << 31 overflows a signed int */
+
+/* high order bits of SRC0 */
+#define NVFX_FP_OP_OUT_ABS          (1 << 29)
+#define NVFX_FP_OP_COND_SWZ_W_SHIFT        27
+#define NVFX_FP_OP_COND_SWZ_W_MASK        (3 << 27)
+#define NVFX_FP_OP_COND_SWZ_Z_SHIFT        25
+#define NVFX_FP_OP_COND_SWZ_Z_MASK        (3 << 25)
+#define NVFX_FP_OP_COND_SWZ_Y_SHIFT        23
+#define NVFX_FP_OP_COND_SWZ_Y_MASK        (3 << 23)
+#define NVFX_FP_OP_COND_SWZ_X_SHIFT        21
+#define NVFX_FP_OP_COND_SWZ_X_MASK        (3 << 21)
+#define NVFX_FP_OP_COND_SWZ_ALL_SHIFT        21
+#define NVFX_FP_OP_COND_SWZ_ALL_MASK        (0xFF << 21)
+#define NVFX_FP_OP_COND_SHIFT          18
+#define NVFX_FP_OP_COND_MASK          (0x07 << 18)
+#  define NVFX_FP_OP_COND_FL  0
+#  define NVFX_FP_OP_COND_LT  1
+#  define NVFX_FP_OP_COND_EQ  2
+#  define NVFX_FP_OP_COND_LE  3
+#  define NVFX_FP_OP_COND_GT  4
+#  define NVFX_FP_OP_COND_NE  5
+#  define NVFX_FP_OP_COND_GE  6
+#  define NVFX_FP_OP_COND_TR  7
+
+/* high order bits of SRC1 */
+#define NV40_FP_OP_OPCODE_IS_BRANCH                                      (1u<<31) /* unsigned: 1<<31 overflows a signed int */
+#define NVFX_FP_OP_DST_SCALE_SHIFT        28
+#define NVFX_FP_OP_DST_SCALE_MASK        (3 << 28)
+#define NVFX_FP_OP_DST_SCALE_1X                                                0
+#define NVFX_FP_OP_DST_SCALE_2X                                                1
+#define NVFX_FP_OP_DST_SCALE_4X                                                2
+#define NVFX_FP_OP_DST_SCALE_8X                                                3
+#define NVFX_FP_OP_DST_SCALE_INV_2X                                            5
+#define NVFX_FP_OP_DST_SCALE_INV_4X                                            6
+#define NVFX_FP_OP_DST_SCALE_INV_8X                                            7
+
+/* SRC1 LOOP */
+#define NV40_FP_OP_LOOP_INCR_SHIFT                                            19
+#define NV40_FP_OP_LOOP_INCR_MASK                                   (0xFF << 19)
+#define NV40_FP_OP_LOOP_INDEX_SHIFT                                           10
+#define NV40_FP_OP_LOOP_INDEX_MASK                                  (0xFF << 10)
+#define NV40_FP_OP_LOOP_COUNT_SHIFT                                            2
+#define NV40_FP_OP_LOOP_COUNT_MASK                                   (0xFF << 2)
+
+/* SRC1 IF */
+#define NV40_FP_OP_ELSE_ID_SHIFT                                               2
+#define NV40_FP_OP_ELSE_ID_MASK                                      (0xFF << 2)
+
+/* SRC1 CAL */
+#define NV40_FP_OP_IADDR_SHIFT                                                 2
+#define NV40_FP_OP_IADDR_MASK                                        (0xFF << 2)
+
+/* SRC1 REP
+ *   I have no idea why there are 3 count values here..  but they
+ *   have always been filled with the same value in my tests so
+ *   far..
+ */
+#define NV40_FP_OP_REP_COUNT1_SHIFT                                            2
+#define NV40_FP_OP_REP_COUNT1_MASK                                   (0xFF << 2)
+#define NV40_FP_OP_REP_COUNT2_SHIFT                                           10
+#define NV40_FP_OP_REP_COUNT2_MASK                                  (0xFF << 10)
+#define NV40_FP_OP_REP_COUNT3_SHIFT                                           19
+#define NV40_FP_OP_REP_COUNT3_MASK                                  (0xFF << 19)
+
+/* SRC2 REP/IF */
+#define NV40_FP_OP_END_ID_SHIFT                                                2
+#define NV40_FP_OP_END_ID_MASK                                       (0xFF << 2)
+
+/* high order bits of SRC2 */
+#define NVFX_FP_OP_INDEX_INPUT          (1 << 30)
+#define NV40_FP_OP_ADDR_INDEX_SHIFT        19
+#define NV40_FP_OP_ADDR_INDEX_MASK        (0xF << 19)
+
+//== Register selection ==
+#define NVFX_FP_REG_TYPE_SHIFT           0
+#define NVFX_FP_REG_TYPE_MASK           (3 << 0)
+#  define NVFX_FP_REG_TYPE_TEMP   0
+#  define NVFX_FP_REG_TYPE_INPUT  1
+#  define NVFX_FP_REG_TYPE_CONST  2
+#define NVFX_FP_REG_SRC_SHIFT            2
+#define NV30_FP_REG_SRC_MASK              (31 << 2)
+#define NV40_FP_REG_SRC_MASK              (63 << 2)
+#define NVFX_FP_REG_SRC_HALF            (1 << 8)
+#define NVFX_FP_REG_SWZ_ALL_SHIFT        9
+#define NVFX_FP_REG_SWZ_ALL_MASK        (255 << 9)
+#define NVFX_FP_REG_SWZ_X_SHIFT          9
+#define NVFX_FP_REG_SWZ_X_MASK          (3 << 9)
+#define NVFX_FP_REG_SWZ_Y_SHIFT          11
+#define NVFX_FP_REG_SWZ_Y_MASK          (3 << 11)
+#define NVFX_FP_REG_SWZ_Z_SHIFT          13
+#define NVFX_FP_REG_SWZ_Z_MASK          (3 << 13)
+#define NVFX_FP_REG_SWZ_W_SHIFT          15
+#define NVFX_FP_REG_SWZ_W_MASK          (3 << 15)
+#  define NVFX_FP_SWIZZLE_X  0
+#  define NVFX_FP_SWIZZLE_Y  1
+#  define NVFX_FP_SWIZZLE_Z  2
+#  define NVFX_FP_SWIZZLE_W  3
+#define NVFX_FP_REG_NEGATE          (1 << 17)
+
+#define NVFXSR_NONE	0
+#define NVFXSR_OUTPUT	1
+#define NVFXSR_INPUT	2
+#define NVFXSR_TEMP	3
+#define NVFXSR_CONST	4
+
+#define NVFX_COND_FL  0
+#define NVFX_COND_LT  1
+#define NVFX_COND_EQ  2
+#define NVFX_COND_LE  3
+#define NVFX_COND_GT  4
+#define NVFX_COND_NE  5
+#define NVFX_COND_GE  6
+#define NVFX_COND_TR  7
+
+/* Yes, these are ordered differently... */
+
+#define NVFX_VP_MASK_X 8
+#define NVFX_VP_MASK_Y 4
+#define NVFX_VP_MASK_Z 2
+#define NVFX_VP_MASK_W 1
+#define NVFX_VP_MASK_ALL 0xf
+
+#define NVFX_FP_MASK_X 1
+#define NVFX_FP_MASK_Y 2
+#define NVFX_FP_MASK_Z 4
+#define NVFX_FP_MASK_W 8
+#define NVFX_FP_MASK_ALL 0xf
+
+#define NVFX_SWZ_X 0
+#define NVFX_SWZ_Y 1
+#define NVFX_SWZ_Z 2
+#define NVFX_SWZ_W 3
+
+#define swz(s,x,y,z,w) nvfx_sr_swz((s), NVFX_SWZ_##x, NVFX_SWZ_##y, NVFX_SWZ_##z, NVFX_SWZ_##w)
+#define neg(s) nvfx_sr_neg((s))
+#define abs(s) nvfx_sr_abs((s))
+#define scale(s,v) nvfx_sr_scale((s), NVFX_FP_OP_DST_SCALE_##v)
+
+struct nvfx_sreg {
+	int type; /* presumably one of the NVFXSR_* values - confirm against callers */
+	int index;
+
+	int dst_scale; /* set by nvfx_sr_scale(); the scale() macro passes NVFX_FP_OP_DST_SCALE_* */
+
+	int negate; /* toggled by nvfx_sr_neg() */
+	int abs; /* set to 1 by nvfx_sr_abs() */
+	int swz[4]; /* component selectors; nvfx_sr() starts with the identity { 0, 1, 2, 3 } */
+
+	int cc_update;
+	int cc_update_reg;
+	int cc_test; /* nvfx_sr() defaults this to NVFX_COND_TR */
+	int cc_test_reg;
+	int cc_swz[4]; /* nvfx_sr() starts with the identity { 0, 1, 2, 3 } */
+};
+
+/* Build a register descriptor with identity swizzles, no modifiers and the
+ * NVFX_COND_TR condition test. */
+static INLINE struct nvfx_sreg
+nvfx_sr(int type, int index)
+{
+	struct nvfx_sreg reg = { 0 };
+	int c;
+
+	reg.type = type;
+	reg.index = index;
+	reg.cc_test = NVFX_COND_TR;
+	/* identity selection for both the source swizzle and the CC swizzle */
+	for (c = 0; c < 4; c++) {
+		reg.swz[c] = c;
+		reg.cc_swz[c] = c;
+	}
+	return reg;
+}
+
+/* Apply the selectors x, y, z, w on top of the swizzle src already carries. */
+static INLINE struct nvfx_sreg
+nvfx_sr_swz(struct nvfx_sreg src, int x, int y, int z, int w)
+{
+	const int pick[4] = { x, y, z, w };
+	struct nvfx_sreg out = src;
+	int c;
+	for (c = 0; c < 4; c++)
+		out.swz[c] = src.swz[pick[c]];
+	return out;
+}
+
+static INLINE struct nvfx_sreg
+nvfx_sr_neg(struct nvfx_sreg src) /* returns a copy of src with its negate flag flipped */
+{
+	src.negate = src.negate ? 0 : 1;
+	return src;
+}
+
+static INLINE struct nvfx_sreg
+nvfx_sr_abs(struct nvfx_sreg src) /* returns a copy of src with the abs flag set */
+{
+	src.abs = 1; /* src is a by-value copy; the caller's struct is untouched */
+	return src;
+}
+
+static INLINE struct nvfx_sreg
+nvfx_sr_scale(struct nvfx_sreg src, int scale) /* returns a copy of src with dst_scale replaced */
+{
+	src.dst_scale = scale; /* the scale() macro passes an NVFX_FP_OP_DST_SCALE_* value here */
+	return src;
+}
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_state.c b/src/gallium/drivers/nvfx/nvfx_state.c
new file mode 100644
index 0000000000..315de492da
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state.c
@@ -0,0 +1,658 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+
+#include "draw/draw_context.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "nvfx_context.h"
+#include "nvfx_state.h"
+#include "nvfx_tex.h"
+
+static void *
+nvfx_blend_state_create(struct pipe_context *pipe,
+			const struct pipe_blend_state *cso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_blend_state *bso = CALLOC(1, sizeof(*bso));
+	struct nouveau_statebuf_builder sb = sb_init(bso->sb);
+	/* NOTE(review): bso is used below without a NULL check on the CALLOC result. */
+	if (cso->rt[0].blend_enable) { /* only render target 0 is consulted */
+		sb_method(sb, NV34TCL_BLEND_FUNC_ENABLE, 3);
+		sb_data(sb, 1);
+		sb_data(sb, (nvgl_blend_func(cso->rt[0].alpha_src_factor) << 16) |
+			       nvgl_blend_func(cso->rt[0].rgb_src_factor));
+		sb_data(sb, nvgl_blend_func(cso->rt[0].alpha_dst_factor) << 16 |
+			      nvgl_blend_func(cso->rt[0].rgb_dst_factor));
+		if(nvfx->screen->base.device->chipset < 0x40) { /* below 0x40 only rgb_func is emitted */
+			sb_method(sb, NV34TCL_BLEND_EQUATION, 1);
+			sb_data(sb, nvgl_blend_eqn(cso->rt[0].rgb_func));
+		} else {
+			sb_method(sb, NV40TCL_BLEND_EQUATION, 1);
+			sb_data(sb, nvgl_blend_eqn(cso->rt[0].alpha_func) << 16 |
+			      nvgl_blend_eqn(cso->rt[0].rgb_func));
+		}
+	} else {
+		sb_method(sb, NV34TCL_BLEND_FUNC_ENABLE, 1);
+		sb_data(sb, 0);
+	}
+
+	sb_method(sb, NV34TCL_COLOR_MASK, 1); /* one byte lane per channel: A=24, R=16, G=8, B=0 */
+	sb_data(sb, (((cso->rt[0].colormask & PIPE_MASK_A) ? (0x01 << 24) : 0) |
+	       ((cso->rt[0].colormask & PIPE_MASK_R) ? (0x01 << 16) : 0) |
+	       ((cso->rt[0].colormask & PIPE_MASK_G) ? (0x01 <<  8) : 0) |
+	       ((cso->rt[0].colormask & PIPE_MASK_B) ? (0x01 <<  0) : 0)));
+
+	/* TODO: add NV40 MRT color mask */
+
+	if (cso->logicop_enable) {
+		sb_method(sb, NV34TCL_COLOR_LOGIC_OP_ENABLE, 2);
+		sb_data(sb, 1);
+		sb_data(sb, nvgl_logicop_func(cso->logicop_func));
+	} else {
+		sb_method(sb, NV34TCL_COLOR_LOGIC_OP_ENABLE, 1);
+		sb_data(sb, 0);
+	}
+
+	sb_method(sb, NV34TCL_DITHER_ENABLE, 1);
+	sb_data(sb, cso->dither ? 1 : 0);
+
+	bso->sb_len = sb_len(sb, bso->sb);
+	bso->pipe = *cso; /* keep a copy of the gallium state alongside the encoded words */
+	return (void *)bso;
+}
+
+static void
+nvfx_blend_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+	/* record the new blend CSO and flag it for re-emission */
+	ctx->dirty |= NVFX_NEW_BLEND;
+	ctx->blend = hwcso;
+}
+
+/* Release a blend CSO returned by nvfx_blend_state_create(). */
+static void
+nvfx_blend_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	/* the opaque pointer is freed directly; no typed alias is needed */
+	FREE(hwcso);
+}
+
+/* Translate a gallium sampler CSO into driver sampler state; returns NULL if the allocation fails. */
+static void *
+nvfx_sampler_state_create(struct pipe_context *pipe,
+			  const struct pipe_sampler_state *cso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_sampler_state *ps = MALLOC(sizeof(*ps));
+
+	if (!ps)
+		return NULL;
+	/* on nv30, we use this as an internal flag */
+	ps->fmt = cso->normalized_coords ? 0 : NV40TCL_TEX_FORMAT_RECT;
+	ps->en = 0;
+	ps->filt = nvfx_tex_filter(cso);
+	ps->wrap = (nvfx_tex_wrap_mode(cso->wrap_s) << NV34TCL_TX_WRAP_S_SHIFT) |
+		    (nvfx_tex_wrap_mode(cso->wrap_t) << NV34TCL_TX_WRAP_T_SHIFT) |
+		    (nvfx_tex_wrap_mode(cso->wrap_r) << NV34TCL_TX_WRAP_R_SHIFT) |
+		    nvfx_tex_wrap_compare_mode(cso);
+	ps->bcol = nvfx_tex_border_color(cso->border_color);
+
+	if(nvfx->is_nv4x)
+		nv40_sampler_state_init(pipe, ps, cso);
+	else
+		nv30_sampler_state_init(pipe, ps, cso);
+	return (void *)ps;
+}
+
+/* Bind nr sampler CSOs to units 0..nr-1 and clear the units left over from
+ * the previous binding; every touched unit is flagged in dirty_samplers. */
+static void
+nvfx_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	unsigned i = 0;
+
+	for (; i < nr; i++) {
+		nvfx->dirty_samplers |= (1 << i);
+		nvfx->tex_sampler[i] = sampler[i];
+	}
+	for (; i < nvfx->nr_samplers; i++) {
+		nvfx->dirty_samplers |= (1 << i);
+		nvfx->tex_sampler[i] = NULL;
+	}
+	nvfx->dirty |= NVFX_NEW_SAMPLER;
+	nvfx->nr_samplers = nr;
+}
+
+static void
+nvfx_sampler_state_delete(struct pipe_context *pipe, void *hwcso) /* pipe is unused */
+{
+	FREE(hwcso); /* releases the state MALLOC'd in nvfx_sampler_state_create() */
+}
+
+/* Take references on the first nr views, drop the references held by units
+ * beyond nr, and flag every touched unit in dirty_samplers. */
+static void
+nvfx_set_fragment_sampler_views(struct pipe_context *pipe,
+				unsigned nr,
+				struct pipe_sampler_view **views)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	unsigned i = 0;
+
+	for (; i < nr; i++) {
+		pipe_sampler_view_reference(&nvfx->fragment_sampler_views[i], views[i]);
+		nvfx->dirty_samplers |= (1 << i);
+	}
+	/* units nr..nr_textures-1 were bound before and are released now */
+	for (; i < nvfx->nr_textures; i++) {
+		pipe_sampler_view_reference(&nvfx->fragment_sampler_views[i], NULL);
+		nvfx->dirty_samplers |= (1 << i);
+	}
+
+	nvfx->nr_textures = nr;
+	nvfx->dirty |= NVFX_NEW_SAMPLER;
+}
+
+
+/* Allocate a sampler view copied from templ that holds its own reference on texture. */
+static struct pipe_sampler_view *
+nvfx_create_sampler_view(struct pipe_context *pipe,
+			 struct pipe_resource *texture,
+			 const struct pipe_sampler_view *templ)
+{
+	struct pipe_sampler_view *sv = CALLOC_STRUCT(pipe_sampler_view);
+
+	if (!sv)
+		return NULL;
+	*sv = *templ;
+	sv->context = pipe;
+	sv->reference.count = 1;
+	sv->texture = NULL;
+	pipe_resource_reference(&sv->texture, texture);
+	return sv;
+}
+
+
+static void
+nvfx_sampler_view_destroy(struct pipe_context *pipe,
+			  struct pipe_sampler_view *view)
+{
+	pipe_resource_reference(&view->texture, NULL); /* drop the reference taken in nvfx_create_sampler_view() */
+	FREE(view);
+}
+
+static void *
+nvfx_rasterizer_state_create(struct pipe_context *pipe,
+			     const struct pipe_rasterizer_state *cso)
+{
+	struct nvfx_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso));
+	struct nouveau_statebuf_builder sb = sb_init(rsso->sb);
+	/* NOTE(review): rsso is used below without a NULL check on the CALLOC result. */
+	/*XXX: ignored:
+	 * 	point_smooth -nohw
+	 * 	multisample
+	 */
+
+	sb_method(sb, NV34TCL_SHADE_MODEL, 1);
+	sb_data(sb, cso->flatshade ? NV34TCL_SHADE_MODEL_FLAT :
+				       NV34TCL_SHADE_MODEL_SMOOTH);
+
+	sb_method(sb, NV34TCL_VERTEX_TWO_SIDE_ENABLE, 1);
+	sb_data(sb, cso->light_twoside);
+
+	sb_method(sb, NV34TCL_LINE_WIDTH, 2);
+	sb_data(sb, (unsigned char)(cso->line_width * 8.0) & 0xff); /* width times 8, narrowed to 8 bits */
+	sb_data(sb, cso->line_smooth ? 1 : 0);
+	sb_method(sb, NV34TCL_LINE_STIPPLE_ENABLE, 2);
+	sb_data(sb, cso->line_stipple_enable ? 1 : 0);
+	sb_data(sb, (cso->line_stipple_pattern << 16) |
+		       cso->line_stipple_factor);
+
+	sb_method(sb, NV34TCL_POINT_SIZE, 1);
+	sb_data(sb, fui(cso->point_size));
+
+	sb_method(sb, NV34TCL_POLYGON_MODE_FRONT, 6); /* 6 words: two polygon modes, cull face, front face, poly smooth, cull enable */
+	if (cso->front_winding == PIPE_WINDING_CCW) {
+		sb_data(sb, nvgl_polygon_mode(cso->fill_ccw));
+		sb_data(sb, nvgl_polygon_mode(cso->fill_cw));
+		switch (cso->cull_mode) {
+		case PIPE_WINDING_CCW:
+			sb_data(sb, NV34TCL_CULL_FACE_FRONT);
+			break;
+		case PIPE_WINDING_CW:
+			sb_data(sb, NV34TCL_CULL_FACE_BACK);
+			break;
+		case PIPE_WINDING_BOTH:
+			sb_data(sb, NV34TCL_CULL_FACE_FRONT_AND_BACK);
+			break;
+		default: /* the cull-enable word written further down is 0 for PIPE_WINDING_NONE */
+			sb_data(sb, NV34TCL_CULL_FACE_BACK);
+			break;
+		}
+		sb_data(sb, NV34TCL_FRONT_FACE_CCW);
+	} else { /* any other front winding: fill modes and cull faces are swapped relative to the CCW branch */
+		sb_data(sb, nvgl_polygon_mode(cso->fill_cw));
+		sb_data(sb, nvgl_polygon_mode(cso->fill_ccw));
+		switch (cso->cull_mode) {
+		case PIPE_WINDING_CCW:
+			sb_data(sb, NV34TCL_CULL_FACE_BACK);
+			break;
+		case PIPE_WINDING_CW:
+			sb_data(sb, NV34TCL_CULL_FACE_FRONT);
+			break;
+		case PIPE_WINDING_BOTH:
+			sb_data(sb, NV34TCL_CULL_FACE_FRONT_AND_BACK);
+			break;
+		default:
+			sb_data(sb, NV34TCL_CULL_FACE_BACK);
+			break;
+		}
+		sb_data(sb, NV34TCL_FRONT_FACE_CW);
+	}
+	sb_data(sb, cso->poly_smooth ? 1 : 0);
+	sb_data(sb, (cso->cull_mode != PIPE_WINDING_NONE) ? 1 : 0);
+
+	sb_method(sb, NV34TCL_POLYGON_STIPPLE_ENABLE, 1);
+	sb_data(sb, cso->poly_stipple_enable ? 1 : 0);
+
+	sb_method(sb, NV34TCL_POLYGON_OFFSET_POINT_ENABLE, 3); /* 3 enables, tested in order: point, line, fill */
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_POINT) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_POINT))
+		sb_data(sb, 1);
+	else
+		sb_data(sb, 0);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_LINE) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_LINE))
+		sb_data(sb, 1);
+	else
+		sb_data(sb, 0);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_FILL) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_FILL))
+		sb_data(sb, 1);
+	else
+		sb_data(sb, 0);
+	if (cso->offset_cw || cso->offset_ccw) { /* factor and units are emitted only when some offset is enabled */
+		sb_method(sb, NV34TCL_POLYGON_OFFSET_FACTOR, 2);
+		sb_data(sb, fui(cso->offset_scale));
+		sb_data(sb, fui(cso->offset_units * 2));
+	}
+
+	sb_method(sb, NV34TCL_POINT_SPRITE, 1);
+	if (cso->point_quad_rasterization) {
+		unsigned psctl = (1 << 0), i; /* bit 0 is always set in this branch */
+
+		for (i = 0; i < 8; i++) { /* bits 8..15 mirror sprite_coord_enable bits 0..7 */
+			if ((cso->sprite_coord_enable >> i) & 1)
+				psctl |= (1 << (8 + i));
+		}
+
+		sb_data(sb, psctl);
+	} else {
+		sb_data(sb, 0);
+	}
+
+	rsso->pipe = *cso; /* keep a copy of the gallium state alongside the encoded words */
+	rsso->sb_len = sb_len(sb, rsso->sb);
+	return (void *)rsso;
+}
+
+/* Binds a rasterizer CSO and marks the state groups that depend on it dirty. */
+static void
+nvfx_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_rasterizer_state *incoming = hwcso;
+	unsigned changed = NVFX_NEW_RAST;
+
+	/* The scissor and polygon-stipple enables live in the rasterizer CSO
+	 * but are emitted by separate validators, so those groups are flagged
+	 * as well whenever the enable differs between the currently bound
+	 * state and the incoming one. With no previous (or no incoming)
+	 * state there is nothing to compare. */
+	if (nvfx->rasterizer && incoming) {
+		if (incoming->pipe.scissor != nvfx->rasterizer->pipe.scissor)
+			changed |= NVFX_NEW_SCISSOR;
+
+		if (incoming->pipe.poly_stipple_enable !=
+		    nvfx->rasterizer->pipe.poly_stipple_enable)
+			changed |= NVFX_NEW_STIPPLE;
+	}
+
+	nvfx->rasterizer = hwcso;
+	nvfx->dirty |= changed;
+	nvfx->draw_dirty |= changed;
+}
+
+static void
+nvfx_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_rasterizer_state *rsso = hwcso;
+	/* Frees the rasterizer CSO passed in as hwcso; pipe is unused. */
+	FREE(rsso);
+}
+
+static void *
+nvfx_depth_stencil_alpha_state_create(struct pipe_context *pipe,
+			const struct pipe_depth_stencil_alpha_state *cso)
+{
+	struct nvfx_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso)); /* NOTE(review): not NULL-checked before use below */
+	struct nouveau_statebuf_builder sb = sb_init(zsaso->sb);
+	/* Build the depth, alpha-test and stencil methods into zsaso->sb; the validate path later hands it to sb_emit(). */
+	sb_method(sb, NV34TCL_DEPTH_FUNC, 3);
+	sb_data  (sb, nvgl_comparison_op(cso->depth.func));
+	sb_data  (sb, cso->depth.writemask ? 1 : 0);
+	sb_data  (sb, cso->depth.enabled ? 1 : 0);
+
+	sb_method(sb, NV34TCL_ALPHA_FUNC_ENABLE, 3);
+	sb_data  (sb, cso->alpha.enabled ? 1 : 0);
+	sb_data  (sb, nvgl_comparison_op(cso->alpha.func));
+	sb_data  (sb, float_to_ubyte(cso->alpha.ref_value));
+	/* Front stencil: full setup when enabled, otherwise only the enable word (0). */
+	if (cso->stencil[0].enabled) {
+		sb_method(sb, NV34TCL_STENCIL_FRONT_ENABLE, 3);
+		sb_data  (sb, cso->stencil[0].enabled ? 1 : 0);
+		sb_data  (sb, cso->stencil[0].writemask);
+		sb_data  (sb, nvgl_comparison_op(cso->stencil[0].func));
+		sb_method(sb, NV34TCL_STENCIL_FRONT_FUNC_MASK, 4);
+		sb_data  (sb, cso->stencil[0].valuemask);
+		sb_data  (sb, nvgl_stencil_op(cso->stencil[0].fail_op));
+		sb_data  (sb, nvgl_stencil_op(cso->stencil[0].zfail_op));
+		sb_data  (sb, nvgl_stencil_op(cso->stencil[0].zpass_op));
+	} else {
+		sb_method(sb, NV34TCL_STENCIL_FRONT_ENABLE, 1);
+		sb_data  (sb, 0);
+	}
+	/* Back stencil: same layout as the front side, taken from stencil[1]. */
+	if (cso->stencil[1].enabled) {
+		sb_method(sb, NV34TCL_STENCIL_BACK_ENABLE, 3);
+		sb_data  (sb, cso->stencil[1].enabled ? 1 : 0);
+		sb_data  (sb, cso->stencil[1].writemask);
+		sb_data  (sb, nvgl_comparison_op(cso->stencil[1].func));
+		sb_method(sb, NV34TCL_STENCIL_BACK_FUNC_MASK, 4);
+		sb_data  (sb, cso->stencil[1].valuemask);
+		sb_data  (sb, nvgl_stencil_op(cso->stencil[1].fail_op));
+		sb_data  (sb, nvgl_stencil_op(cso->stencil[1].zfail_op));
+		sb_data  (sb, nvgl_stencil_op(cso->stencil[1].zpass_op));
+	} else {
+		sb_method(sb, NV34TCL_STENCIL_BACK_ENABLE, 1);
+		sb_data  (sb, 0);
+	}
+	/* Keep a copy of the template and record sb_len() for the later sb_emit(). */
+	zsaso->pipe = *cso;
+	zsaso->sb_len = sb_len(sb, zsaso->sb);
+	return (void *)zsaso;
+}
+
+static void
+nvfx_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* Record the CSO and flag NVFX_NEW_ZSA so the next validate emits its state buffer; draw_dirty is not touched. */
+	nvfx->zsa = hwcso;
+	nvfx->dirty |= NVFX_NEW_ZSA;
+}
+
+static void
+nvfx_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_zsa_state *zsaso = hwcso;
+	/* Frees the single allocation made by the create hook; pipe is unused. */
+	FREE(zsaso);
+}
+
+/* Creates a vertex program CSO; returns NULL when the allocation fails. */
+static void *
+nvfx_vp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_vertex_program *vp = CALLOC(1, sizeof(*vp));
+	if (!vp) /* previously dereferenced unconditionally */
+		return NULL;
+	vp->pipe.tokens = tgsi_dup_tokens(cso->tokens); /* private copy; freed in nvfx_vp_state_delete */
+	vp->draw = draw_create_vertex_shader(nvfx->draw, &vp->pipe); /* draw-module shader for the swtnl path */
+	return (void *)vp;
+}
+
+static void
+nvfx_vp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* Flag both the hardware path (dirty) and the draw-module path (draw_dirty). */
+	nvfx->vertprog = hwcso;
+	nvfx->dirty |= NVFX_NEW_VERTPROG;
+	nvfx->draw_dirty |= NVFX_NEW_VERTPROG;
+}
+
+static void
+nvfx_vp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_vertex_program *vp = hwcso;
+	/* Release the draw-module shader and whatever nvfx_vertprog_destroy() owns, then the token copy and the CSO. */
+	draw_delete_vertex_shader(nvfx->draw, vp->draw);
+	nvfx_vertprog_destroy(nvfx, vp);
+	FREE((void*)vp->pipe.tokens);
+	FREE(vp);
+}
+
+/* Creates a fragment program CSO; returns NULL when the allocation fails. */
+static void *
+nvfx_fp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nvfx_fragment_program *fp = CALLOC(1, sizeof(*fp));
+
+	if (!fp) /* previously dereferenced unconditionally */
+		return NULL;
+	fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); /* private copy; freed in nvfx_fp_state_delete */
+	tgsi_scan_shader(fp->pipe.tokens, &fp->info); /* cache shader info alongside the tokens */
+	return (void *)fp;
+}
+
+static void
+nvfx_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* Record the fragment program and flag NVFX_NEW_FRAGPROG; draw_dirty is not touched. */
+	nvfx->fragprog = hwcso;
+	nvfx->dirty |= NVFX_NEW_FRAGPROG;
+}
+
+static void
+nvfx_fp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_fragment_program *fp = hwcso;
+	/* Release whatever nvfx_fragprog_destroy() owns before freeing the token copy and the CSO. */
+	nvfx_fragprog_destroy(nvfx, fp);
+	FREE((void*)fp->pipe.tokens);
+	FREE(fp);
+}
+
+static void
+nvfx_set_blend_color(struct pipe_context *pipe,
+		     const struct pipe_blend_color *bcol)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* Store a copy; nvfx_state_blend_colour_validate() emits it when NVFX_NEW_BCOL is set. */
+	nvfx->blend_colour = *bcol;
+	nvfx->dirty |= NVFX_NEW_BCOL;
+}
+
+static void
+nvfx_set_stencil_ref(struct pipe_context *pipe,
+		     const struct pipe_stencil_ref *sr)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* Store a copy; nvfx_state_sr_validate() emits it when NVFX_NEW_SR is set. */
+	nvfx->stencil_ref = *sr;
+	nvfx->dirty |= NVFX_NEW_SR;
+}
+
+static void
+nvfx_set_clip_state(struct pipe_context *pipe,
+		    const struct pipe_clip_state *clip)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* Copy the clip state and flag it for both the hardware and the draw-module paths. */
+	nvfx->clip = *clip;
+	nvfx->dirty |= NVFX_NEW_UCP;
+	nvfx->draw_dirty |= NVFX_NEW_UCP;
+}
+
+/* Binds buf as the constant buffer of a shader stage; NULL unbinds. index is ignored. */
+static void
+nvfx_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
+			 struct pipe_resource *buf )
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+
+	nvfx->constbuf[shader] = buf;
+	/* One constant is a vec4 of floats; an unbound (NULL) slot holds none. */
+	nvfx->constbuf_nr[shader] = buf ? buf->width0 / (4 * sizeof(float)) : 0;
+
+	if (shader == PIPE_SHADER_VERTEX)
+		nvfx->dirty |= NVFX_NEW_VERTCONST;
+	else if (shader == PIPE_SHADER_FRAGMENT)
+		nvfx->dirty |= NVFX_NEW_FRAGCONST;
+}
+
+static void
+nvfx_set_framebuffer_state(struct pipe_context *pipe,
+			   const struct pipe_framebuffer_state *fb)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* Plain struct copy: surface pointers are stored as-is, no reference is taken here. */
+	nvfx->framebuffer = *fb;
+	nvfx->dirty |= NVFX_NEW_FB;
+}
+
+static void
+nvfx_set_polygon_stipple(struct pipe_context *pipe,
+			 const struct pipe_poly_stipple *stipple)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* Copy 4 * 32 bytes of pattern; the stipple validator later emits nvfx->stipple[0..31]. */
+	memcpy(nvfx->stipple, stipple->stipple, 4 * 32);
+	nvfx->dirty |= NVFX_NEW_STIPPLE;
+}
+
+static void
+nvfx_set_scissor_state(struct pipe_context *pipe,
+		       const struct pipe_scissor_state *s)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* Store a copy; nvfx_state_scissor_validate() emits it when NVFX_NEW_SCISSOR is set. */
+	nvfx->scissor = *s;
+	nvfx->dirty |= NVFX_NEW_SCISSOR;
+}
+
+static void
+nvfx_set_viewport_state(struct pipe_context *pipe,
+			const struct pipe_viewport_state *vpt)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* Copy the viewport and flag it for both the hardware and the draw-module paths. */
+	nvfx->viewport = *vpt;
+	nvfx->dirty |= NVFX_NEW_VIEWPORT;
+	nvfx->draw_dirty |= NVFX_NEW_VIEWPORT;
+}
+
+static void
+nvfx_set_vertex_buffers(struct pipe_context *pipe, unsigned count,
+			const struct pipe_vertex_buffer *vb)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* NOTE(review): count is not checked against the capacity of nvfx->vtxbuf — confirm callers bound it. */
+	memcpy(nvfx->vtxbuf, vb, sizeof(*vb) * count);
+	nvfx->vtxbuf_nr = count;
+	/* Vertex arrays feed both the hardware and the draw-module paths. */
+	nvfx->dirty |= NVFX_NEW_ARRAYS;
+	nvfx->draw_dirty |= NVFX_NEW_ARRAYS;
+}
+
+/* Creates a vertex-elements CSO holding a copy of elements; NULL on allocation failure. */
+static void *
+nvfx_vtxelts_state_create(struct pipe_context *pipe,
+			  unsigned num_elements,
+			  const struct pipe_vertex_element *elements)
+{
+	struct nvfx_vtxelt_state *cso = CALLOC_STRUCT(nvfx_vtxelt_state);
+
+	assert(num_elements < 16); /* not doing fallbacks yet */
+	if (!cso) /* previously dereferenced unconditionally */
+		return NULL;
+	cso->num_elements = num_elements;
+	memcpy(cso->pipe, elements, num_elements * sizeof(*elements));
+	return (void *)cso;
+}
+
+static void
+nvfx_vtxelts_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso); /* the CSO is a single allocation made by the create hook */
+}
+
+static void
+nvfx_vtxelts_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* NOTE(review): draw_dirty is not flagged, yet the swtnl path only calls draw_set_vertex_elements() on draw_dirty & NVFX_NEW_ARRAYS — confirm intended. */
+	nvfx->vtxelt = hwcso;
+	nvfx->dirty |= NVFX_NEW_ARRAYS;
+	/*nvfx->draw_dirty |= NVFX_NEW_ARRAYS;*/
+}
+
+void
+nvfx_init_state_functions(struct nvfx_context *nvfx) /* installs the state hooks into nvfx->pipe */
+{
+	nvfx->pipe.create_blend_state = nvfx_blend_state_create;
+	nvfx->pipe.bind_blend_state = nvfx_blend_state_bind;
+	nvfx->pipe.delete_blend_state = nvfx_blend_state_delete;
+	/* Samplers and sampler views (fragment stage only). */
+	nvfx->pipe.create_sampler_state = nvfx_sampler_state_create;
+	nvfx->pipe.bind_fragment_sampler_states = nvfx_sampler_state_bind;
+	nvfx->pipe.delete_sampler_state = nvfx_sampler_state_delete;
+	nvfx->pipe.set_fragment_sampler_views = nvfx_set_fragment_sampler_views;
+        nvfx->pipe.create_sampler_view = nvfx_create_sampler_view;
+        nvfx->pipe.sampler_view_destroy = nvfx_sampler_view_destroy;
+
+	nvfx->pipe.create_rasterizer_state = nvfx_rasterizer_state_create;
+	nvfx->pipe.bind_rasterizer_state = nvfx_rasterizer_state_bind;
+	nvfx->pipe.delete_rasterizer_state = nvfx_rasterizer_state_delete;
+
+	nvfx->pipe.create_depth_stencil_alpha_state =
+		nvfx_depth_stencil_alpha_state_create;
+	nvfx->pipe.bind_depth_stencil_alpha_state =
+		nvfx_depth_stencil_alpha_state_bind;
+	nvfx->pipe.delete_depth_stencil_alpha_state =
+		nvfx_depth_stencil_alpha_state_delete;
+	/* Vertex and fragment shader CSOs. */
+	nvfx->pipe.create_vs_state = nvfx_vp_state_create;
+	nvfx->pipe.bind_vs_state = nvfx_vp_state_bind;
+	nvfx->pipe.delete_vs_state = nvfx_vp_state_delete;
+
+	nvfx->pipe.create_fs_state = nvfx_fp_state_create;
+	nvfx->pipe.bind_fs_state = nvfx_fp_state_bind;
+	nvfx->pipe.delete_fs_state = nvfx_fp_state_delete;
+	/* Non-CSO state setters: each stores a copy in the context and sets a dirty bit. */
+	nvfx->pipe.set_blend_color = nvfx_set_blend_color;
+        nvfx->pipe.set_stencil_ref = nvfx_set_stencil_ref;
+	nvfx->pipe.set_clip_state = nvfx_set_clip_state;
+	nvfx->pipe.set_constant_buffer = nvfx_set_constant_buffer;
+	nvfx->pipe.set_framebuffer_state = nvfx_set_framebuffer_state;
+	nvfx->pipe.set_polygon_stipple = nvfx_set_polygon_stipple;
+	nvfx->pipe.set_scissor_state = nvfx_set_scissor_state;
+	nvfx->pipe.set_viewport_state = nvfx_set_viewport_state;
+
+	nvfx->pipe.create_vertex_elements_state = nvfx_vtxelts_state_create;
+	nvfx->pipe.delete_vertex_elements_state = nvfx_vtxelts_state_delete;
+	nvfx->pipe.bind_vertex_elements_state = nvfx_vtxelts_state_bind;
+
+	nvfx->pipe.set_vertex_buffers = nvfx_set_vertex_buffers;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h
new file mode 100644
index 0000000000..9ceb2577ec
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state.h
@@ -0,0 +1,77 @@
+#ifndef __NVFX_STATE_H__
+#define __NVFX_STATE_H__
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
+#include "nouveau/nouveau_statebuf.h"
+
+struct nvfx_vertex_program_exec {
+	uint32_t data[4]; /* presumably one encoded instruction — confirm in nvfx_vertprog.c */
+	boolean has_branch_offset;
+	int const_index; /* presumably an index into the program's consts — TODO confirm */
+};
+
+struct nvfx_vertex_program_data {
+	int index; /* immediates == -1 */
+	float value[4]; /* four floats per entry */
+};
+
+struct nvfx_vertex_program {
+	struct pipe_shader_state pipe; /* pipe.tokens is a private copy made when the CSO is created */
+	/* Shader object for the draw module; bound on the swtnl path. */
+	struct draw_vertex_shader *draw;
+
+	boolean translated;
+
+	struct pipe_clip_state ucp;
+	/* Instruction and constant arrays with their element counts. */
+	struct nvfx_vertex_program_exec *insns;
+	unsigned nr_insns;
+	struct nvfx_vertex_program_data *consts;
+	unsigned nr_consts;
+
+	struct nouveau_resource *exec;
+	unsigned exec_start;
+	struct nouveau_resource *data;
+	unsigned data_start;
+	unsigned data_start_min;
+
+	uint32_t ir;
+	uint32_t or;
+	uint32_t clip_ctrl;
+};
+
+struct nvfx_fragment_program_data {
+	unsigned offset; /* presumably a location inside the program's insn data — TODO confirm in nvfx_fragprog.c */
+	unsigned index;
+};
+
+struct nvfx_fragment_program_bo {
+	struct nvfx_fragment_program_bo* next; /* link to another entry of the same type */
+	struct nouveau_bo* bo;
+	char insn[] __attribute__((aligned(16))); /* flexible array member, 16-byte aligned */
+};
+
+struct nvfx_fragment_program {
+	struct pipe_shader_state pipe; /* pipe.tokens is a private copy made when the CSO is created */
+	struct tgsi_shader_info info; /* filled by tgsi_scan_shader() when the CSO is created */
+
+	boolean translated;
+	unsigned samplers;
+
+	uint32_t *insn;
+	int       insn_len;
+
+	struct nvfx_fragment_program_data *consts;
+	unsigned nr_consts;
+
+	uint32_t fp_control;
+
+	unsigned bo_prog_idx;
+	unsigned prog_size;
+	unsigned progs_per_bo;
+	struct nvfx_fragment_program_bo* fpbo;
+};
+
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_state_blend.c b/src/gallium/drivers/nvfx/nvfx_state_blend.c
new file mode 100644
index 0000000000..fe34e98364
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_blend.c
@@ -0,0 +1,22 @@
+#include "nvfx_context.h"
+
+void
+nvfx_state_blend_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	sb_emit(chan, nvfx->blend->sb, nvfx->blend->sb_len); /* emits the bound blend CSO's state buffer */
+}
+
+void
+nvfx_state_blend_colour_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct pipe_blend_color *bcol = &nvfx->blend_colour;
+	/* One method word plus one data word: color[3] in bits 31:24, color[0] in 23:16, color[1] in 15:8, color[2] in 7:0. */
+	WAIT_RING(chan, 2);
+	OUT_RING(chan, RING_3D(NV34TCL_BLEND_COLOR, 1));
+	OUT_RING(chan, ((float_to_ubyte(bcol->color[3]) << 24) |
+		       (float_to_ubyte(bcol->color[0]) << 16) |
+		       (float_to_ubyte(bcol->color[1]) <<  8) |
+		       (float_to_ubyte(bcol->color[2]) <<  0)));
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_state_emit.c b/src/gallium/drivers/nvfx/nvfx_state_emit.c
new file mode 100644
index 0000000000..f91ae19ecd
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_emit.c
@@ -0,0 +1,180 @@
+#include "nvfx_context.h"
+#include "nvfx_state.h"
+#include "draw/draw_context.h"
+
+static boolean
+nvfx_state_validate_common(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	unsigned dirty = nvfx->dirty;
+	/* This context is not screen->cur_ctx: treat every state group as dirty. */
+	if(nvfx != nvfx->screen->cur_ctx)
+		dirty = ~0;
+	/* Hardware path: a failed vertprog or vbo validation aborts with FALSE before anything else is emitted. */
+	if(nvfx->render_mode == HW)
+	{
+		if(dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_VERTCONST | NVFX_NEW_UCP))
+		{
+			if(!nvfx_vertprog_validate(nvfx))
+				return FALSE;
+		}
+
+		if(dirty & (NVFX_NEW_ARRAYS))
+		{
+			if(!nvfx_vbo_validate(nvfx))
+				return FALSE;
+		}
+	}
+	else
+	{
+		/* TODO: this looks a bit misdesigned */
+		if(dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP))
+			nvfx_vertprog_validate(nvfx);
+
+		if(dirty & (NVFX_NEW_ARRAYS | NVFX_NEW_FRAGPROG))
+			nvfx_vtxfmt_validate(nvfx);
+	}
+	/* From here on each dirty group is emitted unconditionally, in this fixed order. */
+	if(dirty & NVFX_NEW_FB)
+		nvfx_state_framebuffer_validate(nvfx);
+
+	if(dirty & NVFX_NEW_RAST)
+		sb_emit(chan, nvfx->rasterizer->sb, nvfx->rasterizer->sb_len);
+
+	if(dirty & NVFX_NEW_SCISSOR)
+		nvfx_state_scissor_validate(nvfx);
+
+	if(dirty & NVFX_NEW_STIPPLE)
+		nvfx_state_stipple_validate(nvfx);
+
+	if(dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST))
+		nvfx_fragprog_validate(nvfx);
+
+	if(dirty & NVFX_NEW_SAMPLER)
+		nvfx_fragtex_validate(nvfx);
+
+	if(dirty & NVFX_NEW_BLEND)
+		sb_emit(chan, nvfx->blend->sb, nvfx->blend->sb_len);
+
+	if(dirty & NVFX_NEW_BCOL)
+		nvfx_state_blend_colour_validate(nvfx);
+
+	if(dirty & NVFX_NEW_ZSA)
+		sb_emit(chan, nvfx->zsa->sb, nvfx->zsa->sb_len);
+
+	if(dirty & NVFX_NEW_SR)
+		nvfx_state_sr_validate(nvfx);
+
+/* Having this depend on FB looks wrong, but it seems
+   necessary to make this work on nv3x
+   TODO: find the right fix
+*/
+	if(dirty & (NVFX_NEW_VIEWPORT | NVFX_NEW_FB))
+		nvfx_state_viewport_validate(nvfx);
+
+	/* TODO: could nv30 need this or something similar too? */
+	if((dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_SAMPLER)) && nvfx->is_nv4x) {
+		WAIT_RING(chan, 4);
+		OUT_RING(chan, RING_3D(NV40TCL_TEX_CACHE_CTL, 1));
+		OUT_RING(chan, 2);
+		OUT_RING(chan, RING_3D(NV40TCL_TEX_CACHE_CTL, 1));
+		OUT_RING(chan, 1);
+	}
+	nvfx->dirty = 0; /* every flagged group has been handled */
+	return TRUE;
+}
+
+void
+nvfx_state_emit(struct nvfx_context *nvfx) /* reserves ring space, then re-emits all relocations */
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	/* we need to ensure there is enough space to output relocations in one go */
+	unsigned max_relocs = 0
+	      + 16 /* vertex buffers, incl. dma flag */
+	      + 2 /* index buffer plus format+dma flag */
+	      + 2 * 5 /* 4 cbufs + zsbuf, plus dma objects */
+	      + 2 * 16 /* fragment textures plus format+dma flag */
+	      + 2 * 4 /* vertex textures plus format+dma flag */
+	      + 1 /* fragprog incl dma flag */
+	      ;
+	MARK_RING(chan, max_relocs * 2, max_relocs * 2); /* worst case: 69 relocs, so 138 is passed for both arguments */
+	nvfx_state_relocate(nvfx);
+}
+
+void
+nvfx_state_relocate(struct nvfx_context *nvfx) /* runs each relocate pass in turn */
+{
+	nvfx_framebuffer_relocate(nvfx);
+	nvfx_fragtex_relocate(nvfx);
+	nvfx_fragprog_relocate(nvfx);
+	if (nvfx->render_mode == HW) /* vertex buffer relocations are only emitted on the hardware path */
+		nvfx_vbo_relocate(nvfx);
+}
+
+boolean
+nvfx_state_validate(struct nvfx_context *nvfx)
+{
+	boolean was_sw = nvfx->fallback_swtnl ? TRUE : FALSE; /* only used for the swtnl->hw log message below */
+
+	if (nvfx->render_mode != HW) {
+		/* Don't even bother trying to go back to hw if none
+		 * of the states that caused swtnl previously have changed.
+		 */
+		if ((nvfx->fallback_swtnl & nvfx->dirty)
+				!= nvfx->fallback_swtnl)
+			return FALSE;
+
+		/* Attempt to go to hwtnl again */
+		nvfx->dirty |= (NVFX_NEW_VIEWPORT |
+				NVFX_NEW_VERTPROG |
+				NVFX_NEW_ARRAYS);
+		nvfx->render_mode = HW;
+	}
+	/* FALSE here means the hardware path could not validate the vertex program or the vertex buffers. */
+	if(!nvfx_state_validate_common(nvfx))
+		return FALSE;
+
+	if (was_sw)
+		NOUVEAU_ERR("swtnl->hw\n");
+
+	return TRUE;
+}
+
+boolean
+nvfx_state_validate_swtnl(struct nvfx_context *nvfx)
+{
+	struct draw_context *draw = nvfx->draw;
+
+	/* Setup for swtnl */
+	if (nvfx->render_mode == HW) {
+		NOUVEAU_ERR("hw->swtnl 0x%08x\n", nvfx->fallback_swtnl);
+		nvfx->pipe.flush(&nvfx->pipe, 0, NULL);
+		nvfx->dirty |= (NVFX_NEW_VIEWPORT |
+				NVFX_NEW_VERTPROG |
+				NVFX_NEW_ARRAYS);
+		nvfx->render_mode = SWTNL;
+	}
+	/* Forward every state group flagged in draw_dirty to the draw module. */
+	if (nvfx->draw_dirty & NVFX_NEW_VERTPROG)
+		draw_bind_vertex_shader(draw, nvfx->vertprog->draw);
+
+	if (nvfx->draw_dirty & NVFX_NEW_RAST)
+           draw_set_rasterizer_state(draw, &nvfx->rasterizer->pipe,
+                                     nvfx->rasterizer);
+
+	if (nvfx->draw_dirty & NVFX_NEW_UCP)
+		draw_set_clip_state(draw, &nvfx->clip);
+
+	if (nvfx->draw_dirty & NVFX_NEW_VIEWPORT)
+		draw_set_viewport_state(draw, &nvfx->viewport);
+
+	if (nvfx->draw_dirty & NVFX_NEW_ARRAYS) {
+		draw_set_vertex_buffers(draw, nvfx->vtxbuf_nr, nvfx->vtxbuf);
+		draw_set_vertex_elements(draw, nvfx->vtxelt->num_elements, nvfx->vtxelt->pipe);
+	}
+
+	nvfx_state_validate_common(nvfx); /* result ignored: only its HW branch can return FALSE */
+
+	nvfx->draw_dirty = 0;
+	return TRUE;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_state_fb.c b/src/gallium/drivers/nvfx/nvfx_state_fb.c
new file mode 100644
index 0000000000..8c215980e2
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_fb.c
@@ -0,0 +1,250 @@
+#include "nvfx_context.h"
+#include "nvfx_resource.h"
+#include "nouveau/nouveau_util.h"
+
+
+
+/* Translates the bound pipe_framebuffer_state into render-target setup:
+ * records bo/offset/pitch for the relocate pass and emits DMA objects,
+ * offsets, pitches, RT format and the viewport/clip rectangle. */
+void
+nvfx_state_framebuffer_validate(struct nvfx_context *nvfx)
+{
+	struct pipe_framebuffer_state *fb = &nvfx->framebuffer;
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	uint32_t rt_enable = 0, rt_format = 0;
+	int i, colour_format = 0, zeta_format = 0;
+	int depth_only = 0;
+	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
+	unsigned w = fb->width;
+	unsigned h = fb->height;
+	int colour_bits = 32, zeta_bits = 32;
+
+	assert(fb->nr_cbufs <= (nvfx->is_nv4x ? 4 : 2));
+
+	for (i = 0; i < fb->nr_cbufs; i++) {
+		if (colour_format)
+			assert(colour_format == fb->cbufs[i]->format);
+		else
+			colour_format = fb->cbufs[i]->format;
+
+		rt_enable |= (NV34TCL_RT_ENABLE_COLOR0 << i);
+		nvfx->hw_rt[i].bo = nvfx_surface_buffer(fb->cbufs[i]);
+		nvfx->hw_rt[i].offset = fb->cbufs[i]->offset;
+		nvfx->hw_rt[i].pitch = ((struct nv04_surface *)fb->cbufs[i])->pitch;
+	}
+	for(; i < 4; ++i)
+		nvfx->hw_rt[i].bo = 0;
+
+	if (rt_enable & (NV34TCL_RT_ENABLE_COLOR1 |
+			 NV40TCL_RT_ENABLE_COLOR2 | NV40TCL_RT_ENABLE_COLOR3))
+		rt_enable |= NV34TCL_RT_ENABLE_MRT;
+
+	if (fb->zsbuf) {
+		zeta_format = fb->zsbuf->format;
+		nvfx->hw_zeta.bo = nvfx_surface_buffer(fb->zsbuf);
+		nvfx->hw_zeta.offset = fb->zsbuf->offset;
+		nvfx->hw_zeta.pitch = ((struct nv04_surface *)fb->zsbuf)->pitch;
+	}
+	else
+		nvfx->hw_zeta.bo = 0;
+
+	if (rt_enable & (NV34TCL_RT_ENABLE_COLOR0 | NV34TCL_RT_ENABLE_COLOR1 |
+		NV40TCL_RT_ENABLE_COLOR2 | NV40TCL_RT_ENABLE_COLOR3)) {
+		/* Render to at least a colour buffer */
+		if (!(fb->cbufs[0]->texture->flags & NVFX_RESOURCE_FLAG_LINEAR)) {
+			assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1)));
+			for (i = 1; i < fb->nr_cbufs; i++)
+				assert(!(fb->cbufs[i]->texture->flags & NVFX_RESOURCE_FLAG_LINEAR));
+
+			rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED |
+				(log2i(fb->cbufs[0]->width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) |
+				(log2i(fb->cbufs[0]->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT);
+		}
+		else
+			rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
+	} else if (fb->zsbuf) {
+		depth_only = 1;
+
+		/* Render to depth buffer only; the LINEAR bit is tested in flags, as for colour buffers */
+		if (!(fb->zsbuf->texture->flags & NVFX_RESOURCE_FLAG_LINEAR)) {
+			assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1)));
+
+			rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED |
+				(log2i(fb->zsbuf->width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) |
+				(log2i(fb->zsbuf->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT);
+		}
+		else
+			rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
+	} else {
+		return;
+	}
+
+	switch (colour_format) {
+	case PIPE_FORMAT_B8G8R8X8_UNORM:
+		rt_format |= NV34TCL_RT_FORMAT_COLOR_X8R8G8B8;
+		break;
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+	case 0:
+		rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8;
+		break;
+	case PIPE_FORMAT_B5G6R5_UNORM:
+		rt_format |= NV34TCL_RT_FORMAT_COLOR_R5G6B5;
+		colour_bits = 16;
+		break;
+	default:
+		assert(0);
+	}
+
+	switch (zeta_format) {
+	case PIPE_FORMAT_Z16_UNORM:
+		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z16;
+		zeta_bits = 16;
+		break;
+	case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+	case PIPE_FORMAT_X8Z24_UNORM:
+	case 0:
+		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z24S8;
+		break;
+	default:
+		assert(0);
+	}
+
+	if ((!nvfx->is_nv4x) && colour_bits > zeta_bits) {
+		/* TODO: does this limitation really exist?
+		   TODO: can it be worked around somehow? */
+		assert(0);
+	}
+
+	if ((rt_enable & NV34TCL_RT_ENABLE_COLOR0)
+		|| ((!nvfx->is_nv4x) && depth_only)) {
+		struct nvfx_render_target *rt0 = (depth_only ? &nvfx->hw_zeta : &nvfx->hw_rt[0]);
+		uint32_t pitch = rt0->pitch;
+
+		if(!nvfx->is_nv4x)
+		{
+			if (nvfx->hw_zeta.bo) {
+				pitch |= (nvfx->hw_zeta.pitch << 16);
+			} else {
+				pitch |= (pitch << 16);
+			}
+		}
+
+		OUT_RING(chan, RING_3D(NV34TCL_DMA_COLOR0, 1));
+		OUT_RELOC(chan, rt0->bo, 0,
+			      rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		OUT_RING(chan, RING_3D(NV34TCL_COLOR0_PITCH, 2));
+		OUT_RING(chan, pitch);
+		OUT_RELOC(chan, rt0->bo,
+			      rt0->offset, rt_flags | NOUVEAU_BO_LOW,
+			      0, 0);
+	}
+
+	if (rt_enable & NV34TCL_RT_ENABLE_COLOR1) {
+		OUT_RING(chan, RING_3D(NV34TCL_DMA_COLOR1, 1));
+		OUT_RELOC(chan, nvfx->hw_rt[1].bo, 0,
+			      rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		OUT_RING(chan, RING_3D(NV34TCL_COLOR1_OFFSET, 2));
+		OUT_RELOC(chan, nvfx->hw_rt[1].bo,
+				nvfx->hw_rt[1].offset, rt_flags | NOUVEAU_BO_LOW,
+			      0, 0);
+		OUT_RING(chan, nvfx->hw_rt[1].pitch);
+	}
+
+	if(nvfx->is_nv4x)
+	{
+		if (rt_enable & NV40TCL_RT_ENABLE_COLOR2) {
+			OUT_RING(chan, RING_3D(NV40TCL_DMA_COLOR2, 1));
+			OUT_RELOC(chan, nvfx->hw_rt[2].bo, 0,
+				      rt_flags | NOUVEAU_BO_OR,
+				      chan->vram->handle, chan->gart->handle);
+			OUT_RING(chan, RING_3D(NV40TCL_COLOR2_OFFSET, 1));
+			OUT_RELOC(chan, nvfx->hw_rt[2].bo,
+				      nvfx->hw_rt[2].offset, rt_flags | NOUVEAU_BO_LOW,
+				      0, 0);
+			OUT_RING(chan, RING_3D(NV40TCL_COLOR2_PITCH, 1));
+			OUT_RING(chan, nvfx->hw_rt[2].pitch);
+		}
+
+		if (rt_enable & NV40TCL_RT_ENABLE_COLOR3) {
+			OUT_RING(chan, RING_3D(NV40TCL_DMA_COLOR3, 1));
+			OUT_RELOC(chan, nvfx->hw_rt[3].bo, 0,
+				      rt_flags | NOUVEAU_BO_OR,
+				      chan->vram->handle, chan->gart->handle);
+			OUT_RING(chan, RING_3D(NV40TCL_COLOR3_OFFSET, 1));
+			OUT_RELOC(chan, nvfx->hw_rt[3].bo,
+					nvfx->hw_rt[3].offset, rt_flags | NOUVEAU_BO_LOW,
+				      0, 0);
+			OUT_RING(chan, RING_3D(NV40TCL_COLOR3_PITCH, 1));
+			OUT_RING(chan, nvfx->hw_rt[3].pitch);
+		}
+	}
+
+	if (zeta_format) {
+		OUT_RING(chan, RING_3D(NV34TCL_DMA_ZETA, 1));
+		OUT_RELOC(chan, nvfx->hw_zeta.bo, 0,
+			      rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		OUT_RING(chan, RING_3D(NV34TCL_ZETA_OFFSET, 1));
+		/* TODO: reverse engineer LMA */
+		OUT_RELOC(chan, nvfx->hw_zeta.bo,
+			     nvfx->hw_zeta.offset, rt_flags | NOUVEAU_BO_LOW, 0, 0);
+	        if(nvfx->is_nv4x) {
+			OUT_RING(chan, RING_3D(NV40TCL_ZETA_PITCH, 1));
+			OUT_RING(chan, nvfx->hw_zeta.pitch);
+		}
+	}
+
+	OUT_RING(chan, RING_3D(NV34TCL_RT_ENABLE, 1));
+	OUT_RING(chan, rt_enable);
+	OUT_RING(chan, RING_3D(NV34TCL_RT_HORIZ, 3));
+	OUT_RING(chan, (w << 16) | 0);
+	OUT_RING(chan, (h << 16) | 0);
+	OUT_RING(chan, rt_format);
+	OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_HORIZ, 2));
+	OUT_RING(chan, (w << 16) | 0);
+	OUT_RING(chan, (h << 16) | 0);
+	OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_CLIP_HORIZ(0), 2));
+	OUT_RING(chan, ((w - 1) << 16) | 0);
+	OUT_RING(chan, ((h - 1) << 16) | 0);
+	OUT_RING(chan, RING_3D(0x1d88, 1));
+	OUT_RING(chan, (1 << 12) | h);
+
+	if(!nvfx->is_nv4x) {
+		/* Wonder why this is needed, context should all be set to zero on init */
+		/* TODO: we can most likely remove this, after putting it in context init */
+		OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_TX_ORIGIN, 1));
+		OUT_RING(chan, 0);
+	}
+}
+
+void
+nvfx_framebuffer_relocate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
+	rt_flags |= NOUVEAU_BO_DUMMY;
+	MARK_RING(chan, 20, 20); /* at most 5 targets x 4 OUT_RELOCs each */
+	/* DO_ emits the DMA-object and offset relocations for one target; targets with a NULL bo are skipped. */
+#define DO_(var, pfx, name) \
+	if(var.bo) { \
+		OUT_RELOC(chan, var.bo, RING_3D(pfx##TCL_DMA_##name, 1), rt_flags, 0, 0); \
+		OUT_RELOC(chan, var.bo, 0, \
+			rt_flags | NOUVEAU_BO_OR, \
+			chan->vram->handle, chan->gart->handle); \
+		OUT_RELOC(chan, var.bo, RING_3D(pfx##TCL_##name##_OFFSET, 1), rt_flags, 0, 0); \
+		OUT_RELOC(chan, var.bo, \
+			var.offset, rt_flags | NOUVEAU_BO_LOW, \
+			0, 0); \
+	}
+
+#define DO(pfx, num) DO_(nvfx->hw_rt[num], pfx, COLOR##num)
+	DO(NV34, 0);
+	DO(NV34, 1);
+	DO(NV40, 2);
+	DO(NV40, 3);
+
+	DO_(nvfx->hw_zeta, NV34, ZETA);
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_state_rasterizer.c b/src/gallium/drivers/nvfx/nvfx_state_rasterizer.c
new file mode 100644
index 0000000000..7f14ae85d5
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_rasterizer.c
@@ -0,0 +1,9 @@
+#include "nvfx_context.h"
+
+void
+nvfx_state_rasterizer_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	sb_emit(chan, nvfx->rasterizer->sb, nvfx->rasterizer->sb_len); /* emits the bound rasterizer CSO's state buffer */
+}
+
diff --git a/src/gallium/drivers/nvfx/nvfx_state_scissor.c b/src/gallium/drivers/nvfx/nvfx_state_scissor.c
new file mode 100644
index 0000000000..9077266120
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_scissor.c
@@ -0,0 +1,23 @@
+#include "nvfx_context.h"
+
+void
+nvfx_state_scissor_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct pipe_rasterizer_state *rast = &nvfx->rasterizer->pipe;
+	struct pipe_scissor_state *s = &nvfx->scissor;
+	/* Nothing to emit when scissoring is off now and was already off at the last emission. */
+	if ((rast->scissor == 0 && nvfx->state.scissor_enabled == 0))
+		return;
+	nvfx->state.scissor_enabled = rast->scissor;
+	/* One method word plus two data words, each packing (extent << 16) | origin. */
+	WAIT_RING(chan, 3);
+	OUT_RING(chan, RING_3D(NV34TCL_SCISSOR_HORIZ, 2));
+	if (nvfx->state.scissor_enabled) {
+		OUT_RING(chan, ((s->maxx - s->minx) << 16) | s->minx);
+		OUT_RING(chan, ((s->maxy - s->miny) << 16) | s->miny);
+	} else {
+		OUT_RING(chan, 4096 << 16); /* disabled: extent 4096 at origin 0 */
+		OUT_RING(chan, 4096 << 16);
+	}
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_state_stipple.c b/src/gallium/drivers/nvfx/nvfx_state_stipple.c
new file mode 100644
index 0000000000..4da968f093
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_stipple.c
@@ -0,0 +1,26 @@
+#include "nvfx_context.h"
+
+void
+nvfx_state_stipple_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct pipe_rasterizer_state *rast = &nvfx->rasterizer->pipe;
+	/* NOTE(review): state.stipple_enabled is read here but never updated in this function — confirm where it is set. */
+	if ((rast->poly_stipple_enable == 0 && nvfx->state.stipple_enabled == 0))
+		return;
+
+	if (rast->poly_stipple_enable) {
+		unsigned i;
+		/* 2 words for the enable plus 1 + 32 words for the pattern. */
+		WAIT_RING(chan, 35);
+		OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_ENABLE, 1));
+		OUT_RING(chan, 1);
+		OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32));
+		for (i = 0; i < 32; i++)
+			OUT_RING(chan, nvfx->stipple[i]);
+	} else {
+		WAIT_RING(chan, 2);
+		OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_ENABLE, 1));
+		OUT_RING(chan, 0);
+	}
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_state_viewport.c b/src/gallium/drivers/nvfx/nvfx_state_viewport.c
new file mode 100644
index 0000000000..e983b16f32
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_viewport.c
@@ -0,0 +1,35 @@
+#include "nvfx_context.h"
+
+void
+nvfx_state_viewport_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct pipe_viewport_state *vpt = &nvfx->viewport;
+	/* Both branches emit 1 + 8 + 1 + 1 = 11 words. */
+	WAIT_RING(chan, 11);
+	if(nvfx->render_mode == HW) {
+		OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_TRANSLATE_X, 8)); /* 4 translate then 4 scale floats */
+		OUT_RINGf(chan, vpt->translate[0]);
+		OUT_RINGf(chan, vpt->translate[1]);
+		OUT_RINGf(chan, vpt->translate[2]);
+		OUT_RINGf(chan, vpt->translate[3]);
+		OUT_RINGf(chan, vpt->scale[0]);
+		OUT_RINGf(chan, vpt->scale[1]);
+		OUT_RINGf(chan, vpt->scale[2]);
+		OUT_RINGf(chan, vpt->scale[3]);
+		OUT_RING(chan, RING_3D(0x1d78, 1));
+		OUT_RING(chan, 1);
+	} else {
+		OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_TRANSLATE_X, 8)); /* non-HW mode: zero translate, unit scale */
+		OUT_RINGf(chan, 0.0f);
+		OUT_RINGf(chan, 0.0f);
+		OUT_RINGf(chan, 0.0f);
+		OUT_RINGf(chan, 0.0f);
+		OUT_RINGf(chan, 1.0f);
+		OUT_RINGf(chan, 1.0f);
+		OUT_RINGf(chan, 1.0f);
+		OUT_RINGf(chan, 1.0f);
+		OUT_RING(chan, RING_3D(0x1d78, 1));
+		OUT_RING(chan, nvfx->is_nv4x ? 0x110 : 1);
+	}
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_state_zsa.c b/src/gallium/drivers/nvfx/nvfx_state_zsa.c
new file mode 100644
index 0000000000..608605d32b
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_zsa.c
@@ -0,0 +1,21 @@
+#include "nvfx_context.h"
+
+void
+nvfx_state_zsa_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	sb_emit(chan, nvfx->zsa->sb, nvfx->zsa->sb_len); /* emits the bound depth/stencil/alpha CSO's state buffer */
+}
+
+void
+nvfx_state_sr_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct pipe_stencil_ref *sr = &nvfx->stencil_ref;
+	/* Two method/data pairs: ref_value[0] for the front face, ref_value[1] for the back face. */
+	WAIT_RING(chan, 4);
+	OUT_RING(chan, RING_3D(NV34TCL_STENCIL_FRONT_FUNC_REF, 1));
+	OUT_RING(chan, sr->ref_value[0]);
+	OUT_RING(chan, RING_3D(NV34TCL_STENCIL_BACK_FUNC_REF, 1));
+	OUT_RING(chan, sr->ref_value[1]);
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_surface.c b/src/gallium/drivers/nvfx/nvfx_surface.c
new file mode 100644
index 0000000000..2e115650ae
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_surface.c
@@ -0,0 +1,61 @@
+
+/**************************************************************************
+ *
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "nvfx_context.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+
+static void
+nvfx_surface_copy(struct pipe_context *pipe,
+		  struct pipe_surface *dest, unsigned destx, unsigned desty,
+		  struct pipe_surface *src, unsigned srcx, unsigned srcy,
+		  unsigned width, unsigned height)
+{
+	/* Forward the rectangle copy to the 2D engine object owned by the screen. */
+	struct nv04_surface_2d *blitter = nvfx_context(pipe)->screen->eng2d;
+
+	blitter->copy(blitter, dest, destx, desty, src, srcx, srcy, width, height);
+}
+
+static void
+nvfx_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest,
+		  unsigned destx, unsigned desty, unsigned width,
+		  unsigned height, unsigned value)
+{
+	/* Forward the rectangle fill to the 2D engine object owned by the screen. */
+	struct nv04_surface_2d *blitter = nvfx_context(pipe)->screen->eng2d;
+
+	blitter->fill(blitter, dest, destx, desty, width, height, value);
+}
+
+void
+nvfx_init_surface_functions(struct nvfx_context *nvfx)
+{
+	nvfx->pipe.surface_copy = nvfx_surface_copy; /* install this file's copy hook on the context */
+	nvfx->pipe.surface_fill = nvfx_surface_fill; /* install this file's fill hook on the context */
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_tex.h b/src/gallium/drivers/nvfx/nvfx_tex.h
new file mode 100644
index 0000000000..69187a79e7
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_tex.h
@@ -0,0 +1,133 @@
+#ifndef NVFX_TEX_H_
+#define NVFX_TEX_H_
+
+static inline unsigned
+nvfx_tex_wrap_mode(unsigned wrap) {
+	unsigned ret;
+	/* Translate a PIPE_TEX_WRAP_* value into the matching hardware wrap constant. */
+	switch (wrap) {
+	case PIPE_TEX_WRAP_REPEAT:
+		ret = NV34TCL_TX_WRAP_S_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_REPEAT:
+		ret = NV34TCL_TX_WRAP_S_MIRRORED_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+		ret = NV34TCL_TX_WRAP_S_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+		ret = NV34TCL_TX_WRAP_S_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_CLAMP:
+		ret = NV34TCL_TX_WRAP_S_CLAMP;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: /* the three MIRROR_CLAMP* modes use NV40TCL_* constants */
+		ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+		ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP:
+		ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP;
+		break;
+	default:
+		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
+		ret = NV34TCL_TX_WRAP_S_REPEAT; /* unrecognised modes are logged and treated as REPEAT */
+		break;
+	}
+	/* Shift right by the S-field shift so the caller receives the unshifted mode value. */
+	return ret >> NV34TCL_TX_WRAP_S_SHIFT;
+}
+
+static inline unsigned
+nvfx_tex_wrap_compare_mode(const struct pipe_sampler_state* cso)
+{
+	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { /* only R_TO_TEXTURE selects a compare function */
+		switch (cso->compare_func) { /* one-to-one PIPE_FUNC_* -> NV34TCL_TX_WRAP_RCOMP_* mapping */
+		case PIPE_FUNC_NEVER:
+			return NV34TCL_TX_WRAP_RCOMP_NEVER;
+		case PIPE_FUNC_GREATER:
+			return NV34TCL_TX_WRAP_RCOMP_GREATER;
+		case PIPE_FUNC_EQUAL:
+			return NV34TCL_TX_WRAP_RCOMP_EQUAL;
+		case PIPE_FUNC_GEQUAL:
+			return NV34TCL_TX_WRAP_RCOMP_GEQUAL;
+		case PIPE_FUNC_LESS:
+			return NV34TCL_TX_WRAP_RCOMP_LESS;
+		case PIPE_FUNC_NOTEQUAL:
+			return NV34TCL_TX_WRAP_RCOMP_NOTEQUAL;
+		case PIPE_FUNC_LEQUAL:
+			return NV34TCL_TX_WRAP_RCOMP_LEQUAL;
+		case PIPE_FUNC_ALWAYS:
+			return NV34TCL_TX_WRAP_RCOMP_ALWAYS;
+		default:
+			break; /* unknown function: fall through to the 0 result below */
+		}
+	}
+	return 0; /* any other compare mode, or an unrecognised function, contributes no bits */
+}
+
+static inline unsigned nvfx_tex_filter(const struct pipe_sampler_state* cso)
+{
+	unsigned filter = 0; /* accumulates one MAGNIFY_* and one MINIFY_* constant */
+	switch (cso->mag_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		filter |= NV34TCL_TX_FILTER_MAGNIFY_LINEAR;
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default: /* anything that is not LINEAR is treated as NEAREST */
+		filter |= NV34TCL_TX_FILTER_MAGNIFY_NEAREST;
+		break;
+	}
+	/* The minify constant depends on both the image filter and the mip filter. */
+	switch (cso->min_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST;
+			break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default: /* no (or unknown) mip filter: plain LINEAR */
+			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR;
+			break;
+		}
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST;
+		break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default: /* no (or unknown) mip filter: plain NEAREST */
+			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST;
+			break;
+		}
+		break;
+	}
+	return filter;
+}
+
+static inline unsigned nvfx_tex_border_color(const float* border_color)
+{
+	/* Pack elements 3,0,1,2 into bits 31-24, 23-16, 15-8, 7-0; the casts keep every shift in unsigned arithmetic. */
+	return (((unsigned)float_to_ubyte(border_color[3]) << 24) |
+		    ((unsigned)float_to_ubyte(border_color[0]) << 16) |
+		    ((unsigned)float_to_ubyte(border_color[1]) <<  8) | (unsigned)float_to_ubyte(border_color[2]));
+}
+
+struct nvfx_sampler_state { /* five 32-bit words of sampler state; NOTE(review): field meanings below are inferred from names only — confirm against the code that fills them */
+	uint32_t fmt;
+	uint32_t wrap; /* presumably wrap modes plus compare function — confirm */
+	uint32_t en;
+	uint32_t filt; /* presumably the value built by nvfx_tex_filter() — confirm */
+	uint32_t bcol; /* presumably the value built by nvfx_tex_border_color() — confirm */
+};
+
+#endif /* NVFX_TEX_H_ */
diff --git a/src/gallium/drivers/nvfx/nvfx_transfer.c b/src/gallium/drivers/nvfx/nvfx_transfer.c
new file mode 100644
index 0000000000..a776ab5831
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_transfer.c
@@ -0,0 +1,205 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "nouveau/nouveau_winsys.h"
+#include "nvfx_context.h"
+#include "nvfx_screen.h"
+#include "nvfx_state.h"
+#include "nvfx_resource.h"
+#include "nvfx_transfer.h"
+
+struct nvfx_transfer {
+	struct pipe_transfer base; /* generic transfer; callers get &base and it is cast back to nvfx_transfer */
+	struct pipe_surface *surface; /* surface that gets mapped: of the resource itself (direct) or of a temporary texture */
+	boolean direct; /* true: resource accessed in place; false: access goes through a temporary copy */
+};
+
+static void
+nvfx_compatible_transfer_tex(struct pipe_resource *pt, unsigned width, unsigned height,
+			     unsigned bind,
+                             struct pipe_resource *template)
+{
+	memset(template, 0, sizeof(struct pipe_resource)); /* start from an all-zero template, then fill in the fields below */
+	template->target = pt->target; /* target, format and sample count are copied from the original resource */
+	template->format = pt->format;
+	template->width0 = width; /* sized to the requested width/height, not to the original */
+	template->height0 = height;
+	template->depth0 = 1;
+	template->last_level = 0; /* single mip level */
+	template->nr_samples = pt->nr_samples;
+	template->bind = bind;
+	template->_usage = PIPE_USAGE_DYNAMIC;
+	template->flags = NVFX_RESOURCE_FLAG_LINEAR; /* request the driver's LINEAR resource flag */
+}
+
+
+static unsigned nvfx_transfer_bind_flags( unsigned transfer_usage )
+{
+	/* WRITE usage maps to the blit-source bind flag, READ usage to the
+	 * blit-destination bind flag; the two are independent. */
+	unsigned for_write = (transfer_usage & PIPE_TRANSFER_WRITE) ?
+				PIPE_BIND_BLIT_SOURCE : 0;
+	unsigned for_read = (transfer_usage & PIPE_TRANSFER_READ) ?
+				PIPE_BIND_BLIT_DESTINATION : 0;
+
+	/* Either, both, or neither flag may end up set. */
+	return for_write | for_read;
+}
+
+/* Create a transfer for one 2D box of a miptree level. LINEAR resources that are DYNAMIC (or any
+ * LINEAR resource when NOUVEAU_NO_TRANSFER is set) are accessed in place; everything else goes
+ * through a temporary linear texture, pre-filled by a 2D-engine copy for READ. NULL on failure. */
+struct pipe_transfer *
+nvfx_miptree_transfer_new(struct pipe_context *pipe,
+			  struct pipe_resource *pt,
+			  struct pipe_subresource sr,
+			  unsigned usage,
+			  const struct pipe_box *box)
+{
+	struct pipe_screen *pscreen = pipe->screen;
+	struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
+	struct nvfx_transfer *tx;
+	struct pipe_resource tx_tex_template, *tx_tex;
+	static int no_transfer = -1; /* debug option, read once on first use */
+	unsigned bind = nvfx_transfer_bind_flags(usage);
+	if(no_transfer < 0)
+		no_transfer = debug_get_bool_option("NOUVEAU_NO_TRANSFER", FALSE);
+
+	tx = CALLOC_STRUCT(nvfx_transfer);
+	if (!tx)
+		return NULL;
+
+	/* Don't handle 3D transfers yet. */
+	assert(box->depth == 1);
+
+	pipe_resource_reference(&tx->base.resource, pt); /* released in transfer_del or at 'fail' below */
+	tx->base.sr = sr;
+	tx->base.usage = usage;
+	tx->base.box = *box;
+	tx->base.stride = mt->level[sr.level].pitch;
+
+	/* Direct access to texture */
+	if ((pt->_usage == PIPE_USAGE_DYNAMIC ||
+	     no_transfer) &&
+	    pt->flags & NVFX_RESOURCE_FLAG_LINEAR)
+	{
+		tx->direct = true;
+		/* XXX: just call the internal nvfx function. */
+		tx->surface = pscreen->get_tex_surface(pscreen, pt,
+	                                               sr.face, sr.level,
+						       box->z,
+	                                               bind);
+		if (!tx->surface) /* map/unmap dereference the surface, so never hand out a transfer without one */
+			goto fail;
+		return &tx->base;
+	}
+
+	tx->direct = false;
+
+	nvfx_compatible_transfer_tex(pt, box->width, box->height, bind, &tx_tex_template);
+
+	tx_tex = pscreen->resource_create(pscreen, &tx_tex_template);
+	if (!tx_tex)
+		goto fail;
+
+	tx->base.stride = ((struct nvfx_miptree*)tx_tex)->level[0].pitch; /* callers map the temporary, so report its pitch */
+
+	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
+	                                       0, 0, 0,
+	                                       bind);
+
+	pipe_resource_reference(&tx_tex, NULL); /* drop the local reference to the temporary texture */
+
+	if (!tx->surface)
+		goto fail;
+
+	if (usage & PIPE_TRANSFER_READ) {
+		struct nvfx_screen *nvscreen = nvfx_screen(pscreen);
+		struct pipe_surface *src;
+
+		src = pscreen->get_tex_surface(pscreen, pt,
+	                                       sr.face, sr.level, box->z,
+	                                       PIPE_BIND_BLIT_SOURCE);
+
+		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
+		/* TODO: Check if SIFM can un-swizzle */
+		nvscreen->eng2d->copy(nvscreen->eng2d,
+		                      tx->surface, 0, 0,
+		                      src,
+				      box->x, box->y,
+		                      box->width, box->height);
+
+		pipe_surface_reference(&src, NULL);
+	}
+
+	return &tx->base;
+
+fail:
+	/* Single error exit: release the reference taken on pt so it does not leak, then free the transfer. */
+	pipe_resource_reference(&tx->base.resource, NULL);
+	FREE(tx);
+	return NULL;
+}
+
+void
+nvfx_miptree_transfer_del(struct pipe_context *pipe,
+			  struct pipe_transfer *ptx)
+{
+	struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx;
+	/* Destroy a transfer; a non-direct WRITE transfer is first copied back into the resource. */
+	if (!tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE)) {
+		struct pipe_screen *pscreen = pipe->screen;
+		struct nvfx_screen *nvscreen = nvfx_screen(pscreen);
+		struct pipe_surface *dst;
+
+		dst = pscreen->get_tex_surface(pscreen,
+					       ptx->resource,
+	                                       ptx->sr.face,
+					       ptx->sr.level,
+					       ptx->box.z,
+	                                       PIPE_BIND_BLIT_DESTINATION);
+
+		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
+		nvscreen->eng2d->copy(nvscreen->eng2d,
+		                      dst, ptx->box.x, ptx->box.y,
+		                      tx->surface, 0, 0,
+		                      ptx->box.width, ptx->box.height);
+
+		pipe_surface_reference(&dst, NULL); /* NOTE(review): dst is not checked for NULL before the copy */
+	}
+
+	pipe_surface_reference(&tx->surface, NULL); /* release the transfer's surface */
+	pipe_resource_reference(&ptx->resource, NULL); /* release the resource reference held by the transfer */
+	FREE(ptx);
+}
+
+/* Map the transfer's buffer; returns a pointer to the box origin, or NULL if the mapping failed. */
+void *
+nvfx_miptree_transfer_map(struct pipe_context *pipe, struct pipe_transfer *ptx)
+{
+	struct pipe_screen *pscreen = pipe->screen;
+	struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx;
+	struct nv04_surface *ns = (struct nv04_surface *)tx->surface;
+	struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->surface->texture;
+	uint8_t *map = nouveau_screen_bo_map(pscreen, mt->base.bo,
+					     nouveau_screen_transfer_flags(ptx->usage));
+	if (!map) /* do not turn a failed map into a bogus non-NULL pointer */
+		return NULL;
+	if (!tx->direct) /* temporary surface: only the surface offset applies, no box offset */
+		return map + ns->base.offset;
+	return map + ns->base.offset + ptx->box.y * ns->pitch + /* direct: also step to the box's row and column */
+		ptx->box.x * util_format_get_blocksize(ptx->resource->format);
+}
+
+void
+nvfx_miptree_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *ptx)
+{
+	struct pipe_screen *pscreen = pipe->screen;
+	struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx;
+	struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->surface->texture;
+	/* Unmap the buffer object behind the transfer surface's texture (the same one transfer_map mapped). */
+	nouveau_screen_bo_unmap(pscreen, mt->base.bo);
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_transfer.h b/src/gallium/drivers/nvfx/nvfx_transfer.h
new file mode 100644
index 0000000000..3e3317b2c7
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_transfer.h
@@ -0,0 +1,26 @@
+
+#ifndef NVFX_TRANSFER_H
+#define NVFX_TRANSFER_H
+
+#include "util/u_transfer.h"
+#include "pipe/p_state.h"
+
+/* Transfer entry points for miptree resources; the definitions are in nvfx_transfer.c. */
+struct pipe_transfer *
+nvfx_miptree_transfer_new(struct pipe_context *pcontext,
+			  struct pipe_resource *pt,
+			  struct pipe_subresource sr,
+			  unsigned usage,
+			  const struct pipe_box *box);
+void
+nvfx_miptree_transfer_del(struct pipe_context *pcontext,
+			  struct pipe_transfer *ptx);
+void *
+nvfx_miptree_transfer_map(struct pipe_context *pcontext,
+			  struct pipe_transfer *ptx);
+void
+nvfx_miptree_transfer_unmap(struct pipe_context *pcontext,
+			    struct pipe_transfer *ptx);
+
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_vbo.c b/src/gallium/drivers/nvfx/nvfx_vbo.c
new file mode 100644
index 0000000000..b8e94885f0
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_vbo.c
@@ -0,0 +1,621 @@
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+
+#include "nvfx_context.h"
+#include "nvfx_state.h"
+#include "nvfx_resource.h"
+
+#include "nouveau/nouveau_channel.h"
+#include "nouveau/nouveau_pushbuf.h"
+#include "nouveau/nouveau_util.h"
+
+static INLINE int
+nvfx_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp)
+{
+	switch (pipe) { /* first pass: hardware component type, written to *fmt */
+	case PIPE_FORMAT_R32_FLOAT:
+	case PIPE_FORMAT_R32G32_FLOAT:
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+		*fmt = NV34TCL_VTXFMT_TYPE_FLOAT;
+		break;
+	case PIPE_FORMAT_R16_FLOAT:
+	case PIPE_FORMAT_R16G16_FLOAT:
+	case PIPE_FORMAT_R16G16B16_FLOAT:
+	case PIPE_FORMAT_R16G16B16A16_FLOAT:
+		*fmt = NV34TCL_VTXFMT_TYPE_HALF;
+		break;
+	case PIPE_FORMAT_R8_UNORM:
+	case PIPE_FORMAT_R8G8_UNORM:
+	case PIPE_FORMAT_R8G8B8_UNORM:
+	case PIPE_FORMAT_R8G8B8A8_UNORM:
+		*fmt = NV34TCL_VTXFMT_TYPE_UBYTE;
+		break;
+	case PIPE_FORMAT_R16_SSCALED:
+	case PIPE_FORMAT_R16G16_SSCALED:
+	case PIPE_FORMAT_R16G16B16_SSCALED:
+	case PIPE_FORMAT_R16G16B16A16_SSCALED:
+		*fmt = NV34TCL_VTXFMT_TYPE_USHORT; /* NOTE(review): SSCALED formats are given the USHORT type here — confirm this is intended */
+		break;
+	default:
+		NOUVEAU_ERR("Unknown format %s\n", util_format_name(pipe));
+		return 1; /* non-zero: format not supported; *fmt and *ncomp are left unwritten */
+	}
+	/* Second pass over the same format: number of components, written to *ncomp. */
+	switch (pipe) {
+	case PIPE_FORMAT_R8_UNORM:
+	case PIPE_FORMAT_R32_FLOAT:
+	case PIPE_FORMAT_R16_FLOAT:
+	case PIPE_FORMAT_R16_SSCALED:
+		*ncomp = 1;
+		break;
+	case PIPE_FORMAT_R8G8_UNORM:
+	case PIPE_FORMAT_R32G32_FLOAT:
+	case PIPE_FORMAT_R16G16_FLOAT:
+	case PIPE_FORMAT_R16G16_SSCALED:
+		*ncomp = 2;
+		break;
+	case PIPE_FORMAT_R8G8B8_UNORM:
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+	case PIPE_FORMAT_R16G16B16_FLOAT:
+	case PIPE_FORMAT_R16G16B16_SSCALED:
+		*ncomp = 3;
+		break;
+	case PIPE_FORMAT_R8G8B8A8_UNORM:
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+	case PIPE_FORMAT_R16G16B16A16_FLOAT:
+	case PIPE_FORMAT_R16G16B16A16_SSCALED:
+		*ncomp = 4;
+		break;
+	default: /* not reachable with the case lists as written: both switches accept the same formats */
+		NOUVEAU_ERR("Unknown format %s\n", util_format_name(pipe));
+		return 1;
+	}
+
+	return 0; /* zero: *fmt and *ncomp are both valid */
+}
+
+static boolean
+nvfx_vbo_set_idxbuf(struct nvfx_context *nvfx, struct pipe_resource *ib,
+		    unsigned ib_size)
+{
+	unsigned type;
+	/* Record ib as the context's index buffer when it is usable; returns TRUE only in that case. */
+	if (!ib) { /* no index buffer: clear the recorded one and poison the format */
+		nvfx->idxbuf = NULL;
+		nvfx->idxbuf_format = 0xdeadbeef;
+		return FALSE;
+	}
+
+	if (!nvfx->screen->index_buffer_reloc_flags || ib_size == 1) /* NOTE(review): returns without touching nvfx->idxbuf, so a previously recorded buffer stays set */
+		return FALSE;
+
+	switch (ib_size) { /* ib_size is the index size in bytes */
+	case 2:
+		type = NV34TCL_IDXBUF_FORMAT_TYPE_U16;
+		break;
+	case 4:
+		type = NV34TCL_IDXBUF_FORMAT_TYPE_U32;
+		break;
+	default:
+		return FALSE;
+	}
+
+	if (ib != nvfx->idxbuf ||
+	    type != nvfx->idxbuf_format) { /* only mark arrays dirty when buffer or format actually changed */
+		nvfx->dirty |= NVFX_NEW_ARRAYS;
+		nvfx->idxbuf = ib;
+		nvfx->idxbuf_format = type;
+	}
+
+	return TRUE;
+}
+
+// Emit a constant vertex attribute: the first ncomp floats of the element are read from the buffer and pushed as immediates; the data must be floating point.
+static inline void
+nvfx_vbo_static_attrib(struct nvfx_context *nvfx,
+		       int attrib, struct pipe_vertex_element *ve,
+		       struct pipe_vertex_buffer *vb, unsigned ncomp)
+{
+	struct pipe_transfer *transfer;
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	void *map;
+	float *v;
+
+	map  = pipe_buffer_map(&nvfx->pipe, vb->buffer, PIPE_TRANSFER_READ, &transfer); /* NOTE(review): the result is not checked for NULL */
+	map = (uint8_t *) map + vb->buffer_offset + ve->src_offset; /* advance to this element's data */
+
+	v = map;
+
+	switch (ncomp) { /* one method per component count; any other ncomp emits nothing */
+	case 4:
+		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_4F_X(attrib), 4));
+		OUT_RING(chan, fui(v[0]));
+		OUT_RING(chan, fui(v[1]));
+		OUT_RING(chan,  fui(v[2]));
+		OUT_RING(chan,  fui(v[3]));
+		break;
+	case 3:
+		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_3F_X(attrib), 3));
+		OUT_RING(chan,  fui(v[0]));
+		OUT_RING(chan,  fui(v[1]));
+		OUT_RING(chan,  fui(v[2]));
+		break;
+	case 2:
+		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_2F_X(attrib), 2));
+		OUT_RING(chan,  fui(v[0]));
+		OUT_RING(chan,  fui(v[1]));
+		break;
+	case 1:
+		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_1F(attrib), 1));
+		OUT_RING(chan,  fui(v[0]));
+		break;
+	}
+
+	pipe_buffer_unmap(&nvfx->pipe, vb->buffer, transfer);
+}
+
+void
+nvfx_draw_arrays(struct pipe_context *pipe,
+		 unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	unsigned restart = 0;
+	/* Non-indexed draw: clear any recorded index buffer, then draw in hardware or fall back to swtnl. */
+	nvfx_vbo_set_idxbuf(nvfx, NULL, 0);
+	if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) {
+		nvfx_draw_elements_swtnl(pipe, NULL, 0,
+                                           mode, start, count);
+                return;
+	}
+
+	while (count) { /* one BEGIN/END-bracketed chunk per iteration until every vertex is submitted */
+		unsigned vc, nr, avail;
+
+		nvfx_state_emit(nvfx);
+
+		avail = AVAIL_RING(chan);
+		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
+
+		vc = nouveau_vbuf_split(avail, 6, 256,
+					mode, start, count, &restart);
+		if (!vc) { /* split returned 0 vertices for this chunk: fire the ring and retry */
+			FIRE_RING(chan);
+			continue;
+		}
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, nvgl_primitive(mode));
+
+		nr = (vc & 0xff); /* leading batch of fewer than 256 vertices, if any */
+		if (nr) {
+			OUT_RING(chan, RING_3D(NV34TCL_VB_VERTEX_BATCH, 1));
+			OUT_RING  (chan, ((nr - 1) << 24) | start); /* (count - 1) in the top byte, first vertex below it */
+			start += nr;
+		}
+
+		nr = vc >> 8; /* number of full 256-vertex batches */
+		while (nr) {
+			unsigned push = nr > 2047 ? 2047 : nr; /* cap each RING_3D_NI packet at 2047 words */
+
+			nr -= push;
+
+			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_VERTEX_BATCH, push));
+			while (push--) {
+				OUT_RING(chan, ((0x100 - 1) << 24) | start);
+				start += 0x100;
+			}
+		}
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, 0); /* BEGIN_END with 0 closes the primitive */
+
+		count -= vc;
+		start = restart; /* next chunk starts where the split routine said to resume */
+	}
+
+	pipe->flush(pipe, 0, NULL);
+}
+
+static INLINE void
+nvfx_draw_elements_u08(struct nvfx_context *nvfx, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	/* Draw with 8-bit indices read from the mapped buffer ib, pushed inline into the ring. */
+	while (count) {
+		uint8_t *elts = (uint8_t *)ib + start;
+		unsigned vc, push, restart = 0, avail;
+
+		nvfx_state_emit(nvfx);
+
+		avail = AVAIL_RING(chan);
+		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
+
+		vc = nouveau_vbuf_split(avail, 6, 2,
+					mode, start, count, &restart);
+		if (vc == 0) { /* split returned 0 indices for this chunk: fire the ring and retry */
+			FIRE_RING(chan);
+			continue;
+		}
+		count -= vc;
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, nvgl_primitive(mode));
+
+		if (vc & 1) { /* odd count: send one index on its own so the rest can go out in pairs */
+			OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1));
+			OUT_RING  (chan, elts[0]);
+			elts++; vc--;
+		}
+
+		while (vc) {
+			unsigned i;
+
+			push = MIN2(vc, 2047 * 2); /* up to 2047 words, two indices per word */
+
+			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1));
+			for (i = 0; i < push; i+=2)
+				OUT_RING(chan, (elts[i+1] << 16) | elts[i]); /* earlier index in the low half-word */
+
+			vc -= push;
+			elts += push;
+		}
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, 0);
+
+		start = restart;
+	}
+}
+
+static INLINE void
+nvfx_draw_elements_u16(struct nvfx_context *nvfx, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	/* Draw with 16-bit indices read from the mapped buffer ib, pushed inline into the ring. */
+	while (count) {
+		uint16_t *elts = (uint16_t *)ib + start;
+		unsigned vc, push, restart = 0, avail;
+
+		nvfx_state_emit(nvfx);
+
+		avail = AVAIL_RING(chan);
+		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
+
+		vc = nouveau_vbuf_split(avail, 6, 2,
+					mode, start, count, &restart);
+		if (vc == 0) { /* split returned 0 indices for this chunk: fire the ring and retry */
+			FIRE_RING(chan);
+			continue;
+		}
+		count -= vc;
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, nvgl_primitive(mode));
+
+		if (vc & 1) { /* odd count: send one index on its own so the rest can go out in pairs */
+			OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1));
+			OUT_RING  (chan, elts[0]);
+			elts++; vc--;
+		}
+
+		while (vc) {
+			unsigned i;
+
+			push = MIN2(vc, 2047 * 2); /* up to 2047 words, two indices per word */
+
+			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1));
+			for (i = 0; i < push; i+=2)
+				OUT_RING(chan, (elts[i+1] << 16) | elts[i]); /* earlier index in the low half-word */
+
+			vc -= push;
+			elts += push;
+		}
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, 0);
+
+		start = restart;
+	}
+}
+
+static INLINE void
+nvfx_draw_elements_u32(struct nvfx_context *nvfx, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	/* Draw with 32-bit indices read from the mapped buffer ib, copied into the ring one word per index. */
+	while (count) {
+		uint32_t *elts = (uint32_t *)ib + start;
+		unsigned vc, push, restart = 0, avail;
+
+		nvfx_state_emit(nvfx);
+
+		avail = AVAIL_RING(chan);
+		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
+
+		vc = nouveau_vbuf_split(avail, 5, 1,
+					mode, start, count, &restart);
+		if (vc == 0) { /* split returned 0 indices for this chunk: fire the ring and retry */
+			FIRE_RING(chan);
+			continue;
+		}
+		count -= vc;
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, nvgl_primitive(mode));
+
+		while (vc) {
+			push = MIN2(vc, 2047); /* cap each RING_3D_NI packet at 2047 words */
+
+			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U32, push));
+			OUT_RINGp    (chan, elts, push);
+
+			vc -= push;
+			elts += push;
+		}
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, 0);
+
+		start = restart;
+	}
+}
+
+static void
+nvfx_draw_elements_inline(struct pipe_context *pipe,
+			  struct pipe_resource *ib, unsigned ib_size,
+			  unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct pipe_transfer *transfer;
+	void *map;
+	/* Map the index buffer for reading and push its indices inline, dispatching on the index size in bytes. */
+	map = pipe_buffer_map(pipe, ib, PIPE_TRANSFER_READ, &transfer);
+	if (!map) { /* test the mapping itself, not the buffer that was passed in */
+		NOUVEAU_ERR("failed mapping ib\n");
+		return;
+	}
+
+	switch (ib_size) {
+	case 1:
+		nvfx_draw_elements_u08(nvfx, map, mode, start, count);
+		break;
+	case 2:
+		nvfx_draw_elements_u16(nvfx, map, mode, start, count);
+		break;
+	case 4:
+		nvfx_draw_elements_u32(nvfx, map, mode, start, count);
+		break;
+	default: /* unsupported index size: log it and draw nothing */
+		NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size);
+		break;
+	}
+
+	pipe_buffer_unmap(pipe, ib, transfer);
+}
+
+static void
+nvfx_draw_elements_vbo(struct pipe_context *pipe,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	unsigned restart = 0;
+	/* Indexed draw from the index buffer recorded in the context: only (count, start) batch words are pushed. */
+	while (count) {
+		unsigned nr, vc, avail;
+
+		nvfx_state_emit(nvfx);
+
+		avail = AVAIL_RING(chan);
+		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
+
+		vc = nouveau_vbuf_split(avail, 6, 256,
+					mode, start, count, &restart);
+		if (!vc) { /* split returned 0 indices for this chunk: fire the ring and retry */
+			FIRE_RING(chan);
+			continue;
+		}
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, nvgl_primitive(mode));
+
+		nr = (vc & 0xff); /* leading batch of fewer than 256 indices, if any */
+		if (nr) {
+			OUT_RING(chan, RING_3D(NV34TCL_VB_INDEX_BATCH, 1));
+			OUT_RING  (chan, ((nr - 1) << 24) | start); /* (count - 1) in the top byte, first index below it */
+			start += nr;
+		}
+
+		nr = vc >> 8; /* number of full 256-index batches */
+		while (nr) {
+			unsigned push = nr > 2047 ? 2047 : nr; /* cap each RING_3D_NI packet at 2047 words */
+
+			nr -= push;
+
+			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_INDEX_BATCH, push));
+			while (push--) {
+				OUT_RING(chan, ((0x100 - 1) << 24) | start);
+				start += 0x100;
+			}
+		}
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, 0);
+
+		count -= vc;
+		start = restart;
+	}
+}
+
+void
+nvfx_draw_elements(struct pipe_context *pipe,
+		   struct pipe_resource *indexBuffer, unsigned indexSize,
+		   unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	boolean idxbuf;
+	/* Indexed draw entry point: swtnl fallback, index-buffer path, or inline indices. */
+	idxbuf = nvfx_vbo_set_idxbuf(nvfx, indexBuffer, indexSize); /* TRUE only when the buffer was recorded as a 2- or 4-byte index buffer */
+	if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) {
+		nvfx_draw_elements_swtnl(pipe, indexBuffer, indexSize,
+                                           mode, start, count);
+		return;
+	}
+
+	if (idxbuf) {
+		nvfx_draw_elements_vbo(pipe, mode, start, count);
+	} else { /* otherwise map the buffer and push the indices through the ring */
+		nvfx_draw_elements_inline(pipe, indexBuffer, indexSize,
+					  mode, start, count);
+	}
+
+	pipe->flush(pipe, 0, NULL);
+}
+
+boolean
+nvfx_vbo_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct pipe_resource *ib = nvfx->idxbuf;
+	unsigned ib_format = nvfx->idxbuf_format;
+	int i;
+	int elements = MAX2(nvfx->vtxelt->num_elements, nvfx->hw_vtxelt_nr); /* also covers slots programmed last time so they get reset */
+	uint32_t vtxfmt[16]; /* NOTE(review): assumes elements <= 16; not checked here */
+	unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD;
+	/* Emit vertex formats, vertex buffer addresses and (if set) the index buffer; FALSE requests the swtnl fallback. */
+	if (!elements)
+		return TRUE;
+
+	nvfx->vbo_bo = 0; /* bitmask of elements that are fetched from a buffer object */
+
+	MARK_RING(chan, (5 + 2) * 16 + 2 + 11, 16 + 2);
+	for (i = 0; i < nvfx->vtxelt->num_elements; i++) {
+		struct pipe_vertex_element *ve;
+		struct pipe_vertex_buffer *vb;
+		unsigned type, ncomp;
+
+		ve = &nvfx->vtxelt->pipe[i];
+		vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
+
+		if (nvfx_vbo_format_to_hw(ve->src_format, &type, &ncomp)) { /* unsupported format: undo the ring mark and flag the fallback */
+			MARK_UNDO(chan);
+			nvfx->fallback_swtnl |= NVFX_NEW_ARRAYS;
+			return FALSE;
+		}
+
+		if (!vb->stride && type == NV34TCL_VTXFMT_TYPE_FLOAT) { /* zero-stride float data is pushed as an immediate attribute */
+			nvfx_vbo_static_attrib(nvfx, i, ve, vb, ncomp);
+			vtxfmt[i] = type;
+		} else {
+			vtxfmt[i] = ((vb->stride << NV34TCL_VTXFMT_STRIDE_SHIFT) |
+				(ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type);
+			nvfx->vbo_bo |= (1 << i);
+		}
+	}
+
+	for(; i < elements; ++i) /* slots beyond the current element count get the bare FLOAT type */
+		vtxfmt[i] = NV34TCL_VTXFMT_TYPE_FLOAT;
+
+	OUT_RING(chan, RING_3D(NV34TCL_VTXFMT(0), elements));
+	OUT_RINGp(chan, vtxfmt, elements);
+
+	if(nvfx->is_nv4x) {
+		unsigned i; /* shadows the outer i for this loop only */
+		/* seems to be some kind of cache flushing */
+		for(i = 0; i < 3; ++i) {
+			OUT_RING(chan, RING_3D(0x1718, 1));
+			OUT_RING(chan, 0);
+		}
+	}
+
+	OUT_RING(chan, RING_3D(NV34TCL_VTXBUF_ADDRESS(0), elements));
+	for (i = 0; i < nvfx->vtxelt->num_elements; i++) {
+		struct pipe_vertex_element *ve;
+		struct pipe_vertex_buffer *vb;
+
+		ve = &nvfx->vtxelt->pipe[i];
+		vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
+
+		if (!(nvfx->vbo_bo & (1 << i))) /* immediate attribute: no buffer address */
+			OUT_RING(chan, 0);
+		else
+		{
+			struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo;
+			OUT_RELOC(chan, bo,
+				 vb->buffer_offset + ve->src_offset,
+				 vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+				 0, NV34TCL_VTXBUF_ADDRESS_DMA1);
+		}
+	}
+
+        for (; i < elements; i++) /* remaining slots get a zero address word */
+		OUT_RING(chan, 0);
+
+	OUT_RING(chan, RING_3D(0x1710, 1));
+	OUT_RING(chan, 0);
+
+	if (ib) { /* an index buffer is recorded: emit its address and format */
+		unsigned ib_flags = nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD;
+		struct nouveau_bo* bo = nvfx_resource(ib)->bo;
+
+		assert(nvfx->screen->index_buffer_reloc_flags);
+
+		OUT_RING(chan, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2));
+		OUT_RELOC(chan, bo, 0, ib_flags | NOUVEAU_BO_LOW, 0, 0);
+		OUT_RELOC(chan, bo, ib_format, ib_flags | NOUVEAU_BO_OR,
+				  0, NV34TCL_IDXBUF_FORMAT_DMA1);
+	}
+
+	nvfx->hw_vtxelt_nr = nvfx->vtxelt->num_elements; /* remember how many slots are now programmed */
+	return TRUE;
+}
+
+void
+nvfx_vbo_relocate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY;
+	int i;
+	/* Emit relocations (with NOUVEAU_BO_DUMMY set) for every buffer-backed vertex element and the index buffer. */
+	MARK_RING(chan, 2 * 16 + 3, 2 * 16 + 3);
+	for(i = 0; i < nvfx->vtxelt->num_elements; ++i) {
+		if(nvfx->vbo_bo & (1 << i)) { /* only elements nvfx_vbo_validate flagged as buffer-backed */
+			struct pipe_vertex_element *ve = &nvfx->vtxelt->pipe[i];
+			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
+			struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo;
+			OUT_RELOC(chan, bo, RING_3D(NV34TCL_VTXBUF_ADDRESS(i), 1), /* the method header itself is emitted as a reloc */
+					vb_flags, 0, 0);
+			OUT_RELOC(chan, bo, vb->buffer_offset + ve->src_offset,
+					vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+					0, NV34TCL_VTXBUF_ADDRESS_DMA1);
+		}
+	}
+
+	if(nvfx->idxbuf)
+	{
+		unsigned ib_flags = nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY;
+		struct nouveau_bo* bo = nvfx_resource(nvfx->idxbuf)->bo;
+
+		assert(nvfx->screen->index_buffer_reloc_flags);
+
+		OUT_RELOC(chan, bo, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2), /* header, then address and format words */
+				ib_flags, 0, 0);
+		OUT_RELOC(chan, bo, 0,
+				ib_flags | NOUVEAU_BO_LOW, 0, 0);
+		OUT_RELOC(chan, bo, nvfx->idxbuf_format,
+				ib_flags | NOUVEAU_BO_OR,
+				0, NV34TCL_IDXBUF_FORMAT_DMA1);
+	}
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c
new file mode 100644
index 0000000000..80b98b62d3
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c
@@ -0,0 +1,1066 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nvfx_context.h"
+#include "nvfx_state.h"
+
+/* TODO (at least...):
+ *  1. Indexed consts + ARL
+ *  2. NV_vp11, NV_vp2, NV_vp3 features
+ *       - extra arith opcodes
+ *       - branching
+ *       - texture sampling
+ *       - indexed attribs
+ *       - indexed results
+ *  3. bugs
+ */
+
+#include "nv30_vertprog.h"
+#include "nv40_vertprog.h"
+
+/*
+ * Pseudo output index for user clip plane n: (~0 - 6) + n, i.e. -7 for plane 0
+ * up to -2 for plane 5.  emit_dst() recognises these values and redirects the
+ * write to a real hardware output register.
+ */
+#define NVFX_VP_INST_DEST_CLIP(n) ((~0 - 6) + (n))
+
+/* Per-translation state used while turning a TGSI shader into hw code. */
+struct nvfx_vpc {
+	struct nvfx_vertex_program *vp;		/* program being translated */
+
+	struct nvfx_vertex_program_exec *vpi;	/* instruction last appended by nvfx_vp_arith() */
+
+	unsigned r_temps;		/* bitmask of hw temps currently allocated */
+	unsigned r_temps_discard;	/* temps to free on the next release_temps() */
+	struct nvfx_sreg r_result[PIPE_MAX_SHADER_OUTPUTS]; /* dst reg per TGSI output index */
+	struct nvfx_sreg *r_address;	/* hw temp backing each TGSI address register */
+	struct nvfx_sreg *r_temp;	/* hw temp backing each TGSI temporary */
+
+	struct nvfx_sreg *imm;		/* constant register for each TGSI immediate */
+	unsigned nr_imm;		/* number of entries filled in imm */
+
+	unsigned hpos_idx;		/* TGSI output index declared as POSITION */
+};
+
+/*
+ * Allocate the lowest-numbered free hardware temporary.
+ *
+ * The register is marked busy in r_temps and also recorded in
+ * r_temps_discard, so the next release_temps() frees it again unless the
+ * caller clears r_temps_discard first.  When every bit of r_temps is already
+ * set, an error is logged and temp 0 is returned as a fallback.
+ */
+static struct nvfx_sreg
+temp(struct nvfx_vpc *vpc)
+{
+	int idx = ffs(~vpc->r_temps) - 1;
+
+	if (idx < 0) {
+		NOUVEAU_ERR("out of temps!!\n");
+		assert(0);
+		return nvfx_sr(NVFXSR_TEMP, 0);
+	}
+
+	/* Unsigned constant: shifting a signed 1 into bit 31 is undefined. */
+	vpc->r_temps |= (1u << idx);
+	vpc->r_temps_discard |= (1u << idx);
+	return nvfx_sr(NVFXSR_TEMP, idx);
+}
+
+/*
+ * Free every temporary recorded in r_temps_discard and empty that set.
+ */
+static INLINE void
+release_temps(struct nvfx_vpc *vpc)
+{
+	const unsigned discard = vpc->r_temps_discard;
+
+	vpc->r_temps_discard = 0;
+	vpc->r_temps &= ~discard;
+}
+
+/*
+ * Return a constant register for the program, creating a slot if necessary.
+ *
+ * pipe >= 0 names an entry of the bound constant buffer; an existing slot
+ * with the same index is reused.  pipe < 0 always creates a new private slot.
+ * x/y/z/w are stored as the slot's initial value.
+ *
+ * If the constant table cannot be grown, the old table is left intact, an
+ * error is logged and constant 0 is returned as a fallback (same convention
+ * as temp() when it runs out of registers).
+ */
+static struct nvfx_sreg
+constant(struct nvfx_vpc *vpc, int pipe, float x, float y, float z, float w)
+{
+	struct nvfx_vertex_program *vp = vpc->vp;
+	struct nvfx_vertex_program_data *vpd;
+	void *grown;
+	int idx;
+
+	if (pipe >= 0) {
+		for (idx = 0; idx < vp->nr_consts; idx++) {
+			if (vp->consts[idx].index == pipe)
+				return nvfx_sr(NVFXSR_CONST, idx);
+		}
+	}
+
+	/* Grow through a temporary so a failed realloc neither leaks the old
+	 * table nor leaves vp->consts NULL with a non-zero count.
+	 */
+	grown = realloc(vp->consts, sizeof(*vpd) * (vp->nr_consts + 1));
+	if (!grown) {
+		NOUVEAU_ERR("out of memory growing constant table\n");
+		assert(0);
+		return nvfx_sr(NVFXSR_CONST, 0);
+	}
+	vp->consts = grown;
+
+	idx = vp->nr_consts++;
+	vpd = &vp->consts[idx];
+
+	vpd->index = pipe;
+	vpd->value[0] = x;
+	vpd->value[1] = y;
+	vpd->value[2] = z;
+	vpd->value[3] = w;
+	return nvfx_sr(NVFXSR_CONST, idx);
+}
+
+/*
+ * Shorthand for nvfx_vp_arith(): 's' is the slot name (VEC or SCA) and 'o' the
+ * opcode name; both are pasted into NVFX_VP_INST_* constants.  Note that 'cc'
+ * is passed through as the nvfx_vpc pointer, and that a variable named 'nvfx'
+ * must be in scope at the call site.
+ */
+#define arith(cc,s,o,d,m,s0,s1,s2) \
+	nvfx_vp_arith(nvfx, (cc), NVFX_VP_INST_SLOT_##s, NVFX_VP_INST_##s##_OP_##o, (d), (m), (s0), (s1), (s2))
+
+/*
+ * Encode source operand 'src' into operand position 'pos' (0..2) of the
+ * four-word instruction 'hw'.
+ *
+ * Side effects beyond 'hw': an input source sets its bit in vp->ir, and a
+ * constant source records its index in vpc->vpi->const_index (an assert
+ * enforces that an instruction only ever refers to one constant index).
+ */
+static void
+emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos, struct nvfx_sreg src)
+{
+	struct nvfx_vertex_program *vp = vpc->vp;
+	uint32_t sr = 0;
+
+	/* Modifiers: the abs flag goes into hw[0], negate into the operand. */
+	if (src.abs)
+		hw[0] |= (1 << (21 + pos));
+	if (src.negate)
+		sr |= NVFX_VP(SRC_NEGATE);
+
+	/* Component swizzle. */
+	sr |= (src.swz[0] << NVFX_VP(SRC_SWZ_X_SHIFT));
+	sr |= (src.swz[1] << NVFX_VP(SRC_SWZ_Y_SHIFT));
+	sr |= (src.swz[2] << NVFX_VP(SRC_SWZ_Z_SHIFT));
+	sr |= (src.swz[3] << NVFX_VP(SRC_SWZ_W_SHIFT));
+
+	/* Register file, plus the index where it is part of the operand. */
+	switch (src.type) {
+	case NVFXSR_NONE:
+		/* Unused operands are encoded with the input register type. */
+		sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
+		       NVFX_VP(SRC_REG_TYPE_SHIFT));
+		break;
+	case NVFXSR_INPUT:
+		sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
+		       NVFX_VP(SRC_REG_TYPE_SHIFT));
+		hw[1] |= (src.index << NVFX_VP(INST_INPUT_SRC_SHIFT));
+		vp->ir |= (1 << src.index);
+		break;
+	case NVFXSR_CONST:
+		sr |= (NVFX_VP(SRC_REG_TYPE_CONST) <<
+		       NVFX_VP(SRC_REG_TYPE_SHIFT));
+		assert(vpc->vpi->const_index == -1 ||
+		       vpc->vpi->const_index == src.index);
+		vpc->vpi->const_index = src.index;
+		break;
+	case NVFXSR_TEMP:
+		sr |= (NVFX_VP(SRC_REG_TYPE_TEMP) << NVFX_VP(SRC_REG_TYPE_SHIFT));
+		sr |= (src.index << NVFX_VP(SRC_TEMP_SRC_SHIFT));
+		break;
+	default:
+		assert(0);
+	}
+
+	/* Scatter the operand into the instruction; operands 0 and 2 are
+	 * split across two words.
+	 */
+	switch (pos) {
+	case 0: {
+		uint32_t high = (sr & NVFX_VP(SRC0_HIGH_MASK)) >>
+				NVFX_VP(SRC0_HIGH_SHIFT);
+		uint32_t low = sr & NVFX_VP(SRC0_LOW_MASK);
+
+		hw[1] |= high << NVFX_VP(INST_SRC0H_SHIFT);
+		hw[2] |= low << NVFX_VP(INST_SRC0L_SHIFT);
+		break;
+	}
+	case 1:
+		hw[2] |= sr << NVFX_VP(INST_SRC1_SHIFT);
+		break;
+	case 2: {
+		uint32_t high = (sr & NVFX_VP(SRC2_HIGH_MASK)) >>
+				NVFX_VP(SRC2_HIGH_SHIFT);
+		uint32_t low = sr & NVFX_VP(SRC2_LOW_MASK);
+
+		hw[2] |= high << NVFX_VP(INST_SRC2H_SHIFT);
+		hw[3] |= low << NVFX_VP(INST_SRC2L_SHIFT);
+		break;
+	}
+	default:
+		assert(0);
+	}
+}
+
+/*
+ * Encode destination register 'dst' into the instruction words 'hw'.
+ *
+ * 'slot' is only consulted on nv4x, where slot 0 uses the vector destination
+ * fields and any other value the scalar ones.  For output registers the
+ * matching bit in vp->or is set as a side effect; the clip-plane pseudo
+ * indices (NVFX_VP_INST_DEST_CLIP) additionally enable the plane in
+ * vp->clip_ctrl and are redirected to a real output register.
+ */
+static void
+emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot, struct nvfx_sreg dst)
+{
+	struct nvfx_vertex_program *vp = vpc->vp;
+
+	switch (dst.type) {
+	case NVFXSR_TEMP:
+		if(!nvfx->is_nv4x)
+			hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT);
+		else {
+			hw[3] |= NV40_VP_INST_DEST_MASK;
+			if (slot == 0) {
+				hw[0] |= (dst.index <<
+					  NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
+			} else {
+				hw[3] |= (dst.index <<
+					  NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
+			}
+		}
+		break;
+	case NVFXSR_OUTPUT:
+		/* TODO: this may be wrong because on nv30 COL0 and BFC0 are swapped */
+		/* Clip planes 0-2 are written through FOGC, planes 3-5 through
+		 * PSZ; each one sets bit 6 + n in vp->or.
+		 */
+		switch (dst.index) {
+		case NVFX_VP_INST_DEST_CLIP(0):
+			vp->or |= (1 << 6);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0;
+			dst.index = NVFX_VP(INST_DEST_FOGC);
+			break;
+		case NVFX_VP_INST_DEST_CLIP(1):
+			vp->or |= (1 << 7);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1;
+			dst.index = NVFX_VP(INST_DEST_FOGC);
+			break;
+		case NVFX_VP_INST_DEST_CLIP(2):
+			vp->or |= (1 << 8);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2;
+			dst.index = NVFX_VP(INST_DEST_FOGC);
+			break;
+		case NVFX_VP_INST_DEST_CLIP(3):
+			vp->or |= (1 << 9);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3;
+			dst.index = NVFX_VP(INST_DEST_PSZ);
+			break;
+		case NVFX_VP_INST_DEST_CLIP(4):
+			vp->or |= (1 << 10);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4;
+			dst.index = NVFX_VP(INST_DEST_PSZ);
+			break;
+		case NVFX_VP_INST_DEST_CLIP(5):
+			vp->or |= (1 << 11);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE5;
+			dst.index = NVFX_VP(INST_DEST_PSZ);
+			break;
+		default:
+			/* Ordinary outputs: bits 0-5 of vp->or cover COL0..PSZ,
+			 * bits 14-21 cover texcoords 0-7.  Indices not listed
+			 * here leave vp->or untouched.
+			 */
+			if(!nvfx->is_nv4x) {
+				switch (dst.index) {
+				case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
+				case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
+				case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
+				case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
+				case NV30_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
+				case NV30_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
+				case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
+				case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
+				case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
+				case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
+				case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
+				case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
+				case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
+				case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
+				}
+			} else {
+				switch (dst.index) {
+				case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
+				case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
+				case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
+				case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
+				case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
+				case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
+				case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
+				case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
+				case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
+				case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
+				case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
+				case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
+				case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
+				case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
+				}
+			}
+			break;
+		}
+
+		/* Write the (possibly redirected) output index and fill the
+		 * temp destination field with its all-ones mask.
+		 */
+		if(!nvfx->is_nv4x) {
+			hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT);
+			hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
+
+			/*XXX: no way this is entirely correct, someone needs to
+			 *     figure out what exactly it is.
+			 */
+			hw[3] |= 0x800;
+		} else {
+			hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT);
+			if (slot == 0) {
+				hw[0] |= NV40_VP_INST_VEC_RESULT;
+				hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
+			} else {
+				hw[3] |= NV40_VP_INST_SCA_RESULT;
+				hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+			}
+		}
+		break;
+	default:
+		assert(0);
+	}
+}
+
+/*
+ * Append one instruction to the program and encode it.
+ *
+ * The new instruction becomes vpc->vpi.  It is zeroed, given an "always true"
+ * condition with identity condition swizzle, then filled with the opcode,
+ * write mask, destination and the three source operands.  On nv4x slot 0
+ * selects the vector opcode/writemask fields and any other slot the scalar
+ * ones; on nv3x the opcode always goes into the vector opcode field and the
+ * slot only selects which write mask field is used.
+ *
+ * NOTE(review): the realloc() result is not checked here.
+ */
+static void
+nvfx_vp_arith(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, int slot, int op,
+	      struct nvfx_sreg dst, int mask,
+	      struct nvfx_sreg s0, struct nvfx_sreg s1,
+	      struct nvfx_sreg s2)
+{
+	struct nvfx_vertex_program *vp = vpc->vp;
+	struct nvfx_vertex_program_exec *insn;
+	uint32_t *hw;
+
+	/* Grow the instruction array by one and start from a clean slot. */
+	vp->nr_insns++;
+	vp->insns = realloc(vp->insns, vp->nr_insns * sizeof(*vpc->vpi));
+	insn = &vp->insns[vp->nr_insns - 1];
+	memset(insn, 0, sizeof(*insn));
+	insn->const_index = -1;
+	vpc->vpi = insn;
+
+	hw = insn->data;
+
+	/* Unconditional execution, condition swizzle xyzw. */
+	hw[0] |= (NVFX_COND_TR << NVFX_VP(INST_COND_SHIFT));
+	hw[0] |= ((0 << NVFX_VP(INST_COND_SWZ_X_SHIFT)) |
+		  (1 << NVFX_VP(INST_COND_SWZ_Y_SHIFT)) |
+		  (2 << NVFX_VP(INST_COND_SWZ_Z_SHIFT)) |
+		  (3 << NVFX_VP(INST_COND_SWZ_W_SHIFT)));
+
+	if (nvfx->is_nv4x) {
+		if (slot != 0) {
+			hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT);
+			hw[0] |= (NV40_VP_INST_VEC_DEST_TEMP_MASK | (1 << 20));
+			hw[3] |= (mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
+		} else {
+			hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT);
+			hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+			hw[3] |= (mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
+		}
+	} else {
+		int wm_shift;
+
+		hw[1] |= (op << NV30_VP_INST_VEC_OPCODE_SHIFT);
+		/* Disabled in the original:
+		 *   hw[3] |= NVFX_VP(INST_SCA_DEST_TEMP_MASK);
+		 *   hw[3] |= (mask << NVFX_VP(INST_VEC_WRITEMASK_SHIFT));
+		 */
+
+		/* Four write mask fields: {scalar,vector} x {output,temp}. */
+		if (dst.type == NVFXSR_OUTPUT)
+			wm_shift = slot ? NV30_VP_INST_SDEST_WRITEMASK_SHIFT
+					: NV30_VP_INST_VDEST_WRITEMASK_SHIFT;
+		else
+			wm_shift = slot ? NV30_VP_INST_STEMP_WRITEMASK_SHIFT
+					: NV30_VP_INST_VTEMP_WRITEMASK_SHIFT;
+		hw[3] |= (mask << wm_shift);
+	}
+
+	emit_dst(nvfx, vpc, hw, slot, dst);
+	emit_src(nvfx, vpc, hw, 0, s0);
+	emit_src(nvfx, vpc, hw, 1, s1);
+	emit_src(nvfx, vpc, hw, 2, s2);
+}
+
+/*
+ * Convert a TGSI source operand into an nvfx_sreg.
+ *
+ * Inputs map directly, constants go through constant() (which may create a
+ * slot), immediates and temporaries are looked up in the tables built during
+ * translation.  Any other register file is reported and yields a zeroed
+ * register.  The abs/negate/swizzle modifiers are copied from the operand in
+ * every case.
+ */
+static INLINE struct nvfx_sreg
+tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
+	const int index = fsrc->Register.Index;
+	struct nvfx_sreg src = { 0 };
+
+	switch (fsrc->Register.File) {
+	case TGSI_FILE_TEMPORARY:
+		src = vpc->r_temp[index];
+		break;
+	case TGSI_FILE_IMMEDIATE:
+		src = vpc->imm[index];
+		break;
+	case TGSI_FILE_CONSTANT:
+		src = constant(vpc, index, 0, 0, 0, 0);
+		break;
+	case TGSI_FILE_INPUT:
+		src = nvfx_sr(NVFXSR_INPUT, index);
+		break;
+	default:
+		NOUVEAU_ERR("bad src file\n");
+		break;
+	}
+
+	src.swz[0] = fsrc->Register.SwizzleX;
+	src.swz[1] = fsrc->Register.SwizzleY;
+	src.swz[2] = fsrc->Register.SwizzleZ;
+	src.swz[3] = fsrc->Register.SwizzleW;
+	src.negate = fsrc->Register.Negate;
+	src.abs = fsrc->Register.Absolute;
+	return src;
+}
+
+/*
+ * Look up the nvfx_sreg that backs a TGSI destination operand.
+ *
+ * Outputs, temporaries and address registers come from the per-translation
+ * tables in 'vpc'; any other register file is reported and yields a zeroed
+ * register.
+ */
+static INLINE struct nvfx_sreg
+tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
+	const int index = fdst->Register.Index;
+	struct nvfx_sreg none = { 0 };
+
+	switch (fdst->Register.File) {
+	case TGSI_FILE_ADDRESS:
+		return vpc->r_address[index];
+	case TGSI_FILE_TEMPORARY:
+		return vpc->r_temp[index];
+	case TGSI_FILE_OUTPUT:
+		return vpc->r_result[index];
+	default:
+		break;
+	}
+
+	NOUVEAU_ERR("bad dst file\n");
+	return none;
+}
+
+/*
+ * Translate a TGSI write mask into the hardware NVFX_VP_MASK_* bits.
+ */
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	int mask;
+
+	mask  = (tgsi & TGSI_WRITEMASK_X) ? NVFX_VP_MASK_X : 0;
+	mask |= (tgsi & TGSI_WRITEMASK_Y) ? NVFX_VP_MASK_Y : 0;
+	mask |= (tgsi & TGSI_WRITEMASK_Z) ? NVFX_VP_MASK_Z : 0;
+	mask |= (tgsi & TGSI_WRITEMASK_W) ? NVFX_VP_MASK_W : 0;
+	return mask;
+}
+
+/*
+ * Translate one TGSI instruction into hardware instruction(s).
+ *
+ * The instruction encoding has a single input-attribute index field and
+ * tracks a single constant index per instruction (see emit_src()).  Only the
+ * first distinct input and the first distinct constant/immediate are therefore
+ * referenced directly; any further one is first copied into a scratch temp
+ * with a MOV.  Scratch temps are released again before returning TRUE.
+ *
+ * Returns TRUE on success (TGSI_OPCODE_END emits nothing), FALSE for an
+ * unsupported source register file or opcode.
+ */
+static boolean
+nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
+				const struct tgsi_full_instruction *finst)
+{
+	struct nvfx_sreg src[3], dst, tmp;
+	struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0);
+	int mask;
+	int ai = -1, ci = -1, ii = -1;	/* input / const / immediate index in use */
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
+		return TRUE;
+
+	/* Temporaries never conflict with each other, resolve them first. */
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->Src[i];
+		if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
+			src[i] = tgsi_src(vpc, fsrc);
+		}
+	}
+
+	/* Remaining operands: use directly if the slot is still free (or
+	 * already holds the same index), otherwise spill through a temp.
+	 */
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->Src[i];
+
+		switch (fsrc->Register.File) {
+		case TGSI_FILE_INPUT:
+			if (ai == -1 || ai == fsrc->Register.Index) {
+				ai = fsrc->Register.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_CONSTANT:
+			if ((ci == -1 && ii == -1) ||
+			    ci == fsrc->Register.Index) {
+				ci = fsrc->Register.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_IMMEDIATE:
+			if ((ci == -1 && ii == -1) ||
+			    ii == fsrc->Register.Index) {
+				ii = fsrc->Register.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+			/* handled above */
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(vpc, &finst->Dst[0]);
+	mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
+
+	/* Vector ops take their operands in source slots 0/1 (ADD: 0/2),
+	 * scalar ops take theirs in slot 2.
+	 */
+	switch (finst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		arith(vpc, VEC, MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(vpc, VEC, ADD, dst, mask, src[0], none, src[1]);
+		break;
+	case TGSI_OPCODE_ARL:
+		arith(vpc, VEC, ARL, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_COS:
+		arith(vpc, SCA, COS, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(vpc, VEC, DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(vpc, VEC, DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		arith(vpc, VEC, DPH, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(vpc, VEC, DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(vpc, SCA, EX2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_EXP:
+		arith(vpc, SCA, EXP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(vpc, VEC, FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(vpc, VEC, FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(vpc, SCA, LG2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LIT:
+		arith(vpc, SCA, LIT, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LOG:
+		arith(vpc, SCA, LOG, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LRP:
+		/* Two MADs: tmp = -src0 * src2 + src2, dst = src0 * src1 + tmp */
+		tmp = temp(vpc);
+		arith(vpc, VEC, MAD, tmp, mask, neg(src[0]), src[2], src[2]);
+		arith(vpc, VEC, MAD, dst, mask, src[0], src[1], tmp);
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(vpc, VEC, MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(vpc, VEC, MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(vpc, VEC, MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(vpc, VEC, MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(vpc, VEC, MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_POW:
+		/* pow(a, b) is emitted as EX2(LG2(a.x) * b.x) */
+		tmp = temp(vpc);
+		arith(vpc, SCA, LG2, tmp, NVFX_VP_MASK_X, none, none,
+		      swz(src[0], X, X, X, X));
+		arith(vpc, VEC, MUL, tmp, NVFX_VP_MASK_X, swz(tmp, X, X, X, X),
+		      swz(src[1], X, X, X, X), none);
+		arith(vpc, SCA, EX2, dst, mask, none, none,
+		      swz(tmp, X, X, X, X));
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(vpc, SCA, RCP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_RET:
+		break;
+	case TGSI_OPCODE_RSQ:
+		arith(vpc, SCA, RSQ, dst, mask, none, none, abs(src[0]));
+		break;
+	case TGSI_OPCODE_SEQ:
+		arith(vpc, VEC, SEQ, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SFL:
+		arith(vpc, VEC, SFL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(vpc, VEC, SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGT:
+		arith(vpc, VEC, SGT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SIN:
+		arith(vpc, SCA, SIN, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_SLE:
+		arith(vpc, VEC, SLE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(vpc, VEC, SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SNE:
+		arith(vpc, VEC, SNE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SSG:
+		/* SSG is unary: src[1] is never initialised for it, so it
+		 * must not be passed on to the encoder.
+		 */
+		arith(vpc, VEC, SSG, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_STR:
+		arith(vpc, VEC, STR, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB:
+		arith(vpc, VEC, ADD, dst, mask, src[0], none, neg(src[1]));
+		break;
+	case TGSI_OPCODE_XPD:
+		/* Cross product as MUL + MAD on swizzled operands; W of the
+		 * destination is left unwritten.
+		 */
+		tmp = temp(vpc);
+		arith(vpc, VEC, MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(vpc, VEC, MAD, dst, (mask & ~NVFX_VP_MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	release_temps(vpc);
+	return TRUE;
+}
+
+/*
+ * Record which hardware output register a TGSI output declaration maps to.
+ *
+ * The mapping is stored in vpc->r_result[] at the declaration's first
+ * register index; a POSITION output additionally records that index in
+ * vpc->hpos_idx.  Returns FALSE (after logging) for semantics or semantic
+ * indices that have no hardware register here, including EDGEFLAG.
+ */
+static boolean
+nvfx_vertprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	const int sem_index = fdec->Semantic.Index;
+	unsigned idx = fdec->Range.First;
+	int hw;
+
+	switch (fdec->Semantic.Name) {
+	case TGSI_SEMANTIC_POSITION:
+		vpc->hpos_idx = idx;
+		hw = NVFX_VP(INST_DEST_POS);
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		switch (sem_index) {
+		case 0:
+			hw = NVFX_VP(INST_DEST_COL0);
+			break;
+		case 1:
+			hw = NVFX_VP(INST_DEST_COL1);
+			break;
+		default:
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_BCOLOR:
+		switch (sem_index) {
+		case 0:
+			hw = NVFX_VP(INST_DEST_BFC0);
+			break;
+		case 1:
+			hw = NVFX_VP(INST_DEST_BFC1);
+			break;
+		default:
+			NOUVEAU_ERR("bad bcolour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_FOG:
+		hw = NVFX_VP(INST_DEST_FOGC);
+		break;
+	case TGSI_SEMANTIC_PSIZE:
+		hw = NVFX_VP(INST_DEST_PSZ);
+		break;
+	case TGSI_SEMANTIC_GENERIC:
+		/* Generic outputs map onto the eight texcoord registers. */
+		if (sem_index > 7) {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		hw = NVFX_VP(INST_DEST_TC(fdec->Semantic.Index));
+		break;
+	case TGSI_SEMANTIC_EDGEFLAG:
+		/* not really an error just a fallback */
+		NOUVEAU_ERR("cannot handle edgeflag output\n");
+		return FALSE;
+	default:
+		NOUVEAU_ERR("bad output semantic\n");
+		return FALSE;
+	}
+
+	vpc->r_result[idx] = nvfx_sr(NVFXSR_OUTPUT, hw);
+	return TRUE;
+}
+
+/*
+ * First pass over the TGSI tokens: size and allocate the per-translation
+ * tables in 'vpc'.
+ *
+ * Counts immediates, finds the highest declared temporary and the highest
+ * address register written by any instruction, and records the hardware
+ * register for every declared output.  It then allocates vpc->imm,
+ * vpc->r_temp and vpc->r_address; each TGSI temporary and address register
+ * gets its own hardware temp, which is made permanent by clearing
+ * r_temps_discard at the end.
+ *
+ * Returns FALSE if an output declaration cannot be handled or an allocation
+ * fails; in that case nothing allocated here is left attached to 'vpc'.
+ */
+static boolean
+nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
+{
+	struct tgsi_parse_context p;
+	int high_temp = -1, high_addr = -1, nr_imm = 0, i;
+
+	tgsi_parse_init(&p, vpc->vp->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&p)) {
+		const union tgsi_full_token *tok = &p.FullToken;
+
+		tgsi_parse_token(&p);
+		switch(tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+			nr_imm++;
+			break;
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+
+			fdec = &p.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_TEMPORARY:
+				if (fdec->Range.Last > high_temp) {
+					high_temp =
+						fdec->Range.Last;
+				}
+				break;
+#if 0 /* this would be nice.. except gallium doesn't track it */
+			case TGSI_FILE_ADDRESS:
+				if (fdec->Range.Last > high_addr) {
+					high_addr =
+						fdec->Range.Last;
+				}
+				break;
+#endif
+			case TGSI_FILE_OUTPUT:
+				if (!nvfx_vertprog_parse_decl_output(nvfx, vpc, fdec)) {
+					/* Pair the init above on this exit too. */
+					tgsi_parse_free(&p);
+					return FALSE;
+				}
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+#if 1 /* yay, parse instructions looking for address regs instead */
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+			const struct tgsi_full_dst_register *fdst;
+
+			finst = &p.FullToken.FullInstruction;
+			fdst = &finst->Dst[0];
+
+			if (fdst->Register.File == TGSI_FILE_ADDRESS) {
+				if (fdst->Register.Index > high_addr)
+					high_addr = fdst->Register.Index;
+			}
+
+		}
+			break;
+#endif
+		default:
+			break;
+		}
+	}
+	tgsi_parse_free(&p);
+
+	if (nr_imm) {
+		vpc->imm = CALLOC(nr_imm, sizeof(struct nvfx_sreg));
+		if (!vpc->imm)
+			goto out_nomem;
+	}
+
+	/* high_* hold the highest index seen (-1 if none); +1 gives a count. */
+	if (++high_temp) {
+		vpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_sreg));
+		if (!vpc->r_temp)
+			goto out_nomem;
+		for (i = 0; i < high_temp; i++)
+			vpc->r_temp[i] = temp(vpc);
+	}
+
+	if (++high_addr) {
+		vpc->r_address = CALLOC(high_addr, sizeof(struct nvfx_sreg));
+		if (!vpc->r_address)
+			goto out_nomem;
+		for (i = 0; i < high_addr; i++)
+			vpc->r_address[i] = temp(vpc);
+	}
+
+	/* Keep the temps handed out above across release_temps() calls. */
+	vpc->r_temps_discard = 0;
+	return TRUE;
+
+out_nomem:
+	NOUVEAU_ERR("out of memory\n");
+	/* The caller only frees 'vpc' itself when we fail, so undo our own
+	 * partial allocations here.
+	 */
+	if (vpc->r_address)
+		FREE(vpc->r_address);
+	if (vpc->r_temp)
+		FREE(vpc->r_temp);
+	if (vpc->imm)
+		FREE(vpc->imm);
+	vpc->r_address = NULL;
+	vpc->r_temp = NULL;
+	vpc->imm = NULL;
+	return FALSE;
+}
+
+/*
+ * Translate the TGSI shader in 'vp' into hardware instructions.
+ *
+ * On success vp->insns / vp->consts hold the program and vp->translated is
+ * set to TRUE.  On any failure vp->translated is left untouched, which the
+ * caller uses to detect that translation did not succeed.
+ *
+ * When user clip planes are enabled (vp->ucp.nr != 0) the shader's position
+ * output is redirected to a temp; after the shader body a MOV copies it to the
+ * real position output and one DP4 per plane computes the clip distance from
+ * it.
+ */
+static void
+nvfx_vertprog_translate(struct nvfx_context *nvfx,
+			struct nvfx_vertex_program *vp)
+{
+	struct tgsi_parse_context parse;
+	struct nvfx_vpc *vpc = NULL;
+	struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0);
+	int i;
+
+	vpc = CALLOC(1, sizeof(struct nvfx_vpc));
+	if (!vpc)
+		return;
+	vpc->vp = vp;
+
+	if (!nvfx_vertprog_prepare(nvfx, vpc)) {
+		FREE(vpc);
+		return;
+	}
+
+	/* Redirect post-transform vertex position to a temp if user clip
+	 * planes are enabled.  We need to append code to the vtxprog
+	 * to handle clip planes later.
+	 */
+	if (vp->ucp.nr)  {
+		vpc->r_result[vpc->hpos_idx] = temp(vpc);
+		/* Keep that temp alive for the whole program. */
+		vpc->r_temps_discard = 0;
+	}
+
+	tgsi_parse_init(&parse, vp->pipe.tokens);
+
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		switch (parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			const struct tgsi_full_immediate *imm;
+
+			imm = &parse.FullToken.FullImmediate;
+			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+			assert(imm->Immediate.NrTokens == 4 + 1);
+			/* Immediates become private constants (index -1). */
+			vpc->imm[vpc->nr_imm++] =
+				constant(vpc, -1,
+					 imm->u[0].Float,
+					 imm->u[1].Float,
+					 imm->u[2].Float,
+					 imm->u[3].Float);
+		}
+			break;
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+			finst = &parse.FullToken.FullInstruction;
+			if (!nvfx_vertprog_parse_instruction(nvfx, vpc, finst))
+				goto out_err;
+		}
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* Write out HPOS if it was redirected to a temp earlier */
+	if (vpc->r_result[vpc->hpos_idx].type != NVFXSR_OUTPUT) {
+		struct nvfx_sreg hpos = nvfx_sr(NVFXSR_OUTPUT,
+						NVFX_VP(INST_DEST_POS));
+		struct nvfx_sreg htmp = vpc->r_result[vpc->hpos_idx];
+
+		arith(vpc, VEC, MOV, hpos, NVFX_VP_MASK_ALL, htmp, none, none);
+	}
+
+	/* Insert code to handle user clip planes */
+	for (i = 0; i < vp->ucp.nr; i++) {
+		struct nvfx_sreg cdst = nvfx_sr(NVFXSR_OUTPUT,
+						NVFX_VP_INST_DEST_CLIP(i));
+		struct nvfx_sreg ceqn = constant(vpc, -1,
+						 nvfx->clip.ucp[i][0],
+						 nvfx->clip.ucp[i][1],
+						 nvfx->clip.ucp[i][2],
+						 nvfx->clip.ucp[i][3]);
+		struct nvfx_sreg htmp = vpc->r_result[vpc->hpos_idx];
+		unsigned mask;
+
+		/* Each plane lands in one component of its shared output
+		 * register (see emit_dst() for the register choice).
+		 */
+		switch (i) {
+		case 0: case 3: mask = NVFX_VP_MASK_Y; break;
+		case 1: case 4: mask = NVFX_VP_MASK_Z; break;
+		case 2: case 5: mask = NVFX_VP_MASK_W; break;
+		default:
+			NOUVEAU_ERR("invalid clip dist #%d\n", i);
+			goto out_err;
+		}
+
+		arith(vpc, VEC, DP4, cdst, mask, htmp, ceqn, none);
+	}
+
+	/* A shader that produced no instruction at all has no "last"
+	 * instruction to flag; indexing insns[-1] would write out of bounds.
+	 */
+	if (!vp->nr_insns) {
+		NOUVEAU_ERR("vertex program contains no instructions\n");
+		goto out_err;
+	}
+
+	vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
+	vp->translated = TRUE;
+out_err:
+	tgsi_parse_free(&parse);
+	if (vpc->r_temp)
+		FREE(vpc->r_temp);
+	if (vpc->r_address)
+		FREE(vpc->r_address);
+	if (vpc->imm)
+		FREE(vpc->imm);
+	FREE(vpc);
+}
+
+/*
+ * Make the current vertex program resident and up to date on the hardware.
+ *
+ * Steps: retranslate if needed, allocate instruction and constant slots
+ * (evicting other programs from the heap when it is full), patch constant
+ * indices if the data segment moved, upload changed constants and, when
+ * required, the code, then emit the program start / attribute / clip state
+ * if the program or the user clip planes are dirty.
+ *
+ * In HW render mode a change of the user clip planes destroys the current
+ * translation first, because the planes are compiled into the program.
+ *
+ * Returns FALSE (and flags NVFX_NEW_VERTPROG in fallback_swtnl) when the
+ * program cannot be translated, TRUE otherwise.
+ */
+boolean
+nvfx_vertprog_validate(struct nvfx_context *nvfx)
+{
+	struct pipe_context *pipe = &nvfx->pipe;
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	struct nvfx_vertex_program *vp;
+	struct pipe_resource *constbuf;
+	struct pipe_transfer *transfer = NULL;
+	boolean upload_code = FALSE, upload_data = FALSE;
+	int i;
+
+	if (nvfx->render_mode == HW) {
+		vp = nvfx->vertprog;
+		constbuf = nvfx->constbuf[PIPE_SHADER_VERTEX];
+
+		// TODO: ouch! can't we just use constant slots for these?!
+		if ((nvfx->dirty & NVFX_NEW_UCP) ||
+		    memcmp(&nvfx->clip, &vp->ucp, sizeof(vp->ucp))) {
+			nvfx_vertprog_destroy(nvfx, vp);
+			memcpy(&vp->ucp, &nvfx->clip, sizeof(vp->ucp));
+		}
+	} else {
+		vp = nvfx->swtnl.vertprog;
+		constbuf = NULL;
+	}
+
+	/* Translate TGSI shader into hw bytecode */
+	if (!vp->translated)
+	{
+		nvfx->fallback_swtnl &= ~NVFX_NEW_VERTPROG;
+		nvfx_vertprog_translate(nvfx, vp);
+		if (!vp->translated) {
+			nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
+			return FALSE;
+		}
+	}
+
+	/* Allocate hw vtxprog exec slots */
+	if (!vp->exec) {
+		struct nouveau_resource *heap = nvfx->screen->vp_exec_heap;
+		uint vplen = vp->nr_insns;
+
+		if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec)) {
+			/* Heap full: evict other programs, then retry once. */
+			while (heap->next && heap->size < vplen) {
+				struct nvfx_vertex_program *evict;
+
+				evict = heap->next->priv;
+				nouveau_resource_free(&evict->exec);
+			}
+
+			if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec))
+				assert(0);
+		}
+
+		upload_code = TRUE;
+	}
+
+	/* Allocate hw vtxprog const slots */
+	if (vp->nr_consts && !vp->data) {
+		struct nouveau_resource *heap = nvfx->screen->vp_data_heap;
+
+		if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data)) {
+			while (heap->next && heap->size < vp->nr_consts) {
+				struct nvfx_vertex_program *evict;
+
+				evict = heap->next->priv;
+				nouveau_resource_free(&evict->data);
+			}
+
+			if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data))
+				assert(0);
+		}
+
+		/*XXX: handle this some day */
+		assert(vp->data->start >= vp->data_start_min);
+
+		upload_data = TRUE;
+		if (vp->data_start != vp->data->start)
+			upload_code = TRUE;
+	}
+
+	/* If exec or data segments moved we need to patch the program to
+	 * fixup offsets and register IDs.
+	 */
+	if (vp->exec_start != vp->exec->start) {
+		for (i = 0; i < vp->nr_insns; i++) {
+			struct nvfx_vertex_program_exec *vpi = &vp->insns[i];
+
+			if (vpi->has_branch_offset) {
+				assert(0);
+			}
+		}
+
+		vp->exec_start = vp->exec->start;
+	}
+
+	if (vp->nr_consts && vp->data_start != vp->data->start) {
+		for (i = 0; i < vp->nr_insns; i++) {
+			struct nvfx_vertex_program_exec *vpi = &vp->insns[i];
+
+			/* Rebase the program-relative constant index onto
+			 * the slot range we were given.
+			 */
+			if (vpi->const_index >= 0) {
+				vpi->data[1] &= ~NVFX_VP(INST_CONST_SRC_MASK);
+				vpi->data[1] |=
+					(vpi->const_index + vp->data->start) <<
+					NVFX_VP(INST_CONST_SRC_SHIFT);
+
+			}
+		}
+
+		vp->data_start = vp->data->start;
+	}
+
+	/* Update + Upload constant values */
+	if (vp->nr_consts) {
+		float *map = NULL;
+
+		if (constbuf) {
+			map = pipe_buffer_map(pipe, constbuf,
+					      PIPE_TRANSFER_READ,
+					      &transfer);
+		}
+
+		for (i = 0; i < vp->nr_consts; i++) {
+			struct nvfx_vertex_program_data *vpd = &vp->consts[i];
+
+			/* index >= 0: value comes from the constant buffer;
+			 * index < 0: private constant with a fixed value.
+			 */
+			if (vpd->index >= 0) {
+				if (map) {
+					if (!upload_data &&
+					    !memcmp(vpd->value, &map[vpd->index * 4],
+						    4 * sizeof(float)))
+						continue;
+					memcpy(vpd->value, &map[vpd->index * 4],
+					       4 * sizeof(float));
+				} else if (!upload_data) {
+					/* No buffer bound or mapping failed:
+					 * there is nothing to refresh from, so
+					 * keep what the hardware already has.
+					 */
+					continue;
+				}
+			}
+
+			BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_CONST_ID, 5);
+			OUT_RING  (chan, i + vp->data->start);
+			OUT_RINGp (chan, (uint32_t *)vpd->value, 4);
+		}
+
+		/* Only unmap a mapping that actually succeeded. */
+		if (map)
+			pipe_buffer_unmap(pipe, constbuf, transfer);
+	}
+
+	/* Upload vtxprog */
+	if (upload_code) {
+#if 0
+		for (i = 0; i < vp->nr_insns; i++) {
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[0]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[1]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[2]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]);
+		}
+#endif
+		BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_FROM_ID, 1);
+		OUT_RING  (chan, vp->exec->start);
+		for (i = 0; i < vp->nr_insns; i++) {
+			BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_INST(0), 4);
+			OUT_RINGp (chan, vp->insns[i].data, 4);
+		}
+	}
+
+	if(nvfx->dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP))
+	{
+		/* 2 words start id + 3 words attrib enables (nv4x only)
+		 * + 2 words clip plane enables.
+		 */
+		WAIT_RING(chan, 7);
+		OUT_RING(chan, RING_3D(NV34TCL_VP_START_FROM_ID, 1));
+		OUT_RING(chan, vp->exec->start);
+		if(nvfx->is_nv4x) {
+			OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 2));
+			OUT_RING(chan, vp->ir);
+			OUT_RING(chan, vp->or);
+		}
+		OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1));
+		OUT_RING(chan, vp->clip_ctrl);
+	}
+
+	return TRUE;
+}
+
+/*
+ * Discard the translated form of 'vp': free its instruction and constant
+ * arrays, release its hardware exec/data slots and reset the bookkeeping so
+ * that the next validate retranslates the program from its TGSI tokens.
+ */
+void
+nvfx_vertprog_destroy(struct nvfx_context *nvfx, struct nvfx_vertex_program *vp)
+{
+	vp->translated = FALSE;
+
+	/* Host-side copies of the program. */
+	if (vp->nr_insns) {
+		FREE(vp->insns);
+		vp->nr_insns = 0;
+		vp->insns = NULL;
+	}
+	if (vp->nr_consts) {
+		FREE(vp->consts);
+		vp->nr_consts = 0;
+		vp->consts = NULL;
+	}
+
+	/* Hardware slots. */
+	nouveau_resource_free(&vp->exec);
+	nouveau_resource_free(&vp->data);
+	vp->exec_start = 0;
+	vp->data_start = 0;
+	vp->data_start_min = 0;
+
+	/* Input/output/clip masks gathered during translation. */
+	vp->ir = 0;
+	vp->or = 0;
+	vp->clip_ctrl = 0;
+}