nouveau: Very rough cut at gallium winsys + nv40 pipe driver.

author: Ben Skeggs <skeggsb@gmail.com> 2007-11-18 17:08:06 +1100
committer: Ben Skeggs <skeggsb@gmail.com> 2007-11-18 17:34:06 +1100
commit: 2f33b5b56e9221f2613b34cd1a1a9d82d5ed4303 (patch)
tree: 9bcdd27b60eaf4c3d608b4dd2f582fcee7c39f11 /src/mesa/pipe/nv40
parent: 193c85ec7a1aec44eebc67c6224fb6ecbb4607a5 (diff)
19 files changed, 4337 insertions, 0 deletions
diff --git a/src/mesa/pipe/nv40/Makefile b/src/mesa/pipe/nv40/Makefile
new file mode 100644
index 0000000000..90c8542da4
--- /dev/null
+++ b/src/mesa/pipe/nv40/Makefile
@@ -0,0 +1,30 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = nv40
+
+DRIVER_SOURCES = \
+	nv40_clear.c \
+	nv40_context.c \
+	nv40_draw.c \
+	nv40_fragprog.c \
+	nv40_miptree.c \
+	nv40_query.c \
+	nv40_region.c \
+	nv40_state.c \
+	nv40_state_emit.c \
+	nv40_state_tex.c \
+	nv40_surface.c \
+	nv40_vbo.c \
+	nv40_vertprog.c
+
+C_SOURCES = \
+	$(COMMON_SOURCES) \
+	$(DRIVER_SOURCES)
+
+ASM_SOURCES = 
+
+include ../Makefile.template
+
+symlinks:
+
diff --git a/src/mesa/pipe/nv40/nv40_clear.c b/src/mesa/pipe/nv40/nv40_clear.c
new file mode 100644
index 0000000000..f3b7a23689
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_clear.c
@@ -0,0 +1,21 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "nv40_context.h"
+#include "nv40_dma.h"
+
+
+/**
+ * Clear a whole surface to a packed value by filling its backing region.
+ *
+ * For PIPE_FORMAT_S8_Z24 surfaces the top byte of clearValue is moved to the
+ * bottom and the low 24 bits are shifted up by 8 before the fill, because
+ * (per the original XXX note) the surface is actually laid out as Z24_S8.
+ */
+void
+nv40_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+	   unsigned clearValue)
+{
+	unsigned fill = clearValue;
+
+	/*XXX: We're actually Z24_S8... */
+	if (ps->format == PIPE_FORMAT_S8_Z24) {
+		const unsigned top8  = (clearValue & 0xff000000) >> 24;
+		const unsigned low24 = (clearValue & 0x00ffffff);
+
+		fill = (low24 << 8) | top8;
+	}
+
+	pipe->region_fill(pipe, ps->region, 0, 0, 0,
+			  ps->width, ps->height, fill);
+}
diff --git a/src/mesa/pipe/nv40/nv40_context.c b/src/mesa/pipe/nv40/nv40_context.c
new file mode 100644
index 0000000000..ff66095c5f
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_context.c
@@ -0,0 +1,277 @@
+#include "pipe/draw/draw_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+#include "pipe/p_util.h"
+
+#include "nv40_context.h"
+#include "nv40_dma.h"
+
+/**
+ * Report whether this driver accepts the given pipe format.
+ * Only A8R8G8B8, R5G6B5 and S8_Z24 are accepted.
+ */
+static boolean
+nv40_is_format_supported(struct pipe_context *pipe, uint format)
+{
+	if (format == PIPE_FORMAT_U_A8_R8_G8_B8 ||
+	    format == PIPE_FORMAT_U_R5_G6_B5 ||
+	    format == PIPE_FORMAT_S8_Z24)
+		return TRUE;
+
+	return FALSE;
+}
+
+/**
+ * Return the renderer name, formatted as "NV" followed by the chipset id in
+ * upper-case hex (at least two digits).
+ *
+ * The string lives in a function-local static buffer, so the returned pointer
+ * is overwritten by the next call and is shared between all contexts.
+ */
+static const char *
+nv40_get_name(struct pipe_context *pipe)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	static char buffer[128];
+
+	/* %X expects an unsigned argument; chipset is stored as int. */
+	snprintf(buffer, sizeof(buffer), "NV%02X", (unsigned)nv40->chipset);
+	return buffer;
+}
+
+/** Return the constant vendor string for this driver. */
+static const char *
+nv40_get_vendor(struct pipe_context *pipe)
+{
+	static const char vendor[] = "nouveau";
+
+	return vendor;
+}
+
+/**
+ * Answer an integer capability query.
+ *
+ * Caps are grouped by the value reported.  An unrecognised cap is logged
+ * through NOUVEAU_ERR and reported as 0.
+ */
+static int
+nv40_get_param(struct pipe_context *pipe, int param)
+{
+	switch (param) {
+	/* features that are reported as available */
+	case PIPE_CAP_NPOT_TEXTURES:
+	case PIPE_CAP_TWO_SIDED_STENCIL:
+	case PIPE_CAP_OCCLUSION_QUERY:
+		return 1;
+
+	/* features that are reported as unavailable */
+	case PIPE_CAP_GLSL:
+	case PIPE_CAP_S3TC:
+	case PIPE_CAP_ANISOTROPIC_FILTER:
+	case PIPE_CAP_POINT_SPRITE:
+	case PIPE_CAP_TEXTURE_SHADOW_MAP:
+		return 0;
+
+	/* limits */
+	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+		return 16;
+	case PIPE_CAP_MAX_RENDER_TARGETS:
+		return 4;
+	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+		return 13;
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+		return 10;
+
+	default:
+		break;
+	}
+
+	NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+	return 0;
+}
+
+/**
+ * Answer a floating-point capability query.
+ *
+ * No cap is actually implemented yet: every listed case label falls through
+ * to the default branch, so each query (known or not) logs
+ * "Unknown PIPE_CAP" and returns 0.0.
+ */
+static float
+nv40_get_paramf(struct pipe_context *pipe, int param)
+{
+	switch (param) {
+	/* Listed for reference only -- all of these fall through to default. */
+	case PIPE_CAP_MAX_LINE_WIDTH:
+	case PIPE_CAP_MAX_LINE_WIDTH_AA:
+	case PIPE_CAP_MAX_POINT_WIDTH:
+	case PIPE_CAP_MAX_POINT_WIDTH_AA:
+	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0.0;
+	}
+}
+
+/**
+ * Submit the queued command stream, optionally flushing caches and waiting.
+ *
+ * PIPE_FLUSH_TEXTURE_CACHE: writes 2 and then 1 to method 0x1fd8 of the 3D
+ *   object (presumably a texture cache invalidate -- TODO confirm, the
+ *   method has no symbolic name here).
+ * PIPE_FLUSH_WAIT: resets the sync notifier, queues writes to methods
+ *   0x104 and 0x100, and after the ring is fired blocks in notifier_wait().
+ */
+static void
+nv40_flush(struct pipe_context *pipe, unsigned flags)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	struct nouveau_winsys *nvws = nv40->nvws;
+	
+	if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
+		BEGIN_RING(curie, 0x1fd8, 1);
+		OUT_RING  (2);
+		BEGIN_RING(curie, 0x1fd8, 1);
+		OUT_RING  (1);
+	}
+
+	if (flags & PIPE_FLUSH_WAIT) {
+		/* Reset before queuing, so the wait below sees only this flush. */
+		nvws->notifier_reset(nv40->sync, 0);
+		BEGIN_RING(curie, 0x104, 1);
+		OUT_RING  (0);
+		BEGIN_RING(curie, 0x100, 1);
+		OUT_RING  (0);
+	}
+
+	FIRE_RING();
+
+	/* NOTE(review): the last argument (2000) looks like a timeout -- confirm
+	 * units against the winsys; the return value is ignored. */
+	if (flags & PIPE_FLUSH_WAIT)
+		nvws->notifier_wait(nv40->sync, 0, 0, 2000);
+}
+
+/**
+ * Tear down a context created by nv40_create().
+ *
+ * Destroys the draw module context, then releases the query object table and
+ * the context itself.
+ *
+ * NOTE(review): the sync/query notifiers and the 3D object allocated through
+ * the winsys are not released here; no release hook is visible in this file.
+ */
+static void
+nv40_destroy(struct pipe_context *pipe)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+
+	draw_destroy(nv40->draw);
+	/* Allocated with calloc() in nv40_create(); previously leaked. */
+	free(nv40->query_objects);
+	free(nv40);
+}
+
+/**
+ * Allocate the hardware objects for a context and emit the initial state.
+ *
+ * Allocates the query notifier block (num_query_objects entries) and the 3D
+ * object of class curie_class, binds the DMA objects to the 3D object's
+ * NV40TCL_DMA_* methods, writes a set of unnamed setup methods, and fires
+ * the ring.  nv40->sync must already have been allocated by the caller,
+ * since its handle is emitted here.
+ *
+ * Returns TRUE on success, FALSE if either allocation fails (the error is
+ * logged).  NOTE(review): if the 3D object allocation fails, the query
+ * notifier allocated just before it is not released.
+ */
+static boolean
+nv40_init_hwctx(struct nv40_context *nv40, int curie_class)
+{
+	struct nouveau_winsys *nvws = nv40->nvws;
+	int ret;
+
+	if ((ret = nvws->notifier_alloc(nvws, nv40->num_query_objects,
+					&nv40->query))) {
+		NOUVEAU_ERR("Error creating query notifier objects: %d\n", ret);
+		return FALSE;
+	}
+
+	if ((ret = nvws->grobj_alloc(nvws, curie_class,
+				     &nv40->curie))) {
+		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
+		return FALSE;
+	}
+
+	/* Bind DMA objects: notifier, then VRAM/GART handles per method. */
+	BEGIN_RING(curie, NV40TCL_DMA_NOTIFY, 1);
+	OUT_RING  (nv40->sync->handle);
+	BEGIN_RING(curie, NV40TCL_DMA_TEXTURE0, 2);
+	OUT_RING  (nvws->channel->vram->handle);
+	OUT_RING  (nvws->channel->gart->handle);
+	BEGIN_RING(curie, NV40TCL_DMA_COLOR1, 1);
+	OUT_RING  (nvws->channel->vram->handle);
+	BEGIN_RING(curie, NV40TCL_DMA_COLOR0, 2);
+	OUT_RING  (nvws->channel->vram->handle);
+	OUT_RING  (nvws->channel->vram->handle);
+	BEGIN_RING(curie, NV40TCL_DMA_VTXBUF0, 2);
+	OUT_RING  (nvws->channel->vram->handle);
+	OUT_RING  (nvws->channel->gart->handle);
+	/* Second word of the FENCE pair carries the query notifier handle. */
+	BEGIN_RING(curie, NV40TCL_DMA_FENCE, 2);
+	OUT_RING  (0);
+	OUT_RING  (nv40->query->handle);
+	BEGIN_RING(curie, NV40TCL_DMA_UNK01AC, 2);
+	OUT_RING  (nvws->channel->vram->handle);
+	OUT_RING  (nvws->channel->vram->handle);
+	BEGIN_RING(curie, NV40TCL_DMA_COLOR2, 2);
+	OUT_RING  (nvws->channel->vram->handle);
+	OUT_RING  (nvws->channel->vram->handle);
+
+	/* Unnamed method; meaning of these values is not documented here. */
+	BEGIN_RING(curie, 0x1ea4, 3);
+	OUT_RING  (0x00000010);
+	OUT_RING  (0x01000100);
+	OUT_RING  (0xff800006);
+
+	/* vtxprog output routing */
+	BEGIN_RING(curie, 0x1fc4, 1);
+	OUT_RING  (0x06144321);
+	BEGIN_RING(curie, 0x1fc8, 2);
+	OUT_RING  (0xedcba987);
+	OUT_RING  (0x00000021);
+	BEGIN_RING(curie, 0x1fd0, 1);
+	OUT_RING  (0x00171615);
+	BEGIN_RING(curie, 0x1fd4, 1);
+	OUT_RING  (0x001b1a19);
+
+	/* More unnamed setup methods; values are opaque magic numbers. */
+	BEGIN_RING(curie, 0x1ef8, 1);
+	OUT_RING  (0x0020ffff);
+	BEGIN_RING(curie, 0x1d64, 1);
+	OUT_RING  (0x00d30000);
+	BEGIN_RING(curie, 0x1e94, 1);
+	OUT_RING  (0x00000001);
+
+	FIRE_RING ();
+	return TRUE;
+}
+
+#define GRCLASS4097_CHIPSETS 0x00000baf
+#define GRCLASS4497_CHIPSETS 0x00005450
+/**
+ * Create an nv40 pipe context.
+ *
+ * @param pipe_winsys  generic winsys stored in pipe.winsys
+ * @param nvws         nouveau winsys used for channel/notifier/object access
+ * @param chipset      chipset id; the high nibble of the low byte must be 4
+ *
+ * The low nibble of the chipset selects the 3D object class through the
+ * GRCLASS4097/GRCLASS4497 bitmasks.  Returns the embedded pipe_context on
+ * success, or NULL (after logging where applicable) when the chipset is not
+ * recognised or any allocation/initialisation step fails.
+ *
+ * NOTE(review): notifiers and the 3D object obtained from the winsys are not
+ * released on the failure paths; no release hook is visible in this file.
+ */
+struct pipe_context *
+nv40_create(struct pipe_winsys *pipe_winsys, struct nouveau_winsys *nvws,
+	    unsigned chipset)
+{
+	struct nv40_context *nv40;
+	struct draw_stage *stage;
+	int curie_class, ret;
+
+	if ((chipset & 0xf0) != 0x40) {
+		NOUVEAU_ERR("Not a NV4X chipset\n");
+		return NULL;
+	}
+
+	if (GRCLASS4097_CHIPSETS & (1 << (chipset & 0x0f))) {
+		curie_class = 0x4097;
+	} else
+	if (GRCLASS4497_CHIPSETS & (1 << (chipset & 0x0f))) {
+		curie_class = 0x4497;
+	} else {
+		NOUVEAU_ERR("Unknown NV4X chipset: NV%02x\n", chipset);
+		return NULL;
+	}
+
+	nv40 = CALLOC_STRUCT(nv40_context);
+	if (!nv40)
+		return NULL;
+	nv40->chipset = chipset;
+	nv40->nvws = nvws;
+
+	if ((ret = nvws->notifier_alloc(nvws, 1, &nv40->sync))) {
+		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
+		free(nv40);
+		return NULL;
+	}
+
+	nv40->num_query_objects = 32;
+	nv40->query_objects = calloc(nv40->num_query_objects,
+				     sizeof(struct pipe_query_object *));
+	if (!nv40->query_objects) {
+		free(nv40);
+		return NULL;
+	}
+
+	if (!nv40_init_hwctx(nv40, curie_class)) {
+		/* The query table was leaked on this path before. */
+		free(nv40->query_objects);
+		free(nv40);
+		return NULL;
+	}
+
+	nv40->pipe.winsys = pipe_winsys;
+
+	nv40->pipe.destroy = nv40_destroy;
+	nv40->pipe.is_format_supported = nv40_is_format_supported;
+	nv40->pipe.get_name = nv40_get_name;
+	nv40->pipe.get_vendor = nv40_get_vendor;
+	nv40->pipe.get_param = nv40_get_param;
+	nv40->pipe.get_paramf = nv40_get_paramf;
+
+	nv40->pipe.draw_arrays = nv40_draw_arrays;
+	nv40->pipe.draw_elements = nv40_draw_elements;
+	nv40->pipe.clear = nv40_clear;
+
+	nv40->pipe.begin_query = nv40_query_begin;
+	nv40->pipe.end_query = nv40_query_end;
+	nv40->pipe.wait_query = nv40_query_wait;
+
+	nv40->pipe.mipmap_tree_layout = nv40_miptree_layout;
+
+	nv40->pipe.flush = nv40_flush;
+
+	nv40_init_region_functions(nv40);
+	nv40_init_surface_functions(nv40);
+	nv40_init_state_functions(nv40);
+
+	nv40->draw = draw_create();
+	assert(nv40->draw);
+	/* assert() vanishes under NDEBUG; fail cleanly instead of crashing. */
+	if (!nv40->draw) {
+		free(nv40->query_objects);
+		free(nv40);
+		return NULL;
+	}
+
+	stage = nv40_draw_render_stage(nv40);
+	if (!stage) {
+		draw_destroy(nv40->draw);
+		free(nv40->query_objects);
+		free(nv40);
+		return NULL;
+	}
+	draw_set_rasterize_stage(nv40->draw, stage);
+
+	return &nv40->pipe;
+}
+
+		
diff --git a/src/mesa/pipe/nv40/nv40_context.h b/src/mesa/pipe/nv40/nv40_context.h
new file mode 100644
index 0000000000..63be38299f
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_context.h
@@ -0,0 +1,111 @@
+#ifndef __NV40_CONTEXT_H__
+#define __NV40_CONTEXT_H__
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "pipe/draw/draw_vertex.h"
+
+#include "pipe/nouveau/nouveau_winsys.h"
+
+#include "nv40_state.h"
+
+/*
+ * Diagnostic helpers writing to stderr.  NOUVEAU_ERR prefixes the message
+ * with the calling function and line; NOUVEAU_MSG prefixes it with
+ * "nouveau: ".  fmt must be a string literal (it is pasted onto the prefix).
+ *
+ * The expansions deliberately do NOT end in ';' so that the usual
+ * "NOUVEAU_ERR(...);" form is a single statement and stays safe inside an
+ * unbraced if/else.
+ */
+#define NOUVEAU_ERR(fmt, args...) \
+	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args)
+#define NOUVEAU_MSG(fmt, args...) \
+	fprintf(stderr, "nouveau: "fmt, ##args)
+
+#define NV40_NEW_TEXTURE	(1 << 0)
+#define NV40_NEW_VERTPROG	(1 << 1)
+#define NV40_NEW_FRAGPROG	(1 << 2)
+#define NV40_NEW_ARRAYS		(1 << 3)
+
+/*
+ * Per-context driver state.  The pipe_context is the first member so a
+ * struct pipe_context * handed out by nv40_create() can be cast back to
+ * struct nv40_context *.
+ */
+struct nv40_context {
+	struct pipe_context pipe;
+	struct nouveau_winsys *nvws;
+
+	struct draw_context *draw;
+
+	int chipset;			/* chipset id passed to nv40_create() */
+	struct nouveau_grobj *curie;	/* 3D object (class 0x4097 or 0x4497) */
+	struct nouveau_notifier *sync;	/* waited on by PIPE_FLUSH_WAIT */
+	uint32_t *pushbuf;		/* write cursor set by BEGIN_RING */
+
+	/* query objects */
+	struct nouveau_notifier *query;
+	struct pipe_query_object **query_objects; /* num_query_objects slots */
+	uint num_query_objects;
+
+	uint32_t dirty;			/* NV40_NEW_* bits */
+
+	struct nv40_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS];
+	struct pipe_mipmap_tree   *tex_miptree[PIPE_MAX_SAMPLERS];
+	uint32_t                   tex_dirty;
+
+	struct {
+		struct nv40_vertex_program *vp;
+		struct nv40_vertex_program *active_vp;
+
+		struct pipe_buffer_handle *constant_buf;
+	} vertprog;
+
+	struct {
+		struct nv40_fragment_program *fp;
+		/* set by nv40_fragprog_bind() to the program it emitted */
+		struct nv40_fragment_program *active_fp;
+
+		struct pipe_buffer_handle *constant_buf;
+	} fragprog;
+
+	struct pipe_vertex_buffer  vtxbuf[PIPE_ATTRIB_MAX];
+	struct pipe_vertex_element vtxelt[PIPE_ATTRIB_MAX];
+};
+
+
+extern void nv40_init_region_functions(struct nv40_context *nv40);
+extern void nv40_init_surface_functions(struct nv40_context *nv40);
+extern void nv40_init_state_functions(struct nv40_context *nv40);
+
+/* nv40_draw.c */
+extern struct draw_stage *nv40_draw_render_stage(struct nv40_context *nv40);
+
+/* nv40_miptree.c */
+extern boolean nv40_miptree_layout(struct pipe_context *,
+				   struct pipe_mipmap_tree *);
+
+/* nv40_vertprog.c */
+extern void nv40_vertprog_translate(struct nv40_context *,
+				    struct nv40_vertex_program *);
+extern void nv40_vertprog_bind(struct nv40_context *,
+			       struct nv40_vertex_program *);
+
+/* nv40_fragprog.c */
+extern void nv40_fragprog_translate(struct nv40_context *,
+				    struct nv40_fragment_program *);
+extern void nv40_fragprog_bind(struct nv40_context *,
+			       struct nv40_fragment_program *);
+
+/* nv40_state.c and friends */
+extern void nv40_emit_hw_state(struct nv40_context *nv40);
+extern void nv40_state_tex_update(struct nv40_context *nv40);
+
+/* nv40_vbo.c */
+extern boolean nv40_draw_arrays(struct pipe_context *, unsigned mode,
+				unsigned start, unsigned count);
+extern boolean nv40_draw_elements(struct pipe_context *pipe,
+				  struct pipe_buffer_handle *indexBuffer,
+				  unsigned indexSize,
+				  unsigned mode, unsigned start,
+				  unsigned count);
+extern void nv40_vbo_arrays_update(struct nv40_context *nv40);
+
+/* nv40_clear.c */
+extern void nv40_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+		       unsigned clearValue);
+
+/* nv40_query.c */
+extern void nv40_query_begin(struct pipe_context *, struct pipe_query_object *);
+extern void nv40_query_end(struct pipe_context *, struct pipe_query_object *);
+extern void nv40_query_wait(struct pipe_context *, struct pipe_query_object *);
+
+#endif
diff --git a/src/mesa/pipe/nv40/nv40_dma.h b/src/mesa/pipe/nv40/nv40_dma.h
new file mode 100644
index 0000000000..3775ce6e72
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_dma.h
@@ -0,0 +1,62 @@
+#ifndef __NV40_DMA_H__
+#define __NV40_DMA_H__
+
+#include "pipe/nouveau/nouveau_winsys.h"
+
+/*
+ * Command-stream helpers.  Every macro here expects a variable named `nv40`
+ * (struct nv40_context *) to be in scope at the point of use.
+ */
+
+/* Start a packet of `size` words for method `mthd` on nv40-><obj>; the
+ * winsys returns the write cursor, which is stored in nv40->pushbuf. */
+#define BEGIN_RING(obj,mthd,size) do {                                         \
+	nv40->pushbuf = nv40->nvws->begin_ring(nv40->obj, (mthd), (size));     \
+} while(0)
+
+/* As BEGIN_RING, with bit 30 set in the method word (presumably the
+ * "non-incrementing" flag, going by the _NI suffix -- TODO confirm). */
+#define BEGIN_RING_NI(obj,mthd,size) do {                                      \
+	BEGIN_RING(obj, (mthd) | 0x40000000, (size));                          \
+} while(0)
+
+/* Write one 32-bit word at the cursor and advance it. */
+#define OUT_RING(data) do {                                                    \
+	(*nv40->pushbuf++) = (data);                                           \
+} while(0)
+
+/* Copy `size` 32-bit words from `src` and advance the cursor.
+ * Uses memcpy(), so the including file needs <string.h>. */
+#define OUT_RINGp(src,size) do {                                               \
+	memcpy(nv40->pushbuf, (src), (size) * 4);                              \
+	nv40->pushbuf += (size);                                               \
+} while(0)
+
+/* Write a float's bit pattern.  The temporary is named `c`, so `data` must
+ * not itself refer to a caller variable called `c`. */
+#define OUT_RINGf(data) do {                                                   \
+	union { float v; uint32_t u; } c;                                      \
+	c.v = (data);                                                          \
+	OUT_RING(c.u);                                                         \
+} while(0)
+
+/* Hand the queued commands to the winsys for submission. */
+#define FIRE_RING() do {                                                       \
+	nv40->nvws->fire_ring(nv40->nvws->channel);                            \
+} while(0)
+
+/* Register a relocation for the word at the cursor, then emit a 0
+ * placeholder word in that slot. */
+#define OUT_RELOC(bo,data,flags,vor,tor) do {                                  \
+	nv40->nvws->out_reloc(nv40->nvws->channel, nv40->pushbuf,              \
+			      (struct nouveau_bo *)(bo),                       \
+			      (data), (flags), (vor), (tor));                  \
+	OUT_RING(0);                                                           \
+} while(0)
+
+/* Raw data + flags depending on FB/TT buffer */
+#define OUT_RELOCd(bo,data,flags,vor,tor) do {                                 \
+	OUT_RELOC((bo), (data), (flags) | NOUVEAU_BO_OR, (vor), (tor));        \
+} while(0)
+
+/* FB/TT object handle */
+#define OUT_RELOCo(bo,flags) do {                                              \
+	OUT_RELOC((bo), 0, (flags) | NOUVEAU_BO_OR,                            \
+		  nv40->nvws->channel->vram->handle,                           \
+		  nv40->nvws->channel->gart->handle);                          \
+} while(0)
+
+/* Low 32-bits of offset */
+#define OUT_RELOCl(bo,delta,flags) do {                                        \
+	OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_LOW, 0, 0);              \
+} while(0)
+
+/* High 32-bits of offset */
+#define OUT_RELOCh(bo,delta,flags) do {                                        \
+	OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_HIGH, 0, 0);             \
+} while(0)
+
+#endif
diff --git a/src/mesa/pipe/nv40/nv40_draw.c b/src/mesa/pipe/nv40/nv40_draw.c
new file mode 100644
index 0000000000..52ce493ea2
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_draw.c
@@ -0,0 +1,63 @@
+#include "pipe/draw/draw_private.h"
+#include "pipe/p_util.h"
+
+#include "nv40_context.h"
+
+/* Draw-module stage wrapper; `draw` is first so a struct draw_stage * can be
+ * handed to the draw module while the owning context stays reachable. */
+struct nv40_draw_stage {
+	struct draw_stage draw;
+	struct nv40_context *nv40;
+};
+
+/*
+ * Draw-stage callbacks.  All of them are placeholders: each one only logs
+ * its own function name and line through NOUVEAU_ERR and renders nothing.
+ */
+
+/* Stage begin hook (stub). */
+static void
+nv40_draw_begin(struct draw_stage *draw)
+{
+	NOUVEAU_ERR("\n");
+}
+
+/* Stage end hook (stub). */
+static void
+nv40_draw_end(struct draw_stage *draw)
+{
+	NOUVEAU_ERR("\n");
+}
+
+/* Point primitive hook (stub). */
+static void
+nv40_draw_point(struct draw_stage *draw, struct prim_header *prim)
+{
+	NOUVEAU_ERR("\n");
+}
+
+/* Line primitive hook (stub). */
+static void
+nv40_draw_line(struct draw_stage *draw, struct prim_header *prim)
+{
+	NOUVEAU_ERR("\n");
+}
+
+/* Triangle primitive hook (stub). */
+static void
+nv40_draw_tri(struct draw_stage *draw, struct prim_header *prim)
+{
+	NOUVEAU_ERR("\n");
+}
+
+/* Stipple counter reset hook (stub). */
+static void
+nv40_draw_reset_stipple_counter(struct draw_stage *draw)
+{
+	NOUVEAU_ERR("\n");
+}
+
+/**
+ * Allocate the rasterize stage that connects the draw module to this driver.
+ *
+ * The stage records the owning context and nv40->draw (which must already
+ * be set) and installs the callbacks defined in this file.
+ *
+ * Returns the embedded draw_stage, or NULL if the allocation fails.
+ */
+struct draw_stage *
+nv40_draw_render_stage(struct nv40_context *nv40)
+{
+	struct nv40_draw_stage *nv40draw = CALLOC_STRUCT(nv40_draw_stage);
+
+	/* Previously dereferenced unconditionally. */
+	if (!nv40draw)
+		return NULL;
+
+	nv40draw->nv40 = nv40;
+	nv40draw->draw.draw = nv40->draw;
+	nv40draw->draw.begin = nv40_draw_begin;
+	nv40draw->draw.point = nv40_draw_point;
+	nv40draw->draw.line = nv40_draw_line;
+	nv40draw->draw.tri = nv40_draw_tri;
+	nv40draw->draw.end = nv40_draw_end;
+	nv40draw->draw.reset_stipple_counter = nv40_draw_reset_stipple_counter;
+
+	return &nv40draw->draw;
+}
+
diff --git a/src/mesa/pipe/nv40/nv40_fragprog.c b/src/mesa/pipe/nv40/nv40_fragprog.c
new file mode 100644
index 0000000000..48b783eebe
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_fragprog.c
@@ -0,0 +1,642 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "pipe/tgsi/exec/tgsi_token.h"
+#include "pipe/tgsi/exec/tgsi_parse.h"
+
+#include "nv40_context.h"
+#include "nv40_dma.h"
+
+#define SWZ_X 0
+#define SWZ_Y 1
+#define SWZ_Z 2
+#define SWZ_W 3
+#define MASK_X 1
+#define MASK_Y 2
+#define MASK_Z 4
+#define MASK_W 8
+#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W)
+#define DEF_SCALE NV40_FP_OP_DST_SCALE_1X
+#define DEF_CTEST NV40_FP_OP_COND_TR
+#include "nv40_shader.h"
+
+/*
+ * Shorthand source-register modifiers used by the opcode translations below.
+ * swz() takes component letters (X/Y/Z/W) and pastes them onto SWZ_; scale()
+ * takes a suffix pasted onto NV40_FP_OP_DST_SCALE_.
+ * Note: abs() here replaces any abs() from <stdlib.h> in this file.
+ */
+#define swz(s,x,y,z,w) nv40_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w)
+#define neg(s) nv40_sr_neg((s))
+#define abs(s) nv40_sr_abs((s))
+#define scale(s,v) nv40_sr_scale((s), NV40_FP_OP_DST_SCALE_##v)
+
+/* Pre-encoded single instruction (4 words) used as the fallback program;
+ * the encoding is not decoded here. */
+static uint32_t
+passthrough_fp_data[] = {
+	0x01403e81, 0x1c9dc801, 0x0001c800, 0x3fe1c800
+};
+
+/* Fallback fragment program that nv40_fragprog_bind() substitutes when the
+ * requested program has not been translated.  Marked translated up front;
+ * its buffer is created lazily on first bind. */
+static struct nv40_fragment_program
+passthrough_fp = {
+	.pipe = NULL,
+	.translated = TRUE,
+	.insn = passthrough_fp_data,
+	.insn_len = sizeof(passthrough_fp_data) / sizeof(uint32_t),
+	.buffer = NULL,
+	.uses_kil = 0,
+	.num_regs = 2,
+};
+
+/* Scratch state for one fragment program translation. */
+struct nv40_fpc {
+	struct nv40_fragment_program *fp;	/* program being built */
+
+	/* TGSI input index -> hardware input source id */
+	uint attrib_map[PIPE_MAX_SHADER_INPUTS];
+
+	int high_temp;		/* highest hw temp index used by TGSI temps */
+	int temp_temp_count;	/* scratch temps handed out this instruction */
+
+	uint depth_id;		/* TGSI output index declared as POSITION */
+	uint colour_id;		/* TGSI output index declared as COLOR */
+
+	boolean inst_has_const;	/* current instruction reads a constant */
+	int     inst_const_id;	/* recorded as pipe_id by nv40_fp_arith() */
+};
+
+/**
+ * Hand out a scratch temporary for the current instruction.  Scratch temps
+ * are numbered upwards from just above the highest temp seen so far.
+ */
+static INLINE struct nv40_sreg
+nv40_sr_temp(struct nv40_fpc *fpc)
+{
+	const int slot = fpc->temp_temp_count++;
+
+	return nv40_sr(0, NV40_FP_REG_TYPE_TEMP, slot + fpc->high_temp + 1);
+}
+
+/*
+ * Emission shorthands: `o` is an opcode suffix pasted onto
+ * NV40_FP_OP_OPCODE_.  The first argument is passed through as the compiler
+ * context (callers pass fpc).
+ * Note: tex() ignores its s1/s2 arguments and always passes `none`, which
+ * must therefore be a variable in scope at the call site.
+ */
+#define arith(cc,s,o,d,m,s0,s1,s2) \
+	nv40_fp_arith((cc), (s), NV40_FP_OP_OPCODE_##o, \
+			(d), (m), (s0), (s1), (s2))
+#define tex(cc,s,o,u,d,m,s0,s1,s2) \
+	nv40_fp_tex((cc), (s), NV40_FP_OP_OPCODE_##o, (u), \
+		    (d), (m), (s0), none, none)
+#define temp(fpc) nv40_sr_temp((fpc))
+
+/**
+ * Encode one source operand into the instruction at `hw`.
+ *
+ * @param pos  operand slot (0..2); the operand word is OR-ed into hw[pos + 1]
+ * @param src  register to encode
+ *
+ * Inputs put their index in hw[0]; temps put it in the operand word;
+ * constants only flag the instruction and record which constant it reads,
+ * so nv40_fp_arith() can reserve a slot and note its id.  The abs modifier
+ * is a per-slot bit in hw[1] (bit 29 + pos).
+ */
+static void
+emit_src(struct nv40_fpc *fpc, uint32_t *hw, int pos, struct nv40_sreg src)
+{
+	uint32_t sr = 0;
+
+	sr |= (src.type << NV40_FP_REG_TYPE_SHIFT);
+	if (src.type == NV40_FP_REG_TYPE_INPUT) {
+		hw[0] |= (src.index << NV40_FP_OP_INPUT_SRC_SHIFT);
+	} else
+	if (src.type == NV40_FP_REG_TYPE_CONST) {
+		fpc->inst_has_const = TRUE;
+		/* nv40_fp_arith() reads inst_const_id, but nothing ever set
+		 * it, so every constant was recorded as id 0. */
+		fpc->inst_const_id = src.index;
+	} else
+	if (src.type == NV40_FP_REG_TYPE_TEMP) {
+		sr |= (src.index << NV40_FP_REG_SRC_SHIFT);
+	}
+
+	if (src.negate)
+		sr |= NV40_FP_REG_NEGATE;
+
+	if (src.abs)
+		hw[1] |= (1 << (29 + pos));
+
+	sr |= ((src.swz[0] << NV40_FP_REG_SWZ_X_SHIFT) |
+	       (src.swz[1] << NV40_FP_REG_SWZ_Y_SHIFT) |
+	       (src.swz[2] << NV40_FP_REG_SWZ_Z_SHIFT) |
+	       (src.swz[3] << NV40_FP_REG_SWZ_W_SHIFT));
+
+	hw[pos + 1] |= sr;
+}
+
+/**
+ * Encode the destination register into hw[0] and update program metadata:
+ * temps raise fp->num_regs to cover the index, output index 1 marks the
+ * program as writing depth, and any other output sets NV40_FP_OP_UNK0_7.
+ */
+static void
+emit_dst(struct nv40_fpc *fpc, uint32_t *hw, struct nv40_sreg dst)
+{
+	struct nv40_fragment_program *prog = fpc->fp;
+
+	if (!dst.output) {
+		/* keep the temp count large enough to include this register */
+		if (prog->num_regs < (dst.index + 1))
+			prog->num_regs = dst.index + 1;
+	} else if (dst.index == 1) {
+		prog->writes_depth = 1;
+	} else {
+		hw[0] |= NV40_FP_OP_UNK0_7;
+	}
+
+	hw[0] |= (dst.index << NV40_FP_OP_OUT_REG_SHIFT);
+}
+
+/**
+ * Append one 4-word instruction to the program.
+ *
+ * Fills opcode, write mask, destination scale, saturate, condition-code
+ * update/test/swizzle, then the destination and the three sources.  If any
+ * source was a constant, a further 4 words are reserved directly after the
+ * instruction and their offset is recorded in fp->consts[] together with
+ * fpc->inst_const_id.
+ *
+ * NOTE(review): there is no bounds check against the size of fp->insn or
+ * fp->consts; both are assumed large enough by the caller.
+ */
+static void
+nv40_fp_arith(struct nv40_fpc *fpc, int sat, int op,
+	      struct nv40_sreg dst, int mask,
+	      struct nv40_sreg s0, struct nv40_sreg s1, struct nv40_sreg s2)
+{
+	struct nv40_fragment_program *fp = fpc->fp;
+	uint32_t *hw = &fp->insn[fp->insn_len];
+
+	/* emit_src() sets this again if one of s0..s2 is a constant */
+	fpc->inst_has_const = FALSE;
+
+	if (op == NV40_FP_OP_OPCODE_KIL)
+		fp->uses_kil = TRUE;
+	hw[0] |= (op << NV40_FP_OP_OPCODE_SHIFT);
+	hw[0] |= (mask << NV40_FP_OP_OUTMASK_SHIFT);
+	hw[2] |= (dst.dst_scale << NV40_FP_OP_DST_SCALE_SHIFT);
+
+	if (sat)
+		hw[0] |= NV40_FP_OP_OUT_SAT;
+
+	if (dst.cc_update)
+		hw[0] |= NV40_FP_OP_COND_WRITE_ENABLE;
+	hw[1] |= (dst.cc_test << NV40_FP_OP_COND_SHIFT);
+	hw[1] |= ((dst.cc_swz[0] << NV40_FP_OP_COND_SWZ_X_SHIFT) |
+		  (dst.cc_swz[1] << NV40_FP_OP_COND_SWZ_Y_SHIFT) |
+		  (dst.cc_swz[2] << NV40_FP_OP_COND_SWZ_Z_SHIFT) |
+		  (dst.cc_swz[3] << NV40_FP_OP_COND_SWZ_W_SHIFT));
+
+	emit_dst(fpc, hw, dst);
+	emit_src(fpc, hw, 0, s0);
+	emit_src(fpc, hw, 1, s1);
+	emit_src(fpc, hw, 2, s2);
+
+	fp->insn_len += 4;
+	if (fpc->inst_has_const) {
+		/* hw_id is the word offset of the reserved constant slot */
+		fp->consts[fp->num_consts].pipe_id = fpc->inst_const_id;
+		fp->consts[fp->num_consts].hw_id = fp->insn_len;
+		fp->num_consts++;
+		fp->insn_len += 4;
+	}
+}
+
+/**
+ * Append a texture instruction: an ordinary nv40_fp_arith() instruction
+ * with the texture unit number OR-ed into its first word.
+ */
+static void
+nv40_fp_tex(struct nv40_fpc *fpc, int sat, int op, int unit,
+	    struct nv40_sreg dst, int mask,
+	    struct nv40_sreg s0, struct nv40_sreg s1, struct nv40_sreg s2)
+{
+	struct nv40_fragment_program *fp = fpc->fp;
+	/* Captured before nv40_fp_arith() advances fp->insn_len. */
+	uint32_t *hw = &fp->insn[fp->insn_len];
+
+	nv40_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2);
+	hw[0] |= (unit << NV40_FP_OP_TEX_UNIT_SHIFT);
+}
+
+/**
+ * Translate a TGSI source operand into an nv40 source register.
+ *
+ * Inputs are mapped through fpc->attrib_map, constants keep their index, and
+ * TGSI temporary N becomes hardware temp N + 1 (raising fpc->high_temp as
+ * needed).  Absolute, negate and swizzle modifiers are copied across.
+ *
+ * An unsupported register file is logged and translated as input 0, the
+ * same placeholder the instruction parser uses for unused operands.
+ */
+static INLINE struct nv40_sreg
+tgsi_src(struct nv40_fpc *fpc, const struct tgsi_full_src_register *fsrc)
+{
+	struct nv40_sreg src;
+	/* Defaults cover the error path; they used to be uninitialised. */
+	uint type = NV40_FP_REG_TYPE_INPUT;
+	uint index = 0;
+
+	switch (fsrc->SrcRegister.File) {
+	case TGSI_FILE_INPUT:
+		type   = NV40_FP_REG_TYPE_INPUT;
+		index  = fpc->attrib_map[fsrc->SrcRegister.Index];
+		break;
+	case TGSI_FILE_CONSTANT:
+		type   = NV40_FP_REG_TYPE_CONST;
+		index  = fsrc->SrcRegister.Index;
+		break;
+	case TGSI_FILE_TEMPORARY:
+		type   = NV40_FP_REG_TYPE_TEMP;
+		index  = fsrc->SrcRegister.Index + 1;
+		/* Compare as int: high_temp starts at -1, which an unsigned
+		 * comparison would treat as the largest possible value. */
+		if (fpc->high_temp < (int)index)
+			fpc->high_temp = (int)index;
+		break;
+	default:
+		NOUVEAU_ERR("bad src file\n");
+		break;
+	}
+
+	src = nv40_sr(0, type, index);
+	src.abs = fsrc->SrcRegisterExtMod.Absolute;
+	src.negate = fsrc->SrcRegister.Negate;
+	src.swz[0] = fsrc->SrcRegister.SwizzleX;
+	src.swz[1] = fsrc->SrcRegister.SwizzleY;
+	src.swz[2] = fsrc->SrcRegister.SwizzleZ;
+	src.swz[3] = fsrc->SrcRegister.SwizzleW;
+	return src;
+}
+
+/**
+ * Translate a TGSI destination operand into an nv40 register.
+ *
+ * Outputs map to index 0 when the TGSI index equals the declared colour
+ * output, and to index 1 otherwise.  TGSI temporary N becomes hardware temp
+ * N + 1 (raising fpc->high_temp as needed).
+ *
+ * TGSI_FILE_NULL and unsupported files yield non-output register 0; these
+ * values used to be read uninitialised.
+ * NOTE(review): register 0 is only a deterministic placeholder -- confirm
+ * what a NULL destination should encode to on this hardware.
+ */
+static INLINE struct nv40_sreg
+tgsi_dst(struct nv40_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
+	int out = 0, idx = 0;
+
+	switch (fdst->DstRegister.File) {
+	case TGSI_FILE_OUTPUT:
+		out = 1;
+		if (fdst->DstRegister.Index == fpc->colour_id)
+			idx = 0;
+		else
+			idx = 1;
+		break;
+	case TGSI_FILE_TEMPORARY:
+		out = 0;
+		idx = fdst->DstRegister.Index + 1;
+		if (fpc->high_temp < idx)
+			fpc->high_temp = idx;
+		break;
+	case TGSI_FILE_NULL:
+		break;
+	default:
+		NOUVEAU_ERR("bad dst file %d\n", fdst->DstRegister.File);
+		break;
+	}
+
+	return nv40_sr(out, NV40_FP_REG_TYPE_TEMP, idx);
+}
+
+/** Convert a TGSI write mask into the MASK_X/Y/Z/W bits used in this file. */
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	int hw_mask = 0;
+
+	hw_mask |= (tgsi & TGSI_WRITEMASK_X) ? MASK_X : 0;
+	hw_mask |= (tgsi & TGSI_WRITEMASK_Y) ? MASK_Y : 0;
+	hw_mask |= (tgsi & TGSI_WRITEMASK_Z) ? MASK_Z : 0;
+	hw_mask |= (tgsi & TGSI_WRITEMASK_W) ? MASK_W : 0;
+
+	return hw_mask;
+}
+
+/**
+ * Translate one TGSI instruction into nv40 fragment program instructions.
+ *
+ * Sources are resolved in two passes: temporaries first (so fpc->high_temp
+ * is final before any scratch temp is handed out), then inputs, constants
+ * and samplers.  An instruction may reference only one distinct input and
+ * one distinct constant directly; further ones are first copied into
+ * scratch temps with a MOV.
+ *
+ * Returns TRUE on success (RET is accepted and emits nothing), FALSE for an
+ * unsupported source file or opcode.
+ */
+static boolean
+nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
+				const struct tgsi_full_instruction *finst)
+{
+	struct nv40_sreg src[3], dst, tmp;
+	struct nv40_sreg none = nv40_sr(0, NV40_FP_REG_TYPE_INPUT, 0);
+	/* unit defaults to 0 so a TEX/TXB without a sampler operand no longer
+	 * reads an uninitialised value */
+	int mask, sat, unit = 0;
+	int ai = -1, ci = -1;
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_RET)
+		return TRUE;
+
+	fpc->temp_temp_count = 0;
+	/* Pass 1: temporaries only. */
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+			src[i] = tgsi_src(fpc, fsrc);
+		}
+	}
+
+	/* Pass 2: inputs, constants, samplers. */
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+			if (ai == -1 || ai == fsrc->SrcRegister.Index) {
+				ai = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				NOUVEAU_MSG("extra src attr %d\n",
+					 fsrc->SrcRegister.Index);
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_CONSTANT:
+			if (ci == -1 || ci == fsrc->SrcRegister.Index) {
+				ci = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+			/* handled above */
+			break;
+		case TGSI_FILE_SAMPLER:
+			unit = fsrc->SrcRegister.Index;
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(fpc, &finst->FullDstRegisters[0]);
+	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
+	sat  = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
+
+	switch (finst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_CMP:
+		/* dst = src2; set CC from src0; where CC is LT, dst = src1 */
+		tmp = temp(fpc);
+		arith(fpc, sat, MOV, dst, mask, src[2], none, none);
+		tmp.cc_update = 1;
+		arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
+		/* fragment-program condition constant, as in the KIL case */
+		dst.cc_test = NV40_FP_OP_COND_LT;
+		arith(fpc, sat, MOV, dst, mask, src[1], none, none);
+		break;
+	case TGSI_OPCODE_COS:
+		arith(fpc, sat, COS, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(fpc, sat, DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(fpc, sat, DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		tmp = temp(fpc);
+		arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[1], none);
+		arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X),
+		      swz(src[1], W, W, W, W), none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(fpc, sat, DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(fpc, sat, EX2, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(fpc, sat, FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(fpc, sat, FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_KIL:
+	case TGSI_OPCODE_KILP:
+		/*XXX: Which is NV, which is ARB kil? ARB implemented here.
+		 *XXX: Don't need temp, can update CC0 without writing dst
+		 */
+		tmp = temp(fpc);
+		tmp.cc_update = 1;
+		arith(fpc, 0, MOV, tmp, MASK_ALL, src[0], none, none);
+		dst.cc_test = NV40_FP_OP_COND_LT;
+		arith(fpc, 0, KIL, dst, 0, none, none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(fpc, sat, LG2, dst, mask, src[0], none, none);
+		break;
+//	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_LRP:
+		/* tmp = src2 - src0*src2; dst = src0*src1 + tmp */
+		tmp = temp(fpc);
+		arith(fpc, 0, MAD, tmp, mask, neg(src[0]), src[2], src[2]);
+		arith(fpc, sat, MAD, dst, mask, src[0], src[1], tmp);
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(fpc, sat, MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(fpc, sat, MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(fpc, sat, MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(fpc, sat, MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_POW:
+		/* dst = 2^(log2(src0.x) * src1.x) */
+		tmp = temp(fpc);
+		arith(fpc, 0, LG2, tmp, MASK_X,
+		      swz(src[0], X, X, X, X), none, none);
+		arith(fpc, 0, MUL, tmp, MASK_X, swz(tmp, X, X, X, X),
+		      swz(src[1], X, X, X, X), none);
+		arith(fpc, sat, EX2, dst, mask,
+		      swz(tmp, X, X, X, X), none, none);
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(fpc, sat, RCP, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_RFL:
+		tmp = temp(fpc);
+		arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[0], none);
+		arith(fpc, 0, DP3, tmp, MASK_Y, src[0], src[1], none);
+		arith(fpc, 0, DIV, scale(tmp, 2X), MASK_Z,
+		      swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none);
+		arith(fpc, sat, MAD, dst, mask,
+		      swz(tmp, Z, Z, Z, Z), src[0], neg(src[1]));
+		break;
+	case TGSI_OPCODE_RSQ:
+		/* dst = 2^(-0.5 * log2(|src0.x|)) */
+		tmp = temp(fpc);
+		arith(fpc, 0, LG2, scale(tmp, INV_2X), MASK_X,
+		      abs(swz(src[0], X, X, X, X)), none, none);
+		arith(fpc, sat, EX2, dst, mask,
+		      neg(swz(tmp, X, X, X, X)), none, none);
+		break;
+	case TGSI_OPCODE_SCS:
+		if (mask & MASK_X) {
+			arith(fpc, sat, COS, dst, MASK_X,
+			      swz(src[0], X, X, X, X), none, none);
+		}
+		if (mask & MASK_Y) {
+			arith(fpc, sat, SIN, dst, MASK_Y,
+			      swz(src[0], X, X, X, X), none, none);
+		}
+		break;
+	case TGSI_OPCODE_SIN:
+		arith(fpc, sat, SIN, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(fpc, sat, SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(fpc, sat, SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB:
+		arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none);
+		break;
+	case TGSI_OPCODE_TEX:
+		tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_TXB:
+		tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none);
+		break;
+#if 0 /* XXX: reimplement on top of TEX */
+	case TGSI_OPCODE_TXP:
+		tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none);
+		break;
+#endif
+	case TGSI_OPCODE_XPD:
+		tmp = temp(fpc);
+		arith(fpc, 0, MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(fpc, sat, MAD, dst, (mask & ~MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+/**
+ * Record the hardware input source for a declared fragment program input.
+ *
+ * POSITION, COLOR 0/1, FOG and GENERIC 0..7 (mapped to texcoords) are
+ * supported.  The result is stored in fpc->attrib_map at the first index of
+ * the declaration range.
+ *
+ * Returns FALSE (after logging) for an unsupported semantic or an index that
+ * does not fit in attrib_map.
+ */
+static boolean
+nv40_fragprog_parse_decl_attrib(struct nv40_fpc *fpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	int hw;
+
+	switch (fdec->Semantic.SemanticName) {
+	case TGSI_SEMANTIC_POSITION:
+		hw = NV40_FP_OP_INPUT_SRC_POSITION;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		if (fdec->Semantic.SemanticIndex == 0) {
+			hw = NV40_FP_OP_INPUT_SRC_COL0;
+		} else
+		if (fdec->Semantic.SemanticIndex == 1) {
+			hw = NV40_FP_OP_INPUT_SRC_COL1;
+		} else {
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_FOG:
+		hw = NV40_FP_OP_INPUT_SRC_FOGC;
+		break;
+	case TGSI_SEMANTIC_GENERIC:
+		if (fdec->Semantic.SemanticIndex <= 7) {
+			hw = NV40_FP_OP_INPUT_SRC_TC(fdec->Semantic.
+						     SemanticIndex);
+		} else {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		break;
+	default:
+		NOUVEAU_ERR("bad input semantic\n");
+		return FALSE;
+	}
+
+	/* attrib_map has PIPE_MAX_SHADER_INPUTS entries; reject anything
+	 * outside it instead of writing past the array. */
+	if (fdec->u.DeclarationRange.First >= PIPE_MAX_SHADER_INPUTS) {
+		NOUVEAU_ERR("input index out of range\n");
+		return FALSE;
+	}
+
+	fpc->attrib_map[fdec->u.DeclarationRange.First] = hw;
+	return TRUE;
+}
+
+/**
+ * Remember which TGSI output index carries depth (POSITION) and which
+ * carries colour (COLOR).  Any other output semantic is logged and rejected.
+ */
+static boolean
+nv40_fragprog_parse_decl_output(struct nv40_fpc *fpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	if (fdec->Semantic.SemanticName == TGSI_SEMANTIC_POSITION) {
+		fpc->depth_id = fdec->u.DeclarationRange.First;
+		return TRUE;
+	}
+
+	if (fdec->Semantic.SemanticName == TGSI_SEMANTIC_COLOR) {
+		fpc->colour_id = fdec->u.DeclarationRange.First;
+		return TRUE;
+	}
+
+	NOUVEAU_ERR("bad output semantic\n");
+	return FALSE;
+}
+
+/**
+ * Translate fp->pipe->tokens (TGSI) into hardware instructions in fp->insn.
+ *
+ * On success fp->translated is set and fp->on_hw is cleared so the next
+ * bind uploads the new code.  On any failure (allocation, unsupported
+ * declaration or instruction, or a program that emits no instructions)
+ * fp->translated is left untouched, and nv40_fragprog_bind() will fall back
+ * to the passthrough program if it is still FALSE.
+ *
+ * The instruction buffer is a fixed 128 * 4 words.
+ * NOTE(review): fp->insn is not freed on the failure paths after it has
+ * been allocated, and fp->insn_len is not reset here; confirm who owns
+ * that cleanup.
+ */
+void
+nv40_fragprog_translate(struct nv40_context *nv40,
+			struct nv40_fragment_program *fp)
+{
+	struct tgsi_parse_context parse;
+	struct nv40_fpc *fpc = NULL;
+
+	fpc = calloc(1, sizeof(struct nv40_fpc));
+	if (!fpc)
+		return;
+	fp->insn = calloc(1, 128*4*sizeof(uint32_t));
+	if (!fp->insn) {
+		/* Previously written through without a check. */
+		free(fpc);
+		return;
+	}
+	fpc->fp = fp;
+	fpc->high_temp = -1;
+	fp->num_regs = 2;
+
+	tgsi_parse_init(&parse, fp->pipe->tokens);
+
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		switch (parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+			fdec = &parse.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_INPUT:
+				if (!nv40_fragprog_parse_decl_attrib(fpc, fdec))
+					goto out_err;
+				break;
+			case TGSI_FILE_OUTPUT:
+				if (!nv40_fragprog_parse_decl_output(fpc, fdec))
+					goto out_err;
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+			break;
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+
+			finst = &parse.FullToken.FullInstruction;
+			if (!nv40_fragprog_parse_instruction(fpc, finst))
+				goto out_err;
+		}
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* Nothing was emitted (e.g. only a RET): there is no last instruction
+	 * to flag, and indexing insn_len - 4 would run off the buffer. */
+	if (fp->insn_len < 4)
+		goto out_err;
+
+	/* Set bit 0 in the first word of the last instruction; it sits 8
+	 * words back when that instruction is followed by a constant slot.
+	 * inst_has_const still describes the last emitted instruction. */
+	if (fpc->inst_has_const == FALSE)
+		fp->insn[fp->insn_len - 4] |= 0x00000001;
+	else
+		fp->insn[fp->insn_len - 8] |= 0x00000001;
+	fp->insn[fp->insn_len++]  = 0x00000001;
+
+	fp->translated = TRUE;
+	fp->on_hw = FALSE;
+out_err:
+	tgsi_parse_free(&parse);
+	free(fpc);
+}
+
+/**
+ * Make a fragment program current on the hardware.
+ *
+ * An untranslated program is replaced by the built-in passthrough program
+ * (with a log message).  The instruction words are uploaded into the
+ * program's buffer the first time it is bound (the buffer is created on
+ * demand), then the program address and control word are emitted and the
+ * program is recorded as the active one.
+ */
+void
+nv40_fragprog_bind(struct nv40_context *nv40, struct nv40_fragment_program *fp)
+{
+	struct pipe_winsys *ws = nv40->pipe.winsys;
+	uint32_t fp_control;
+
+	if (!fp->translated) {
+		NOUVEAU_ERR("fragprog invalid, using passthrough shader\n");
+		fp = &passthrough_fp;
+	}
+
+	if (!fp->on_hw) {
+		if (!fp->buffer)
+			fp->buffer = ws->buffer_create(ws, 0x100);
+
+		/* upload the encoded instructions */
+		ws->buffer_data(ws, fp->buffer,
+				fp->insn_len * sizeof(uint32_t), fp->insn,
+				PIPE_BUFFER_USAGE_PIXEL);
+		fp->on_hw = TRUE;
+	}
+
+	/* control word: temp count, plus KIL and depth-write flags */
+	fp_control  = fp->num_regs << NV40TCL_FP_CONTROL_TEMP_COUNT_SHIFT;
+	fp_control |= fp->uses_kil ? NV40TCL_FP_CONTROL_KIL : 0;
+	fp_control |= fp->writes_depth ? 0xe : 0;
+
+	BEGIN_RING(curie, NV40TCL_FP_ADDRESS, 1);
+	OUT_RELOC (fp->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
+		   NOUVEAU_BO_RD | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+		   NV40TCL_FP_ADDRESS_DMA0, NV40TCL_FP_ADDRESS_DMA1);
+	BEGIN_RING(curie, NV40TCL_FP_CONTROL, 1);
+	OUT_RING  (fp_control);
+
+	nv40->fragprog.active_fp = fp;
+}
+
diff --git a/src/mesa/pipe/nv40/nv40_miptree.c b/src/mesa/pipe/nv40/nv40_miptree.c
new file mode 100644
index 0000000000..6b85823d8c
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_miptree.c
@@ -0,0 +1,60 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_util.h"
+
+#include "nv40_context.h"
+
+/*
+ * Lay out every mipmap level of mt, from first_level to last_level.
+ *
+ * For each level this records the dimensions and byte offset of the level,
+ * allocates its image_offset table and fills it with per-image offsets
+ * (relative to the level start, divided by mt->cpp).  mt->pitch is set to
+ * width0 and mt->total_height accumulates the height of every image.
+ *
+ * Returns TRUE on success.  Returns FALSE if an image_offset table cannot be
+ * allocated; the tables allocated earlier in the same call are freed and
+ * their pointers reset to NULL, the other fields written so far are left
+ * as they are.
+ */
+boolean
+nv40_miptree_layout(struct pipe_context *pipe, struct pipe_mipmap_tree *mt)
+{
+	uint width, height, depth, offset;
+	boolean swizzled = FALSE;
+	int l;
+
+	mt->pitch = mt->width0;
+	mt->total_height = 0;
+
+	width = mt->width0;
+	height = mt->height0;
+	depth = mt->depth0;
+	offset = 0;
+	for (l = mt->first_level; l <= mt->last_level; l++) {
+		uint pitch, f;
+
+		mt->level[l].width = width;
+		mt->level[l].height = height;
+		mt->level[l].depth = depth;
+		mt->level[l].level_offset = offset;
+
+		/* Linear layout keeps the base-level pitch for every level. */
+		if (!swizzled)
+			pitch = mt->width0;
+		else
+			pitch = width;
+
+		/* NOTE(review): 3D textures get a fixed 3 images rather than
+		 * one per depth slice -- confirm this is intended.
+		 */
+		if (mt->target == PIPE_TEXTURE_CUBE)
+			mt->level[l].nr_images = 6;
+		else
+		if (mt->target == PIPE_TEXTURE_3D)
+			mt->level[l].nr_images = 3;
+		else
+			mt->level[l].nr_images = 1;
+		mt->level[l].image_offset =
+			malloc(mt->level[l].nr_images * sizeof(unsigned));
+		if (!mt->level[l].image_offset) {
+			int k;
+
+			/* Undo the allocations made for the earlier levels. */
+			for (k = mt->first_level; k < l; k++) {
+				free(mt->level[k].image_offset);
+				mt->level[k].image_offset = NULL;
+			}
+			return FALSE;
+		}
+
+		for (f = 0; f < mt->level[l].nr_images; f++) {
+			mt->level[l].image_offset[f] =
+				(offset - mt->level[l].level_offset) / mt->cpp;
+			mt->total_height += height;
+
+			offset += (pitch * mt->cpp * height);
+		}
+
+		width  = MAX2(1, width  >> 1);
+		height = MAX2(1, height >> 1);
+		depth  = MAX2(1, depth  >> 1);
+	}
+
+	return TRUE;
+}
+
diff --git a/src/mesa/pipe/nv40/nv40_query.c b/src/mesa/pipe/nv40/nv40_query.c
new file mode 100644
index 0000000000..efd81e6640
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_query.c
@@ -0,0 +1,98 @@
+#include "pipe/p_context.h"
+
+#include "nv40_context.h"
+#include "nv40_dma.h"
+
+/*
+ * Search nv40->query_objects for the entry equal to q.
+ *
+ * Returns the index of the first matching slot, or -1 when none of the
+ * num_query_objects slots matches.  Callers pass NULL to look for an unused
+ * slot.  The return type is signed so that the -1 sentinel survives intact
+ * and the callers' "id >= 0" checks are meaningful.
+ */
+static int
+nv40_query_object_find(struct nv40_context *nv40, struct pipe_query_object *q)
+{
+	int id;
+
+	for (id = 0; id < nv40->num_query_objects; id++) {
+		if (nv40->query_objects[id] == q)
+			return id;
+	}
+
+	return -1;
+}
+
+/*
+ * Start an occlusion query.
+ *
+ * Binds q to an unused slot of nv40->query_objects, resets the notifier for
+ * that slot, marks the query as not ready and emits the QUERY_RESET and
+ * QUERY_UNK17CC methods.
+ */
+void
+nv40_query_begin(struct pipe_context *pipe, struct pipe_query_object *q)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	int slot;
+
+	assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
+
+	/* A NULL entry marks a slot that is not in use. */
+	slot = nv40_query_object_find(nv40, NULL);
+	assert(slot >= 0);
+	nv40->query_objects[slot] = q;
+
+	nv40->nvws->notifier_reset(nv40->query, slot);
+	q->ready = 0;
+
+	BEGIN_RING(curie, NV40TCL_QUERY_RESET, 1);
+	OUT_RING  (1);
+	BEGIN_RING(curie, NV40TCL_QUERY_UNK17CC, 1);
+	OUT_RING  (1);
+}
+
+/*
+ * Poll the notifier that belongs to q.
+ *
+ * When the notifier status is 0 the query is marked ready, its count is read
+ * from the notifier return value and its slot is released.  Otherwise q is
+ * left untouched.
+ */
+static void
+nv40_query_update(struct pipe_context *pipe, struct pipe_query_object *q)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	int slot = nv40_query_object_find(nv40, q);
+
+	assert(slot >= 0);
+
+	if (nv40->nvws->notifier_status(nv40->query, slot) != 0)
+		return;
+
+	q->ready = 1;
+	q->count = nv40->nvws->notifier_retval(nv40->query, slot);
+	/* Give the slot back so a later query can reuse it. */
+	nv40->query_objects[slot] = NULL;
+}
+
+/*
+ * Finish an occlusion query.
+ *
+ * Emits a QUERY_GET method whose offset field is the query's slot index
+ * times 32, fires the ring and then waits for the result.
+ */
+void
+nv40_query_end(struct pipe_context *pipe, struct pipe_query_object *q)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	int slot = nv40_query_object_find(nv40, q);
+
+	assert(slot >= 0);
+
+	BEGIN_RING(curie, NV40TCL_QUERY_GET, 1);
+	OUT_RING  ((0x01 << NV40TCL_QUERY_GET_UNK24_SHIFT) |
+		   ((slot * 32) << NV40TCL_QUERY_GET_OFFSET_SHIFT));
+	FIRE_RING ();
+
+	/*XXX: Some applications spin on GL_QUERY_RESULT_AVAILABLE_ARB.
+	 *     Core mesa does not ask the driver to refresh the query object's
+	 *     status in that case, so such an application would wait forever.
+	 *     Until that is fixed, block here instead of just polling.
+	 */
+#if 0
+	nv40_query_update(pipe, q);
+#else
+	nv40_query_wait(pipe, q);
+#endif
+}
+
+/*
+ * Block until the result of q is available.
+ *
+ * Polls once; if the query is still not ready, waits on the notifier of the
+ * query's slot and polls again.
+ */
+void
+nv40_query_wait(struct pipe_context *pipe, struct pipe_query_object *q)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	int slot;
+
+	nv40_query_update(pipe, q);
+	if (q->ready)
+		return;
+
+	slot = nv40_query_object_find(nv40, q);
+	assert(slot >= 0);
+
+	nv40->nvws->notifier_wait(nv40->query, slot, 0, 0);
+	nv40_query_update(pipe, q);
+	assert(q->ready);
+}
+
diff --git a/src/mesa/pipe/nv40/nv40_region.c b/src/mesa/pipe/nv40/nv40_region.c
new file mode 100644
index 0000000000..f62bf89d18
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_region.c
@@ -0,0 +1,85 @@
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+
+#include "nv40_context.h"
+#include "nv40_dma.h"
+
+/*
+ * Map a region for CPU access and return the mapping.
+ *
+ * Mappings are reference counted: the winsys buffer is mapped (read/write)
+ * only when the count goes from zero to one; later calls return the cached
+ * pointer.
+ */
+static ubyte *
+nv40_region_map(struct pipe_context *pipe, struct pipe_region *region)
+{
+	struct pipe_winsys *ws = ((struct nv40_context *)pipe)->pipe.winsys;
+
+	if (region->map_refcount++ == 0) {
+		region->map = ws->buffer_map(ws, region->buffer,
+					     PIPE_BUFFER_FLAG_WRITE |
+					     PIPE_BUFFER_FLAG_READ);
+	}
+
+	return region->map;
+}
+
+/*
+ * Drop one mapping reference on a region.
+ *
+ * The winsys buffer is unmapped and the cached pointer cleared when the
+ * reference count reaches zero.
+ */
+static void
+nv40_region_unmap(struct pipe_context *pipe, struct pipe_region *region)
+{
+	struct pipe_winsys *ws = ((struct nv40_context *)pipe)->pipe.winsys;
+
+	if (--region->map_refcount != 0)
+		return;
+
+	ws->buffer_unmap(ws, region->buffer);
+	region->map = NULL;
+}
+
+/*
+ * Upload a rectangle of client memory into a region.
+ * Forwards all arguments unchanged to the nouveau winsys region_data hook.
+ */
+static void
+nv40_region_data(struct pipe_context *pipe,
+	       struct pipe_region *dst,
+	       unsigned dst_offset,
+	       unsigned dstx, unsigned dsty,
+	       const void *src, unsigned src_pitch,
+	       unsigned srcx, unsigned srcy, unsigned width, unsigned height)
+{
+	struct nouveau_winsys *nvws = ((struct nv40_context *)pipe)->nvws;
+
+	nvws->region_data(nvws->nv, dst, dst_offset, dstx, dsty,
+			  src, src_pitch, srcx, srcy, width, height);
+}
+
+
+/*
+ * Copy a rectangle from one region to another.
+ * Forwards all arguments unchanged to the nouveau winsys region_copy hook.
+ */
+static void
+nv40_region_copy(struct pipe_context *pipe, struct pipe_region *dst,
+		 unsigned dst_offset, unsigned dstx, unsigned dsty,
+		 struct pipe_region *src, unsigned src_offset,
+		 unsigned srcx, unsigned srcy, unsigned width, unsigned height)
+{
+	struct nouveau_winsys *nvws = ((struct nv40_context *)pipe)->nvws;
+
+	nvws->region_copy(nvws->nv, dst, dst_offset, dstx, dsty,
+			  src, src_offset, srcx, srcy, width, height);
+}
+
+/*
+ * Fill a rectangle of a region with a constant value.
+ * Forwards all arguments unchanged to the nouveau winsys region_fill hook.
+ */
+static void
+nv40_region_fill(struct pipe_context *pipe,
+		 struct pipe_region *dst, unsigned dst_offset,
+		 unsigned dstx, unsigned dsty,
+		 unsigned width, unsigned height, unsigned value)
+{
+	struct nouveau_winsys *nvws = ((struct nv40_context *)pipe)->nvws;
+
+	nvws->region_fill(nvws->nv, dst, dst_offset, dstx, dsty,
+			  width, height, value);
+}
+
+/*
+ * Install the nv40 region callbacks (map, unmap, data, copy, fill) into the
+ * pipe context embedded in nv40.
+ */
+void
+nv40_init_region_functions(struct nv40_context *nv40)
+{
+	struct pipe_context *pipe = &nv40->pipe;
+
+	pipe->region_map   = nv40_region_map;
+	pipe->region_unmap = nv40_region_unmap;
+	pipe->region_data  = nv40_region_data;
+	pipe->region_copy  = nv40_region_copy;
+	pipe->region_fill  = nv40_region_fill;
+}
+
diff --git a/src/mesa/pipe/nv40/nv40_shader.h b/src/mesa/pipe/nv40/nv40_shader.h
new file mode 100644
index 0000000000..5b2cf3e293
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_shader.h
@@ -0,0 +1,546 @@
+#ifndef __NV40_SHADER_H__
+#define __NV40_SHADER_H__
+
+/* Vertex programs instruction set
+ *
+ * The NV40 instruction set is very similar to NV30.  Most fields are in
+ * a slightly different position in the instruction however.
+ *
+ * Merged instructions
+ *     In some cases it is possible to put two instructions into one opcode
+ *     slot.  The rules for when this is OK are not entirely clear to me yet.
+ *
+ *     There are separate writemasks and dest temp register fields for each
+ *     grouping of instructions.  There is however only one field with the
+ *     ID of a result register.  Writing to temp/result regs is selected by
+ *     setting VEC_RESULT/SCA_RESULT.
+ *
+ * Temporary registers
+ *     The source/dest temp register fields have been extended by 1 bit, to
+ *     give a total of 32 temporary registers.
+ *
+ * Relative Addressing
+ *     NV40 can use an address register to index into vertex attribute regs.
+ *     This is done by putting the offset value into INPUT_SRC and setting
+ *     the INDEX_INPUT flag.
+ *
+ * Conditional execution (see NV_vertex_program{2,3} for details)
+ *     There is a second condition code register on NV40; its use is enabled
+ *     by setting the COND_REG_SELECT_1 flag.
+ *
+ * Texture lookup
+ *     TODO
+ */
+
+/* ---- OPCODE BITS 127:96 / data DWORD 0 --- */
+#define NV40_VP_INST_VEC_RESULT                                        (1 << 30)
+/* uncertain.. */
+#define NV40_VP_INST_COND_UPDATE_ENABLE                        ((1 << 14)|1<<29)
+/* use address reg as index into attribs */
+#define NV40_VP_INST_INDEX_INPUT                                       (1 << 27)
+#define NV40_VP_INST_COND_REG_SELECT_1                                 (1 << 25)
+#define NV40_VP_INST_ADDR_REG_SELECT_1                                 (1 << 24)
+#define NV40_VP_INST_SRC2_ABS                                          (1 << 23)
+#define NV40_VP_INST_SRC1_ABS                                          (1 << 22)
+#define NV40_VP_INST_SRC0_ABS                                          (1 << 21)
+#define NV40_VP_INST_VEC_DEST_TEMP_SHIFT                                      15
+#define NV40_VP_INST_VEC_DEST_TEMP_MASK                             (0x1F << 15)
+#define NV40_VP_INST_COND_TEST_ENABLE                                  (1 << 13)
+#define NV40_VP_INST_COND_SHIFT                                               10
+#define NV40_VP_INST_COND_MASK                                       (0x7 << 10)
+#    define NV40_VP_INST_COND_FL                                               0
+#    define NV40_VP_INST_COND_LT                                               1
+#    define NV40_VP_INST_COND_EQ                                               2
+#    define NV40_VP_INST_COND_LE                                               3
+#    define NV40_VP_INST_COND_GT                                               4
+#    define NV40_VP_INST_COND_NE                                               5
+#    define NV40_VP_INST_COND_GE                                               6
+#    define NV40_VP_INST_COND_TR                                               7
+#define NV40_VP_INST_COND_SWZ_X_SHIFT                                          8
+#define NV40_VP_INST_COND_SWZ_X_MASK                                    (3 << 8)
+#define NV40_VP_INST_COND_SWZ_Y_SHIFT                                          6
+#define NV40_VP_INST_COND_SWZ_Y_MASK                                    (3 << 6)
+#define NV40_VP_INST_COND_SWZ_Z_SHIFT                                          4
+#define NV40_VP_INST_COND_SWZ_Z_MASK                                    (3 << 4)
+#define NV40_VP_INST_COND_SWZ_W_SHIFT                                          2
+#define NV40_VP_INST_COND_SWZ_W_MASK                                    (3 << 2)
+#define NV40_VP_INST_COND_SWZ_ALL_SHIFT                                        2
+#define NV40_VP_INST_COND_SWZ_ALL_MASK                               (0xFF << 2)
+#define NV40_VP_INST_ADDR_SWZ_SHIFT                                            0
+#define NV40_VP_INST_ADDR_SWZ_MASK                                   (0x03 << 0)
+#define NV40_VP_INST0_KNOWN ( \
+                NV40_VP_INST_INDEX_INPUT | \
+                NV40_VP_INST_COND_REG_SELECT_1 | \
+                NV40_VP_INST_ADDR_REG_SELECT_1 | \
+                NV40_VP_INST_SRC2_ABS | \
+                NV40_VP_INST_SRC1_ABS | \
+                NV40_VP_INST_SRC0_ABS | \
+                NV40_VP_INST_VEC_DEST_TEMP_MASK | \
+                NV40_VP_INST_COND_TEST_ENABLE | \
+                NV40_VP_INST_COND_MASK | \
+                NV40_VP_INST_COND_SWZ_ALL_MASK | \
+                NV40_VP_INST_ADDR_SWZ_MASK)
+
+/* ---- OPCODE BITS 95:64 / data DWORD 1 --- */
+#define NV40_VP_INST_VEC_OPCODE_SHIFT                                         22
+#define NV40_VP_INST_VEC_OPCODE_MASK                                (0x1F << 22)
+#    define NV40_VP_INST_OP_NOP                                             0x00
+#    define NV40_VP_INST_OP_MOV                                             0x01
+#    define NV40_VP_INST_OP_MUL                                             0x02
+#    define NV40_VP_INST_OP_ADD                                             0x03
+#    define NV40_VP_INST_OP_MAD                                             0x04
+#    define NV40_VP_INST_OP_DP3                                             0x05
+#    define NV40_VP_INST_OP_DP4                                             0x07
+#    define NV40_VP_INST_OP_DPH                                             0x06
+#    define NV40_VP_INST_OP_DST                                             0x08
+#    define NV40_VP_INST_OP_MIN                                             0x09
+#    define NV40_VP_INST_OP_MAX                                             0x0A
+#    define NV40_VP_INST_OP_SLT                                             0x0B
+#    define NV40_VP_INST_OP_SGE                                             0x0C
+#    define NV40_VP_INST_OP_ARL                                             0x0D
+#    define NV40_VP_INST_OP_FRC                                             0x0E
+#    define NV40_VP_INST_OP_FLR                                             0x0F
+#    define NV40_VP_INST_OP_SEQ                                             0x10
+#    define NV40_VP_INST_OP_SFL                                             0x11
+#    define NV40_VP_INST_OP_SGT                                             0x12
+#    define NV40_VP_INST_OP_SLE                                             0x13
+#    define NV40_VP_INST_OP_SNE                                             0x14
+#    define NV40_VP_INST_OP_STR                                             0x15
+#    define NV40_VP_INST_OP_SSG                                             0x16
+#    define NV40_VP_INST_OP_ARR                                             0x17
+#    define NV40_VP_INST_OP_ARA                                             0x18
+#    define NV40_VP_INST_OP_TXWHAT                                          0x19
+#define NV40_VP_INST_SCA_OPCODE_SHIFT                                         27
+#define NV40_VP_INST_SCA_OPCODE_MASK                                (0x1F << 27)
+#    define NV40_VP_INST_OP_RCP                                             0x02
+#    define NV40_VP_INST_OP_RCC                                             0x03
+#    define NV40_VP_INST_OP_RSQ                                             0x04
+#    define NV40_VP_INST_OP_EXP                                             0x05
+#    define NV40_VP_INST_OP_LOG                                             0x06
+#    define NV40_VP_INST_OP_LIT                                             0x07
+#    define NV40_VP_INST_OP_BRA                                             0x09
+#    define NV40_VP_INST_OP_CAL                                             0x0B
+#    define NV40_VP_INST_OP_RET                                             0x0C
+#    define NV40_VP_INST_OP_LG2                                             0x0D
+#    define NV40_VP_INST_OP_EX2                                             0x0E
+#    define NV40_VP_INST_OP_SIN                                             0x0F
+#    define NV40_VP_INST_OP_COS                                             0x10
+#    define NV40_VP_INST_OP_PUSHA                                           0x13
+#    define NV40_VP_INST_OP_POPA                                            0x14
+#define NV40_VP_INST_CONST_SRC_SHIFT                                          12
+#define NV40_VP_INST_CONST_SRC_MASK                                 (0xFF << 12)
+#define NV40_VP_INST_INPUT_SRC_SHIFT                                           8
+#define NV40_VP_INST_INPUT_SRC_MASK                                  (0x0F << 8)
+#    define NV40_VP_INST_IN_POS                                                0
+#    define NV40_VP_INST_IN_WEIGHT                                             1
+#    define NV40_VP_INST_IN_NORMAL                                             2
+#    define NV40_VP_INST_IN_COL0                                               3
+#    define NV40_VP_INST_IN_COL1                                               4
+#    define NV40_VP_INST_IN_FOGC                                               5
+#    define NV40_VP_INST_IN_TC0                                                8
+/* Input index of texcoord n; n is parenthesized so expressions expand safely */
+#    define NV40_VP_INST_IN_TC(n)                                        (8+(n))
+#define NV40_VP_INST_SRC0H_SHIFT                                               0
+#define NV40_VP_INST_SRC0H_MASK                                      (0xFF << 0)
+#define NV40_VP_INST1_KNOWN ( \
+                NV40_VP_INST_VEC_OPCODE_MASK | \
+                NV40_VP_INST_SCA_OPCODE_MASK | \
+                NV40_VP_INST_CONST_SRC_MASK  | \
+                NV40_VP_INST_INPUT_SRC_MASK  | \
+                NV40_VP_INST_SRC0H_MASK \
+                )
+
+/* ---- OPCODE BITS 63:32 / data DWORD 2 --- */
+#define NV40_VP_INST_SRC0L_SHIFT                                              23
+#define NV40_VP_INST_SRC0L_MASK                                    (0x1FF << 23)
+#define NV40_VP_INST_SRC1_SHIFT                                                6
+#define NV40_VP_INST_SRC1_MASK                                    (0x1FFFF << 6)
+#define NV40_VP_INST_SRC2H_SHIFT                                               0
+#define NV40_VP_INST_SRC2H_MASK                                      (0x3F << 0)
+#define NV40_VP_INST_IADDRH_SHIFT                                              0
+#define NV40_VP_INST_IADDRH_MASK                                     (0x1F << 0)
+
+/* ---- OPCODE BITS 31:0 / data DWORD 3 --- */
+#define NV40_VP_INST_IADDRL_SHIFT                                             29
+#define NV40_VP_INST_IADDRL_MASK                                       (7 << 29)
+#define NV40_VP_INST_SRC2L_SHIFT                                              21
+#define NV40_VP_INST_SRC2L_MASK                                    (0x7FF << 21)
+#define NV40_VP_INST_SCA_WRITEMASK_SHIFT                                      17
+#define NV40_VP_INST_SCA_WRITEMASK_MASK                              (0xF << 17)
+#    define NV40_VP_INST_SCA_WRITEMASK_X                               (1 << 20)
+#    define NV40_VP_INST_SCA_WRITEMASK_Y                               (1 << 19)
+#    define NV40_VP_INST_SCA_WRITEMASK_Z                               (1 << 18)
+#    define NV40_VP_INST_SCA_WRITEMASK_W                               (1 << 17)
+#define NV40_VP_INST_VEC_WRITEMASK_SHIFT                                      13
+#define NV40_VP_INST_VEC_WRITEMASK_MASK                              (0xF << 13)
+#    define NV40_VP_INST_VEC_WRITEMASK_X                               (1 << 16)
+#    define NV40_VP_INST_VEC_WRITEMASK_Y                               (1 << 15)
+#    define NV40_VP_INST_VEC_WRITEMASK_Z                               (1 << 14)
+#    define NV40_VP_INST_VEC_WRITEMASK_W                               (1 << 13)
+#define NV40_VP_INST_SCA_RESULT                                        (1 << 12)
+#define NV40_VP_INST_SCA_DEST_TEMP_SHIFT                                       7
+#define NV40_VP_INST_SCA_DEST_TEMP_MASK                              (0x1F << 7)
+#define NV40_VP_INST_DEST_SHIFT                                                2
+#define NV40_VP_INST_DEST_MASK                                         (31 << 2)
+#    define NV40_VP_INST_DEST_POS                                              0
+#    define NV40_VP_INST_DEST_COL0                                             1
+#    define NV40_VP_INST_DEST_COL1                                             2
+#    define NV40_VP_INST_DEST_BFC0                                             3
+#    define NV40_VP_INST_DEST_BFC1                                             4
+#    define NV40_VP_INST_DEST_FOGC                                             5
+#    define NV40_VP_INST_DEST_PSZ                                              6
+#    define NV40_VP_INST_DEST_TC0                                              7
+/* Result index of texcoord n; n is parenthesized so expressions expand safely */
+#    define NV40_VP_INST_DEST_TC(n)                                      (7+(n))
+#    define NV40_VP_INST_DEST_TEMP                                          0x1F
+#define NV40_VP_INST_INDEX_CONST                                        (1 << 1)
+#define NV40_VP_INST_LAST                                               (1 << 0)
+#define NV40_VP_INST3_KNOWN ( \
+                NV40_VP_INST_SRC2L_MASK |\
+                NV40_VP_INST_SCA_WRITEMASK_MASK |\
+                NV40_VP_INST_VEC_WRITEMASK_MASK |\
+                NV40_VP_INST_SCA_DEST_TEMP_MASK |\
+                NV40_VP_INST_DEST_MASK |\
+                NV40_VP_INST_INDEX_CONST)
+
+/* Useful to split the source selection regs into their pieces */
+#define NV40_VP_SRC0_HIGH_SHIFT                                                9
+#define NV40_VP_SRC0_HIGH_MASK                                        0x0001FE00
+#define NV40_VP_SRC0_LOW_MASK                                         0x000001FF
+#define NV40_VP_SRC2_HIGH_SHIFT                                               11
+#define NV40_VP_SRC2_HIGH_MASK                                        0x0001F800
+#define NV40_VP_SRC2_LOW_MASK                                         0x000007FF
+
+/* Source selection - these are the bits you fill NV40_VP_INST_SRCn with */
+#define NV40_VP_SRC_NEGATE                                             (1 << 16)
+#define NV40_VP_SRC_SWZ_X_SHIFT                                               14
+#define NV40_VP_SRC_SWZ_X_MASK                                         (3 << 14)
+#define NV40_VP_SRC_SWZ_Y_SHIFT                                               12
+#define NV40_VP_SRC_SWZ_Y_MASK                                         (3 << 12)
+#define NV40_VP_SRC_SWZ_Z_SHIFT                                               10
+#define NV40_VP_SRC_SWZ_Z_MASK                                         (3 << 10)
+#define NV40_VP_SRC_SWZ_W_SHIFT                                                8
+#define NV40_VP_SRC_SWZ_W_MASK                                          (3 << 8)
+#define NV40_VP_SRC_SWZ_ALL_SHIFT                                              8
+#define NV40_VP_SRC_SWZ_ALL_MASK                                     (0xFF << 8)
+#define NV40_VP_SRC_TEMP_SRC_SHIFT                                             2
+#define NV40_VP_SRC_TEMP_SRC_MASK                                    (0x1F << 2)
+#define NV40_VP_SRC_REG_TYPE_SHIFT                                             0
+#define NV40_VP_SRC_REG_TYPE_MASK                                       (3 << 0)
+#    define NV40_VP_SRC_REG_TYPE_UNK0                                          0
+#    define NV40_VP_SRC_REG_TYPE_TEMP                                          1
+#    define NV40_VP_SRC_REG_TYPE_INPUT                                         2
+#    define NV40_VP_SRC_REG_TYPE_CONST                                         3
+
+
+/*
+ * Each fragment program opcode appears to be comprised of 4 32-bit values.
+ *
+ *         0 - Opcode, output reg/mask, ATTRIB source
+ *         1 - Source 0
+ *         2 - Source 1
+ *         3 - Source 2
+ *
+ * There appears to be no special difference between result regs and temp regs.
+ *                 result.color == R0.xyzw
+ *                 result.depth == R1.z
+ * When the fragprog contains instructions to write depth,
+ * NV30_TCL_PRIMITIVE_3D_UNK1D78=0 otherwise it is set to 1.
+ *
+ * Constants are inserted directly after the instruction that uses them.
+ * 
+ * It appears that it's not possible to use two input registers in one
+ * instruction as the input sourcing is done in the instruction dword
+ * and not the source selection dwords.  As such instructions such as:
+ * 
+ *                 ADD result.color, fragment.color, fragment.texcoord[0];
+ *
+ * must be split into two MOV's and then an ADD (nvidia does this) but
+ * I'm not sure why it's not just one MOV and then source the second input
+ * in the ADD instruction..
+ *
+ * Negation of the full source is done with NV30_FP_REG_NEGATE, arbitrary
+ * negation requires multiplication with a const.
+ *
+ * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO and
+ * SWIZZLE_ONE.
+ *
+ * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as
+ * SWIZZLE_ZERO is implemented simply by not writing to the relevant components
+ * of the destination.
+ *
+ * Looping
+ *   Loops appear to be fairly expensive on NV40 at least, the proprietary
+ *   driver goes to a lot of effort to avoid using the native looping
+ *   instructions.  If the total number of *executed* instructions between
+ *   REP/ENDREP or LOOP/ENDLOOP is <=500, the driver will unroll the loop.
+ *   The maximum loop count is 255.
+ *
+ * Conditional execution
+ *   TODO
+ * 
+ * Non-native instructions:
+ *         LIT
+ *         LRP - MAD+MAD
+ *         SUB - ADD, negate second source
+ *         RSQ - LG2 + EX2
+ *         POW - LG2 + MUL + EX2
+ *         SCS - COS + SIN
+ *         XPD
+ *         DP2 - MUL + ADD
+ *         NRM
+ */
+
+//== Opcode / Destination selection ==
+#define NV40_FP_OP_PROGRAM_END                                          (1 << 0)
+#define NV40_FP_OP_OUT_REG_SHIFT                                               1
+#define NV40_FP_OP_OUT_REG_MASK                                        (31 << 1)
+/* Needs to be set when writing outputs to get expected result.. */
+#define NV40_FP_OP_UNK0_7                                               (1 << 7)
+#define NV40_FP_OP_COND_WRITE_ENABLE                                    (1 << 8)
+#define NV40_FP_OP_OUTMASK_SHIFT                                               9
+#define NV40_FP_OP_OUTMASK_MASK                                       (0xF << 9)
+#    define NV40_FP_OP_OUT_X                                            (1 << 9)
+#    define NV40_FP_OP_OUT_Y                                            (1 <<10)
+#    define NV40_FP_OP_OUT_Z                                            (1 <<11)
+#    define NV40_FP_OP_OUT_W                                            (1 <<12)
+/* Uncertain about these, especially the input_src values.. it's possible that
+ * they can be dynamically changed.
+ */
+#define NV40_FP_OP_INPUT_SRC_SHIFT                                            13
+#define NV40_FP_OP_INPUT_SRC_MASK                                     (15 << 13)
+#    define NV40_FP_OP_INPUT_SRC_POSITION                                    0x0
+#    define NV40_FP_OP_INPUT_SRC_COL0                                        0x1
+#    define NV40_FP_OP_INPUT_SRC_COL1                                        0x2
+#    define NV40_FP_OP_INPUT_SRC_FOGC                                        0x3
+#    define NV40_FP_OP_INPUT_SRC_TC0                                         0x4
+/* Input source of texcoord n; n is parenthesized so expressions expand safely */
+#    define NV40_FP_OP_INPUT_SRC_TC(n)                               (0x4 + (n))
+#    define NV40_FP_OP_INPUT_SRC_FACING                                      0xE
+#define NV40_FP_OP_TEX_UNIT_SHIFT                                             17
+#define NV40_FP_OP_TEX_UNIT_MASK                                     (0xF << 17)
+#define NV40_FP_OP_PRECISION_SHIFT                                            22
+#define NV40_FP_OP_PRECISION_MASK                                      (3 << 22)
+#   define NV40_FP_PRECISION_FP32                                              0
+#   define NV40_FP_PRECISION_FP16                                              1
+#   define NV40_FP_PRECISION_FX12                                              2
+#define NV40_FP_OP_OPCODE_SHIFT                                               24
+#define NV40_FP_OP_OPCODE_MASK                                      (0x3F << 24)
+#        define NV40_FP_OP_OPCODE_NOP                                       0x00
+#        define NV40_FP_OP_OPCODE_MOV                                       0x01
+#        define NV40_FP_OP_OPCODE_MUL                                       0x02
+#        define NV40_FP_OP_OPCODE_ADD                                       0x03
+#        define NV40_FP_OP_OPCODE_MAD                                       0x04
+#        define NV40_FP_OP_OPCODE_DP3                                       0x05
+#        define NV40_FP_OP_OPCODE_DP4                                       0x06
+#        define NV40_FP_OP_OPCODE_DST                                       0x07
+#        define NV40_FP_OP_OPCODE_MIN                                       0x08
+#        define NV40_FP_OP_OPCODE_MAX                                       0x09
+#        define NV40_FP_OP_OPCODE_SLT                                       0x0A
+#        define NV40_FP_OP_OPCODE_SGE                                       0x0B
+#        define NV40_FP_OP_OPCODE_SLE                                       0x0C
+#        define NV40_FP_OP_OPCODE_SGT                                       0x0D
+#        define NV40_FP_OP_OPCODE_SNE                                       0x0E
+#        define NV40_FP_OP_OPCODE_SEQ                                       0x0F
+#        define NV40_FP_OP_OPCODE_FRC                                       0x10
+#        define NV40_FP_OP_OPCODE_FLR                                       0x11
+#        define NV40_FP_OP_OPCODE_KIL                                       0x12
+#        define NV40_FP_OP_OPCODE_PK4B                                      0x13
+#        define NV40_FP_OP_OPCODE_UP4B                                      0x14
+/* DDX/DDY can only write to XY */
+#        define NV40_FP_OP_OPCODE_DDX                                       0x15
+#        define NV40_FP_OP_OPCODE_DDY                                       0x16
+#        define NV40_FP_OP_OPCODE_TEX                                       0x17
+#        define NV40_FP_OP_OPCODE_TXP                                       0x18
+#        define NV40_FP_OP_OPCODE_TXD                                       0x19
+#        define NV40_FP_OP_OPCODE_RCP                                       0x1A
+#        define NV40_FP_OP_OPCODE_EX2                                       0x1C
+#        define NV40_FP_OP_OPCODE_LG2                                       0x1D
+#        define NV40_FP_OP_OPCODE_COS                                       0x22
+#        define NV40_FP_OP_OPCODE_SIN                                       0x23
+#        define NV40_FP_OP_OPCODE_PK2H                                      0x24
+#        define NV40_FP_OP_OPCODE_UP2H                                      0x25
+#        define NV40_FP_OP_OPCODE_PK4UB                                     0x27
+#        define NV40_FP_OP_OPCODE_UP4UB                                     0x28
+#        define NV40_FP_OP_OPCODE_PK2US                                     0x29
+#        define NV40_FP_OP_OPCODE_UP2US                                     0x2A
+#        define NV40_FP_OP_OPCODE_DP2A                                      0x2E
+#        define NV40_FP_OP_OPCODE_TXL                                       0x2F
+#        define NV40_FP_OP_OPCODE_TXB                                       0x31
+#        define NV40_FP_OP_OPCODE_DIV                                       0x3A
+#        define NV40_FP_OP_OPCODE_UNK_LIT                                   0x3C
+/* The use of these instructions appears to be indicated by bit 31 of DWORD 2.*/
+#        define NV40_FP_OP_BRA_OPCODE_BRK                                    0x0
+#        define NV40_FP_OP_BRA_OPCODE_CAL                                    0x1
+#        define NV40_FP_OP_BRA_OPCODE_IF                                     0x2
+#        define NV40_FP_OP_BRA_OPCODE_LOOP                                   0x3
+#        define NV40_FP_OP_BRA_OPCODE_REP                                    0x4
+#        define NV40_FP_OP_BRA_OPCODE_RET                                    0x5
+#define NV40_FP_OP_OUT_SAT                                             (1 << 31)
+
+/* high order bits of SRC0 */
+#define NV40_FP_OP_OUT_ABS                                             (1 << 29)
+#define NV40_FP_OP_COND_SWZ_W_SHIFT                                           27
+#define NV40_FP_OP_COND_SWZ_W_MASK                                     (3 << 27)
+#define NV40_FP_OP_COND_SWZ_Z_SHIFT                                           25
+#define NV40_FP_OP_COND_SWZ_Z_MASK                                     (3 << 25)
+#define NV40_FP_OP_COND_SWZ_Y_SHIFT                                           23
+#define NV40_FP_OP_COND_SWZ_Y_MASK                                     (3 << 23)
+#define NV40_FP_OP_COND_SWZ_X_SHIFT                                           21
+#define NV40_FP_OP_COND_SWZ_X_MASK                                     (3 << 21)
+#define NV40_FP_OP_COND_SWZ_ALL_SHIFT                                         21
+#define NV40_FP_OP_COND_SWZ_ALL_MASK                                (0xFF << 21)
+#define NV40_FP_OP_COND_SHIFT                                                 18
+#define NV40_FP_OP_COND_MASK                                        (0x07 << 18)
+#        define NV40_FP_OP_COND_FL                                             0
+#        define NV40_FP_OP_COND_LT                                             1
+#        define NV40_FP_OP_COND_EQ                                             2
+#        define NV40_FP_OP_COND_LE                                             3
+#        define NV40_FP_OP_COND_GT                                             4
+#        define NV40_FP_OP_COND_NE                                             5
+#        define NV40_FP_OP_COND_GE                                             6
+#        define NV40_FP_OP_COND_TR                                             7
+
+/* high order bits of SRC1 */
+/* Unsigned literal: shifting a signed 1 into bit 31 overflows int. */
+#define NV40_FP_OP_OPCODE_IS_BRANCH                                     (1U<<31)
+#define NV40_FP_OP_DST_SCALE_SHIFT                                            28
+/* NOTE(review): the INV_* values below (5..7) need three bits, but this
+ * mask only covers two - confirm the field width against the hardware.
+ */
+#define NV40_FP_OP_DST_SCALE_MASK                                      (3 << 28)
+#define NV40_FP_OP_DST_SCALE_1X                                                0
+#define NV40_FP_OP_DST_SCALE_2X                                                1
+#define NV40_FP_OP_DST_SCALE_4X                                                2
+#define NV40_FP_OP_DST_SCALE_8X                                                3
+#define NV40_FP_OP_DST_SCALE_INV_2X                                            5
+#define NV40_FP_OP_DST_SCALE_INV_4X                                            6
+#define NV40_FP_OP_DST_SCALE_INV_8X                                            7
+
+/* SRC1 LOOP */
+#define NV40_FP_OP_LOOP_INCR_SHIFT                                            19
+#define NV40_FP_OP_LOOP_INCR_MASK                                   (0xFF << 19)
+#define NV40_FP_OP_LOOP_INDEX_SHIFT                                           10
+#define NV40_FP_OP_LOOP_INDEX_MASK                                  (0xFF << 10)
+#define NV40_FP_OP_LOOP_COUNT_SHIFT                                            2
+#define NV40_FP_OP_LOOP_COUNT_MASK                                   (0xFF << 2)
+
+/* SRC1 IF */
+#define NV40_FP_OP_ELSE_ID_SHIFT                                               2
+#define NV40_FP_OP_ELSE_ID_MASK                                      (0xFF << 2)
+
+/* SRC1 CAL */
+#define NV40_FP_OP_IADDR_SHIFT                                                 2
+#define NV40_FP_OP_IADDR_MASK                                        (0xFF << 2)
+
+/* SRC1 REP
+ *   I have no idea why there are 3 count values here..  but they
+ *   have always been filled with the same value in my tests so
+ *   far..
+ */
+#define NV40_FP_OP_REP_COUNT1_SHIFT                                            2
+#define NV40_FP_OP_REP_COUNT1_MASK                                   (0xFF << 2)
+#define NV40_FP_OP_REP_COUNT2_SHIFT                                           10
+#define NV40_FP_OP_REP_COUNT2_MASK                                  (0xFF << 10)
+#define NV40_FP_OP_REP_COUNT3_SHIFT                                           19
+#define NV40_FP_OP_REP_COUNT3_MASK                                  (0xFF << 19)
+
+/* SRC2 REP/IF */
+#define NV40_FP_OP_END_ID_SHIFT                                                2
+#define NV40_FP_OP_END_ID_MASK                                       (0xFF << 2)
+
+// SRC2 high-order
+#define NV40_FP_OP_INDEX_INPUT                                         (1 << 30)
+#define NV40_FP_OP_ADDR_INDEX_SHIFT                                           19
+#define NV40_FP_OP_ADDR_INDEX_MASK                                   (0xF << 19)
+
+//== Register selection ==
+#define NV40_FP_REG_TYPE_SHIFT                                                 0
+#define NV40_FP_REG_TYPE_MASK                                           (3 << 0)
+#        define NV40_FP_REG_TYPE_TEMP                                          0
+#        define NV40_FP_REG_TYPE_INPUT                                         1
+#        define NV40_FP_REG_TYPE_CONST                                         2
+#define NV40_FP_REG_SRC_SHIFT                                                  2
+#define NV40_FP_REG_SRC_MASK                                           (31 << 2)
+#define NV40_FP_REG_UNK_0                                               (1 << 8)
+#define NV40_FP_REG_SWZ_ALL_SHIFT                                              9
+#define NV40_FP_REG_SWZ_ALL_MASK                                      (255 << 9)
+#define NV40_FP_REG_SWZ_X_SHIFT                                                9
+#define NV40_FP_REG_SWZ_X_MASK                                          (3 << 9)
+#define NV40_FP_REG_SWZ_Y_SHIFT                                               11
+#define NV40_FP_REG_SWZ_Y_MASK                                         (3 << 11)
+#define NV40_FP_REG_SWZ_Z_SHIFT                                               13
+#define NV40_FP_REG_SWZ_Z_MASK                                         (3 << 13)
+#define NV40_FP_REG_SWZ_W_SHIFT                                               15
+#define NV40_FP_REG_SWZ_W_MASK                                         (3 << 15)
+#        define NV40_FP_SWIZZLE_X                                              0
+#        define NV40_FP_SWIZZLE_Y                                              1
+#        define NV40_FP_SWIZZLE_Z                                              2
+#        define NV40_FP_SWIZZLE_W                                              3
+#define NV40_FP_REG_NEGATE                                             (1 << 17)
+
+/* Description of one shader register operand together with its modifiers.
+ * Instances are created with nv40_sr() and adjusted by the nv40_sr_*()
+ * helpers below.
+ */
+struct nv40_sreg {
+	int output;	/* 'out' argument of nv40_sr() */
+	int type;	/* register file selector */
+	int index;	/* register number within that file */
+
+	int dst_scale;	/* result scale; nv40_sr() defaults it to DEF_SCALE */
+
+	int negate;	/* toggled by nv40_sr_neg() */
+	int abs;	/* set by nv40_sr_abs() */
+	int swz[4];	/* per-component selectors; identity is { 0, 1, 2, 3 } */
+
+	/* Condition-code fields; nv40_sr() defaults cc_test to DEF_CTEST and
+	 * cc_swz to the identity swizzle.
+	 */
+	int cc_update;
+	int cc_update_reg;
+	int cc_test;
+	int cc_test_reg;
+	int cc_swz[4];
+};
+
+/* Build a register operand with default modifiers: identity swizzles,
+ * no negate/abs, DEF_SCALE result scale, DEF_CTEST condition test and
+ * condition-code updates disabled.
+ */
+static INLINE struct nv40_sreg
+nv40_sr(int out, int type, int index)
+{
+	struct nv40_sreg reg;
+	int c;
+
+	reg.output = out;
+	reg.type = type;
+	reg.index = index;
+
+	reg.dst_scale = DEF_SCALE;
+	reg.negate = 0;
+	reg.abs = 0;
+
+	reg.cc_update = 0;
+	reg.cc_update_reg = 0;
+	reg.cc_test = DEF_CTEST;
+	reg.cc_test_reg = 0;
+
+	/* identity swizzle for both the operand and the condition code */
+	for (c = 0; c < 4; c++) {
+		reg.swz[c] = c;
+		reg.cc_swz[c] = c;
+	}
+
+	return reg;
+}
+
+/* Return a copy of 'src' with an additional swizzle applied on top of the
+ * one it already carries: each new component selects one of the existing
+ * src.swz[] entries.
+ */
+static INLINE struct nv40_sreg
+nv40_sr_swz(struct nv40_sreg src, int x, int y, int z, int w)
+{
+	/* Read all selectors first; 'src' is then updated in place. */
+	const int sel_x = src.swz[x];
+	const int sel_y = src.swz[y];
+	const int sel_z = src.swz[z];
+	const int sel_w = src.swz[w];
+
+	src.swz[SWZ_X] = sel_x;
+	src.swz[SWZ_Y] = sel_y;
+	src.swz[SWZ_Z] = sel_z;
+	src.swz[SWZ_W] = sel_w;
+	return src;
+}
+
+/* Return a copy of 'src' with its negate flag toggled. */
+static INLINE struct nv40_sreg
+nv40_sr_neg(struct nv40_sreg src)
+{
+	struct nv40_sreg flipped = src;
+
+	flipped.negate = src.negate ? 0 : 1;
+	return flipped;
+}
+
+/* Return a copy of 'src' with the absolute-value modifier set. */
+static INLINE struct nv40_sreg
+nv40_sr_abs(struct nv40_sreg src)
+{
+	struct nv40_sreg result = src;
+
+	result.abs = 1;
+	return result;
+}
+
+/* Return a copy of 'src' whose dst_scale is replaced by 'scale'. */
+static INLINE struct nv40_sreg
+nv40_sr_scale(struct nv40_sreg src, int scale)
+{
+	struct nv40_sreg result = src;
+
+	result.dst_scale = scale;
+	return result;
+}
+
+#endif
diff --git a/src/mesa/pipe/nv40/nv40_state.c b/src/mesa/pipe/nv40/nv40_state.c
new file mode 100644
index 0000000000..e38a5ea534
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_state.c
@@ -0,0 +1,674 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_util.h"
+
+#include "nv40_context.h"
+#include "nv40_dma.h"
+#include "nv40_state.h"
+
+#include "nvgl_pipe.h"
+
+/* Translate a pipe alpha-test CSO into the values pushed by
+ * nv40_alpha_test_state_bind().
+ *
+ * Returns the new state object, or NULL if allocation fails.  The caller
+ * releases it with nv40_alpha_test_state_delete().
+ */
+static void *
+nv40_alpha_test_state_create(struct pipe_context *pipe,
+			     const struct pipe_alpha_test_state *cso)
+{
+	struct nv40_alpha_test_state *at;
+
+	/* calloc keeps func/ref well defined when the test is disabled */
+	at = calloc(1, sizeof(*at));
+	if (!at)
+		return NULL;
+
+	at->enabled = cso->enabled ? 1 : 0;
+	if (at->enabled) {
+		at->func = nvgl_comparison_op(cso->func);
+		at->ref  = float_to_ubyte(cso->ref);
+	}
+
+	return (void *)at;
+}
+
+/* Push the alpha-test state.  When the test is enabled, enable/func/ref
+ * go out as one three-word burst starting at ALPHA_TEST_ENABLE;
+ * otherwise only the enable word is written, as zero.
+ */
+static void
+nv40_alpha_test_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	struct nv40_alpha_test_state *state = hwcso;
+
+	if (!state->enabled) {
+		BEGIN_RING(curie, NV40TCL_ALPHA_TEST_ENABLE, 1);
+		OUT_RING  (0);
+		return;
+	}
+
+	BEGIN_RING(curie, NV40TCL_ALPHA_TEST_ENABLE, 3);
+	OUT_RING  (state->enabled);
+	OUT_RING  (state->func);
+	OUT_RING  (state->ref);
+}
+
+/* Release a state object made by nv40_alpha_test_state_create(). */
+static void
+nv40_alpha_test_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	(void)pipe;
+
+	free(hwcso);
+}
+
+/* Translate a pipe blend CSO (blending, logic op, colour mask, dither)
+ * into the values pushed by nv40_blend_state_bind().
+ *
+ * Returns the new state object, or NULL if allocation fails.
+ */
+static void *
+nv40_blend_state_create(struct pipe_context *pipe,
+			const struct pipe_blend_state *cso)
+{
+	struct nv40_blend_state *cb;
+
+	/* calloc so fields skipped below never hold indeterminate values */
+	cb = calloc(1, sizeof(*cb));
+	if (!cb)
+		return NULL;
+
+	cb->b_enable = cso->blend_enable ? 1 : 0;
+	if (cb->b_enable) {
+		/* alpha factor/equation in the high half, RGB in the low */
+		cb->b_srcfunc = ((nvgl_blend_func(cso->alpha_src_factor)<<16) |
+				 (nvgl_blend_func(cso->rgb_src_factor)));
+		cb->b_dstfunc = ((nvgl_blend_func(cso->alpha_dst_factor)<<16) |
+				 (nvgl_blend_func(cso->rgb_dst_factor)));
+		cb->b_eqn = ((nvgl_blend_eqn(cso->alpha_func) << 16) |
+			     (nvgl_blend_eqn(cso->rgb_func)));
+	}
+
+	cb->l_enable = cso->logicop_enable ? 1 : 0;
+	if (cb->l_enable) {
+		cb->l_op = nvgl_logicop_func(cso->logicop_func);
+	}
+
+	/* one enable bit per channel, A/R/G/B in bytes 3..0 */
+	cb->c_mask = (((cso->colormask & PIPE_MASK_A) ? (0x01<<24) : 0) |
+		      ((cso->colormask & PIPE_MASK_R) ? (0x01<<16) : 0) |
+		      ((cso->colormask & PIPE_MASK_G) ? (0x01<< 8) : 0) |
+		      ((cso->colormask & PIPE_MASK_B) ? (0x01<< 0) : 0));
+
+	cb->d_enable = cso->dither ? 1 : 0;
+
+	return (void *)cb;
+}
+
+/* Push dither, blending, colour mask and logic-op state.
+ *
+ * The colour mask is independent of blending, so it is written on both
+ * paths; previously it was only emitted when blending was enabled.
+ */
+static void
+nv40_blend_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	struct nv40_blend_state *cb = hwcso;
+
+	BEGIN_RING(curie, NV40TCL_DITHER_ENABLE, 1);
+	OUT_RING  (cb->d_enable);
+
+	if (cb->b_enable) {
+		BEGIN_RING(curie, NV40TCL_BLEND_ENABLE, 3);
+		OUT_RING  (cb->b_enable);
+		OUT_RING  (cb->b_srcfunc);
+		OUT_RING  (cb->b_dstfunc);
+		/* second word of this burst lands on the colour mask */
+		BEGIN_RING(curie, NV40TCL_BLEND_EQUATION, 2);
+		OUT_RING  (cb->b_eqn);
+		OUT_RING  (cb->c_mask);
+	} else {
+		BEGIN_RING(curie, NV40TCL_BLEND_ENABLE, 1);
+		OUT_RING  (0);
+		/* NOTE(review): assumes NV40TCL_COLOR_MASK names the method
+		 * following BLEND_EQUATION, which the burst above relies on.
+		 */
+		BEGIN_RING(curie, NV40TCL_COLOR_MASK, 1);
+		OUT_RING  (cb->c_mask);
+	}
+
+	if (cb->l_enable) {
+		BEGIN_RING(curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 2);
+		OUT_RING  (cb->l_enable);
+		OUT_RING  (cb->l_op);
+	} else {
+		BEGIN_RING(curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 1);
+		OUT_RING  (0);
+	}
+}
+
+/* Release a state object made by nv40_blend_state_create(). */
+static void
+nv40_blend_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	(void)pipe;
+
+	free(hwcso);
+}
+
+/* Translate a pipe sampler CSO into packed wrap, filter and border-colour
+ * words.
+ *
+ * Returns the new state object, or NULL if allocation fails.
+ */
+static void *
+nv40_sampler_state_create(struct pipe_context *pipe,
+			  const struct pipe_sampler_state *cso)
+{
+	struct nv40_sampler_state *ps;
+
+	ps = malloc(sizeof(*ps));
+	if (!ps)
+		return NULL;
+
+	/* wrap modes: R in bits 16+, T in bits 8+, S in bits 0+ */
+	ps->wrap = ((nv40_tex_wrap_mode(cso->wrap_r) << 16) |
+		    (nv40_tex_wrap_mode(cso->wrap_t) <<  8) |
+		    (nv40_tex_wrap_mode(cso->wrap_s) <<  0));
+	/* min filter (with mip filter) at bit 16, mag filter at bit 24 */
+	ps->filt = ((nv40_tex_filter(cso->min_img_filter,
+				     cso->min_mip_filter) << 16) |
+		    (nv40_tex_filter(cso->mag_img_filter,
+				     PIPE_TEX_MIPFILTER_NONE) << 24));
+	/* border colour: components 3,0,1,2 in bytes 3..0 */
+	ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) |
+		    (float_to_ubyte(cso->border_color[0]) << 16) |
+		    (float_to_ubyte(cso->border_color[1]) <<  8) |
+		    (float_to_ubyte(cso->border_color[2]) <<  0));
+
+	return (void *)ps;
+}
+
+/* Record the sampler for texture unit 'unit' and flag that unit, and the
+ * texture state as a whole, as dirty.  Nothing is pushed here.
+ */
+static void
+nv40_sampler_state_bind(struct pipe_context *pipe, unsigned unit,
+			void *hwcso)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+
+	nv40->tex_sampler[unit] = (struct nv40_sampler_state *)hwcso;
+	nv40->tex_dirty |= (1 << unit);
+	nv40->dirty |= NV40_NEW_TEXTURE;
+}
+
+/* Release a state object made by nv40_sampler_state_create(). */
+static void
+nv40_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	(void)pipe;
+
+	free(hwcso);
+}
+
+/* Translate a pipe rasterizer CSO into the values pushed by
+ * nv40_rasterizer_state_bind().
+ *
+ * Returns the new state object, or NULL if allocation fails.
+ */
+static void *
+nv40_rasterizer_state_create(struct pipe_context *pipe,
+			     const struct pipe_rasterizer_state *cso)
+{
+	struct nv40_rasterizer_state *rs;
+	union {
+		float f;
+		uint32_t u;
+	} point_size;
+	double line_width;
+
+	/*XXX: ignored:
+	 * 	light_twoside
+	 * 	offset_cw/ccw -nohw
+	 * 	scissor
+	 * 	point_smooth -nohw
+	 * 	multisample
+	 * 	offset_units / offset_scale
+	 */
+	rs = malloc(sizeof(*rs));
+	if (!rs)
+		return NULL;
+
+	rs->shade_model = cso->flatshade ? 0x1d00 : 0x1d01;
+
+	/* Width is stored in 1/8 units in one byte.  Clamp first: converting
+	 * an out-of-range (or NaN) value to an integer type is undefined.
+	 */
+	line_width = cso->line_width * 8.0;
+	if (!(line_width > 0.0))
+		line_width = 0.0;
+	else if (line_width > 255.0)
+		line_width = 255.0;
+	rs->line_width = (uint32_t)line_width;
+	rs->line_smooth_en = cso->line_smooth ? 1 : 0;
+	rs->line_stipple_en = cso->line_stipple_enable ? 1 : 0;
+	rs->line_stipple = (cso->line_stipple_pattern << 16) |
+			    cso->line_stipple_factor;
+
+	/* The raw float bits are pushed; pun through a union rather than a
+	 * pointer cast, which breaks the aliasing rules.
+	 */
+	point_size.f = cso->point_size;
+	rs->point_size = point_size.u;
+
+	rs->poly_smooth_en = cso->poly_smooth ? 1 : 0;
+	rs->poly_stipple_en = cso->poly_stipple_enable ? 1 : 0;
+
+	/* fill modes are given per winding; map them onto front/back */
+	if (cso->front_winding == PIPE_WINDING_CCW) {
+		rs->front_face = 0x0901;
+		rs->poly_mode_front = nvgl_polygon_mode(cso->fill_ccw);
+		rs->poly_mode_back  = nvgl_polygon_mode(cso->fill_cw);
+	} else {
+		rs->front_face = 0x0900;
+		rs->poly_mode_front = nvgl_polygon_mode(cso->fill_cw);
+		rs->poly_mode_back  = nvgl_polygon_mode(cso->fill_ccw);
+	}
+
+	rs->cull_face_en = 0;
+	rs->cull_face    = 0x0900;
+	switch (cso->cull_mode) {
+	case PIPE_WINDING_CCW:
+		rs->cull_face = 0x0901;
+		/* fall-through */
+	case PIPE_WINDING_CW:
+		rs->cull_face_en = 1;
+		break;
+	case PIPE_WINDING_NONE:
+	default:
+		break;
+	}
+
+	return (void *)rs;
+}
+
+/* Push the rasterizer state: shade model, line width/smoothing/stipple,
+ * point size, the six-word polygon burst starting at POLYGON_MODE_FRONT,
+ * and the polygon stipple enable.
+ */
+static void
+nv40_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	struct nv40_rasterizer_state *rast = hwcso;
+
+	/* shading */
+	BEGIN_RING(curie, NV40TCL_SHADE_MODEL, 1);
+	OUT_RING  (rast->shade_model);
+
+	/* lines */
+	BEGIN_RING(curie, NV40TCL_LINE_WIDTH, 2);
+	OUT_RING  (rast->line_width);
+	OUT_RING  (rast->line_smooth_en);
+	BEGIN_RING(curie, NV40TCL_LINE_STIPPLE_ENABLE, 2);
+	OUT_RING  (rast->line_stipple_en);
+	OUT_RING  (rast->line_stipple);
+
+	/* points */
+	BEGIN_RING(curie, NV40TCL_POINT_SIZE, 1);
+	OUT_RING  (rast->point_size);
+
+	/* polygons: the word order within the burst is significant */
+	BEGIN_RING(curie, NV40TCL_POLYGON_MODE_FRONT, 6);
+	OUT_RING  (rast->poly_mode_front);
+	OUT_RING  (rast->poly_mode_back);
+	OUT_RING  (rast->cull_face);
+	OUT_RING  (rast->front_face);
+	OUT_RING  (rast->poly_smooth_en);
+	OUT_RING  (rast->cull_face_en);
+
+	BEGIN_RING(curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
+	OUT_RING  (rast->poly_stipple_en);
+}
+
+/* Release a state object made by nv40_rasterizer_state_create(). */
+static void
+nv40_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	(void)pipe;
+
+	free(hwcso);
+}
+
+/* Translate a pipe depth/stencil CSO into the word blocks pushed by
+ * nv40_depth_stencil_state_bind().  Index 0 of the per-face arrays in the
+ * CSO feeds the front face, index 1 the back face.
+ *
+ * Returns the new state object, or NULL if allocation fails.
+ */
+static void *
+nv40_depth_stencil_state_create(struct pipe_context *pipe,
+				const struct pipe_depth_stencil_state *cso)
+{
+	struct nv40_depth_stencil_state *zs;
+
+	/*XXX: ignored:
+	 * 	depth.occlusion_count
+	 * 	depth.clear
+	 * 	stencil.clear_value
+	 */
+	zs = malloc(sizeof(*zs));
+	if (!zs)
+		return NULL;
+
+	zs->depth.func		= nvgl_comparison_op(cso->depth.func);
+	zs->depth.write_enable	= cso->depth.writemask ? 1 : 0;
+	zs->depth.test_enable	= cso->depth.enabled ? 1 : 0;
+
+	zs->stencil.back.enable	= cso->stencil.back_enabled ? 1 : 0;
+	zs->stencil.back.wmask	= cso->stencil.write_mask[1];
+	zs->stencil.back.func	=
+		nvgl_comparison_op(cso->stencil.back_func);
+	zs->stencil.back.ref	= cso->stencil.ref_value[1];
+	zs->stencil.back.vmask	= cso->stencil.value_mask[1];
+	zs->stencil.back.fail	= nvgl_stencil_op(cso->stencil.back_fail_op);
+	zs->stencil.back.zfail	= nvgl_stencil_op(cso->stencil.back_zfail_op);
+	zs->stencil.back.zpass	= nvgl_stencil_op(cso->stencil.back_zpass_op);
+
+	zs->stencil.front.enable= cso->stencil.front_enabled ? 1 : 0;
+	zs->stencil.front.wmask	= cso->stencil.write_mask[0];
+	zs->stencil.front.func	=
+		nvgl_comparison_op(cso->stencil.front_func);
+	zs->stencil.front.ref	= cso->stencil.ref_value[0];
+	zs->stencil.front.vmask	= cso->stencil.value_mask[0];
+	zs->stencil.front.fail	= nvgl_stencil_op(cso->stencil.front_fail_op);
+	zs->stencil.front.zfail	= nvgl_stencil_op(cso->stencil.front_zfail_op);
+	zs->stencil.front.zpass	= nvgl_stencil_op(cso->stencil.front_zpass_op);
+
+	return (void *)zs;
+}
+
+/* Push the depth block (3 words from DEPTH_FUNC) and then the back and
+ * front stencil blocks (8 words each, one 16-word burst starting at
+ * STENCIL_BACK_ENABLE).  The structures are copied to the ring verbatim.
+ */
+static void
+nv40_depth_stencil_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	struct nv40_depth_stencil_state *state = hwcso;
+	uint32_t *depth_words = (uint32_t *)&state->depth;
+	uint32_t *back_words  = (uint32_t *)&state->stencil.back;
+	uint32_t *front_words = (uint32_t *)&state->stencil.front;
+
+	BEGIN_RING(curie, NV40TCL_DEPTH_FUNC, 3);
+	OUT_RINGp (depth_words, 3);
+
+	BEGIN_RING(curie, NV40TCL_STENCIL_BACK_ENABLE, 16);
+	OUT_RINGp (back_words, 8);
+	OUT_RINGp (front_words, 8);
+}
+
+/* Release a state object made by nv40_depth_stencil_state_create(). */
+static void
+nv40_depth_stencil_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	(void)pipe;
+
+	free(hwcso);
+}
+
+/* Create a zero-initialised vertex program wrapper around 'cso'.  Only the
+ * pointer is stored, so 'cso' must outlive the returned object.
+ *
+ * Returns the new object, or NULL if allocation fails.
+ */
+static void *
+nv40_vp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv40_vertex_program *vp;
+
+	vp = calloc(1, sizeof(*vp));
+	if (!vp)
+		return NULL;
+	vp->pipe = cso;
+
+	return (void *)vp;
+}
+
+/* Make 'hwcso' the current vertex program and flag it for re-validation. */
+static void
+nv40_vp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+
+	nv40->vertprog.vp = (struct nv40_vertex_program *)hwcso;
+	nv40->dirty |= NV40_NEW_VERTPROG;
+}
+
+/* Release the wrapper made by nv40_vp_state_create().  Only the wrapper
+ * itself is freed here.
+ */
+static void
+nv40_vp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	(void)pipe;
+
+	free(hwcso);
+}
+
+/* Create a zero-initialised fragment program wrapper around 'cso'.  Only
+ * the pointer is stored, so 'cso' must outlive the returned object.
+ *
+ * Returns the new object, or NULL if allocation fails.
+ */
+static void *
+nv40_fp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv40_fragment_program *fp;
+
+	fp = calloc(1, sizeof(*fp));
+	if (!fp)
+		return NULL;
+	fp->pipe = cso;
+
+	return (void *)fp;
+}
+
+/* Make 'hwcso' the current fragment program and flag it for re-validation. */
+static void
+nv40_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+
+	nv40->fragprog.fp = (struct nv40_fragment_program *)hwcso;
+	nv40->dirty |= NV40_NEW_FRAGPROG;
+}
+
+/* Release the wrapper made by nv40_fp_state_create().  Only the wrapper
+ * itself is freed here.
+ */
+static void
+nv40_fp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	(void)pipe;
+
+	free(hwcso);
+}
+
+/* Push the constant blend colour as one packed word: colour components
+ * 3, 0, 1, 2 converted to bytes and placed in bytes 3..0.
+ */
+static void
+nv40_set_blend_color(struct pipe_context *pipe,
+		     const struct pipe_blend_color *bcol)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	const float *rgba = bcol->color;
+
+	BEGIN_RING(curie, NV40TCL_BLEND_COLOR, 1);
+	OUT_RING  ((float_to_ubyte(rgba[3]) << 24) |
+		   (float_to_ubyte(rgba[0]) << 16) |
+		   (float_to_ubyte(rgba[1]) <<  8) |
+		   (float_to_ubyte(rgba[2]) <<  0));
+}
+
+/* The clip state itself is not stored or pushed here; the call only marks
+ * the vertex program as needing re-validation.
+ */
+static void
+nv40_set_clip_state(struct pipe_context *pipe,
+		    const struct pipe_clip_state *clip)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+
+	(void)clip;
+
+	nv40->dirty |= NV40_NEW_VERTPROG;
+}
+
+/* Push the colour clear value as one packed word: colour components
+ * 3, 0, 1, 2 converted to bytes and placed in bytes 3..0.
+ */
+static void
+nv40_set_clear_color_state(struct pipe_context *pipe,
+			   const struct pipe_clear_color_state *ccol)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	const float *rgba = ccol->color;
+
+	BEGIN_RING(curie, NV40TCL_CLEAR_VALUE_COLOR, 1);
+	OUT_RING  ((float_to_ubyte(rgba[3]) << 24) |
+		   (float_to_ubyte(rgba[0]) << 16) |
+		   (float_to_ubyte(rgba[1]) <<  8) |
+		   (float_to_ubyte(rgba[2]) <<  0));
+}
+
+/* Remember the constant buffer for the given shader stage and mark that
+ * stage's program dirty.  Other stage values are ignored; 'index' is
+ * unused.
+ */
+static void
+nv40_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
+			 const struct pipe_constant_buffer *buf )
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+
+	switch (shader) {
+	case PIPE_SHADER_VERTEX:
+		nv40->vertprog.constant_buf = buf->buffer;
+		nv40->dirty |= NV40_NEW_VERTPROG;
+		break;
+	case PIPE_SHADER_FRAGMENT:
+		nv40->fragprog.constant_buf = buf->buffer;
+		nv40->dirty |= NV40_NEW_FRAGPROG;
+		break;
+	default:
+		break;
+	}
+}
+
+/* Feedback is not implemented: the call only reports through
+ * NOUVEAU_ERR.
+ */
+static void
+nv40_set_feedback_state(struct pipe_context *pipe,
+			const struct pipe_feedback_state *feedback)
+{
+	(void)pipe;
+	(void)feedback;
+
+	NOUVEAU_ERR("\n");
+}
+
+/* Region backing a surface, or NULL when the surface pointer is NULL. */
+#define get_region(surf) ((surf) ? (surf)->region : NULL)
+
+/* Program the render targets: up to four colour buffers, the depth buffer,
+ * the render-target enable mask and format, and the surface/viewport
+ * extents.
+ *
+ * Every attachment is optional.  Extents come from colour buffer 0, or
+ * from the depth buffer when that is absent, so a framebuffer without
+ * colour buffer 0 no longer dereferences a NULL surface.
+ */
+static void
+nv40_set_framebuffer_state(struct pipe_context *pipe,
+			   const struct pipe_framebuffer_state *fb)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	struct nouveau_winsys *nvws = nv40->nvws;
+	struct pipe_region *region;
+	uint32_t rt_enable = 0, rt_format = 0;
+	uint32_t width = 0, height = 0;
+
+	/* Colour 0: pitch and offset share one two-word burst (pitch first) */
+	if ((region = get_region(fb->cbufs[0]))) {
+		rt_enable |= NV40TCL_RT_ENABLE_COLOR0;
+
+		BEGIN_RING(curie, NV40TCL_DMA_COLOR0, 1);
+		OUT_RELOCo(region->buffer, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(curie, NV40TCL_COLOR0_PITCH, 2);
+		OUT_RING  (region->pitch * region->cpp);
+		OUT_RELOCl(region->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	}
+
+	/* Colour 1: offset and pitch share one burst (offset first) */
+	if ((region = get_region(fb->cbufs[1]))) {
+		rt_enable |= NV40TCL_RT_ENABLE_COLOR1;
+
+		BEGIN_RING(curie, NV40TCL_DMA_COLOR1, 1);
+		OUT_RELOCo(region->buffer, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(curie, NV40TCL_COLOR1_OFFSET, 2);
+		OUT_RELOCl(region->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		OUT_RING  (region->pitch * region->cpp);
+	}
+
+	/* Colours 2 and 3: offset and pitch are written separately */
+	if ((region = get_region(fb->cbufs[2]))) {
+		rt_enable |= NV40TCL_RT_ENABLE_COLOR2;
+
+		BEGIN_RING(curie, NV40TCL_DMA_COLOR2, 1);
+		OUT_RELOCo(region->buffer, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(curie, NV40TCL_COLOR2_OFFSET, 1);
+		OUT_RELOCl(region->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(curie, NV40TCL_COLOR2_PITCH, 1);
+		OUT_RING  (region->pitch * region->cpp);
+	}
+
+	if ((region = get_region(fb->cbufs[3]))) {
+		rt_enable |= NV40TCL_RT_ENABLE_COLOR3;
+
+		BEGIN_RING(curie, NV40TCL_DMA_COLOR3, 1);
+		OUT_RELOCo(region->buffer, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(curie, NV40TCL_COLOR3_OFFSET, 1);
+		OUT_RELOCl(region->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(curie, NV40TCL_COLOR3_PITCH, 1);
+		OUT_RING  (region->pitch * region->cpp);
+	}
+
+	/* Depth buffer: relocated for both reading and writing */
+	if ((region = get_region(fb->zbuf))) {
+		BEGIN_RING(curie, NV40TCL_DMA_ZETA, 1);
+		OUT_RELOCo(region->buffer,
+			   NOUVEAU_BO_VRAM | NOUVEAU_BO_WR | NOUVEAU_BO_RD);
+		BEGIN_RING(curie, NV40TCL_ZETA_OFFSET, 1);
+		OUT_RELOCl(region->buffer, 0,
+			   NOUVEAU_BO_VRAM | NOUVEAU_BO_WR | NOUVEAU_BO_RD);
+		BEGIN_RING(curie, NV40TCL_ZETA_PITCH, 1);
+		OUT_RING  (region->pitch * region->cpp);
+	}
+
+	/* The MRT flag is set whenever a buffer beyond colour 0 is bound */
+	if (rt_enable & (NV40TCL_RT_ENABLE_COLOR1 | NV40TCL_RT_ENABLE_COLOR2 |
+			 NV40TCL_RT_ENABLE_COLOR3))
+		rt_enable |= NV40TCL_RT_ENABLE_MRT;
+	BEGIN_RING(curie, NV40TCL_RT_ENABLE, 1);
+	OUT_RING  (rt_enable);
+
+	/* Only linear targets are programmed.  A swizzled layout would also
+	 * need log2 width/height fields in rt_format; not implemented.
+	 */
+	rt_format |= (NV40TCL_RT_FORMAT_TYPE_LINEAR <<
+		      NV40TCL_RT_FORMAT_TYPE_SHIFT);
+
+	if (fb->cbufs[0] && fb->cbufs[0]->format == PIPE_FORMAT_U_R5_G6_B5) {
+		rt_format |= (NV40TCL_RT_FORMAT_COLOR_R5G6B5 <<
+			      NV40TCL_RT_FORMAT_COLOR_SHIFT);
+	} else {
+		rt_format |= (NV40TCL_RT_FORMAT_COLOR_A8R8G8B8 <<
+			      NV40TCL_RT_FORMAT_COLOR_SHIFT);
+	}
+
+	if (fb->zbuf && fb->zbuf->format == PIPE_FORMAT_U_Z16) {
+		rt_format |= (NV40TCL_RT_FORMAT_DEPTH_Z16 <<
+			      NV40TCL_RT_FORMAT_DEPTH_SHIFT);
+	} else {
+		rt_format |= (NV40TCL_RT_FORMAT_DEPTH_Z24S8 <<
+			      NV40TCL_RT_FORMAT_DEPTH_SHIFT);
+	}
+
+	/* Extents: colour 0 if bound, otherwise the depth buffer */
+	if (fb->cbufs[0]) {
+		width  = fb->cbufs[0]->width;
+		height = fb->cbufs[0]->height;
+	} else if (fb->zbuf) {
+		width  = fb->zbuf->width;
+		height = fb->zbuf->height;
+	}
+
+	/* Size in the high half-word, origin (0) in the low half-word */
+	BEGIN_RING(curie, NV40TCL_RT_HORIZ, 3);
+	OUT_RING  ((width  << 16) | 0);
+	OUT_RING  ((height << 16) | 0);
+	OUT_RING  (rt_format);
+	BEGIN_RING(curie, NV40TCL_VIEWPORT_HORIZ, 2);
+	OUT_RING  ((width  << 16) | 0);
+	OUT_RING  ((height << 16) | 0);
+	/* clip rectangle takes size - 1; avoid wrapping when nothing is bound */
+	BEGIN_RING(curie, NV40TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	OUT_RING  (((width  ? width  - 1 : 0) << 16) | 0);
+	OUT_RING  (((height ? height - 1 : 0) << 16) | 0);
+}
+
+/* Push the 32-word polygon stipple pattern in one burst. */
+static void
+nv40_set_polygon_stipple(struct pipe_context *pipe,
+			 const struct pipe_poly_stipple *stipple)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	uint32_t *pattern = (uint32_t *)stipple->stipple;
+
+	BEGIN_RING(curie, NV40TCL_POLYGON_STIPPLE_PATTERN(0), 32);
+	OUT_RINGp (pattern, 32);
+}
+
+/* Intentionally a no-op: the sampler unit mapping is ignored. */
+static void
+nv40_set_sampler_units(struct pipe_context *pipe,
+		       uint num_samplers, const uint *units)
+{
+	(void)pipe;
+	(void)num_samplers;
+	(void)units;
+}
+
+/* Push the scissor rectangle.  Each word carries the extent (max - min)
+ * in the high half and the minimum coordinate in the low half.
+ */
+static void
+nv40_set_scissor_state(struct pipe_context *pipe,
+		       const struct pipe_scissor_state *s)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	const uint32_t horiz = ((s->maxx - s->minx) << 16) | s->minx;
+	const uint32_t vert  = ((s->maxy - s->miny) << 16) | s->miny;
+
+	BEGIN_RING(curie, NV40TCL_SCISSOR_HORIZ, 2);
+	OUT_RING  (horiz);
+	OUT_RING  (vert);
+}
+
+/* Record the mipmap tree for texture unit 'unit' and flag that unit, and
+ * the texture state as a whole, as dirty.  Nothing is pushed here.
+ */
+static void
+nv40_set_texture_state(struct pipe_context *pipe, unsigned unit,
+		       struct pipe_mipmap_tree *miptree)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+
+	nv40->tex_miptree[unit]  = miptree;
+	/* tex_dirty is a per-unit bitmask (see nv40_sampler_state_bind);
+	 * OR-ing in the raw unit number set the wrong bits, and none at all
+	 * for unit 0.
+	 */
+	nv40->tex_dirty         |= (1 << unit);
+
+	nv40->dirty |= NV40_NEW_TEXTURE;
+}
+
+/* Push the viewport transform as one eight-float burst: the four
+ * translate components followed by the four scale components.
+ */
+static void
+nv40_set_viewport_state(struct pipe_context *pipe,
+			const struct pipe_viewport_state *vpt)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	int c;
+
+	BEGIN_RING(curie, NV40TCL_VIEWPORT_TRANSLATE_X, 8);
+	for (c = 0; c < 4; c++)
+		OUT_RINGf (vpt->translate[c]);
+	for (c = 0; c < 4; c++)
+		OUT_RINGf (vpt->scale[c]);
+}
+
+/* Copy the vertex buffer description into slot 'index' and mark the
+ * vertex arrays dirty.
+ */
+static void
+nv40_set_vertex_buffer(struct pipe_context *pipe, unsigned index,
+		       const struct pipe_vertex_buffer *vb)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	struct pipe_vertex_buffer *slot = &nv40->vtxbuf[index];
+
+	*slot = *vb;
+	nv40->dirty |= NV40_NEW_ARRAYS;
+}
+
+/* Copy the vertex element description into slot 'index' and mark the
+ * vertex arrays dirty.
+ */
+static void
+nv40_set_vertex_element(struct pipe_context *pipe, unsigned index,
+			const struct pipe_vertex_element *ve)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	struct pipe_vertex_element *slot = &nv40->vtxelt[index];
+
+	*slot = *ve;
+	nv40->dirty |= NV40_NEW_ARRAYS;
+}
+
+/* Feedback buffers are not implemented: the call only reports through
+ * NOUVEAU_ERR.
+ */
+static void
+nv40_set_feedback_buffer(struct pipe_context *pipe, unsigned index,
+			 const struct pipe_feedback_buffer *fbb)
+{
+	(void)pipe;
+	(void)index;
+	(void)fbb;
+
+	NOUVEAU_ERR("\n");
+}
+
+/* Install this file's state callbacks into the context's pipe function
+ * table.  The two feedback hooks are deliberately left unset.
+ */
+void
+nv40_init_state_functions(struct nv40_context *nv40)
+{
+	struct pipe_context *pipe = &nv40->pipe;
+
+	/* constant state objects: create / bind / delete triples */
+	pipe->create_alpha_test_state = nv40_alpha_test_state_create;
+	pipe->bind_alpha_test_state   = nv40_alpha_test_state_bind;
+	pipe->delete_alpha_test_state = nv40_alpha_test_state_delete;
+
+	pipe->create_blend_state = nv40_blend_state_create;
+	pipe->bind_blend_state   = nv40_blend_state_bind;
+	pipe->delete_blend_state = nv40_blend_state_delete;
+
+	pipe->create_sampler_state = nv40_sampler_state_create;
+	pipe->bind_sampler_state   = nv40_sampler_state_bind;
+	pipe->delete_sampler_state = nv40_sampler_state_delete;
+
+	pipe->create_rasterizer_state = nv40_rasterizer_state_create;
+	pipe->bind_rasterizer_state   = nv40_rasterizer_state_bind;
+	pipe->delete_rasterizer_state = nv40_rasterizer_state_delete;
+
+	pipe->create_depth_stencil_state = nv40_depth_stencil_state_create;
+	pipe->bind_depth_stencil_state   = nv40_depth_stencil_state_bind;
+	pipe->delete_depth_stencil_state = nv40_depth_stencil_state_delete;
+
+	pipe->create_vs_state = nv40_vp_state_create;
+	pipe->bind_vs_state   = nv40_vp_state_bind;
+	pipe->delete_vs_state = nv40_vp_state_delete;
+
+	pipe->create_fs_state = nv40_fp_state_create;
+	pipe->bind_fs_state   = nv40_fp_state_bind;
+	pipe->delete_fs_state = nv40_fp_state_delete;
+
+	/* immediate state setters */
+	pipe->set_blend_color       = nv40_set_blend_color;
+	pipe->set_clip_state        = nv40_set_clip_state;
+	pipe->set_clear_color_state = nv40_set_clear_color_state;
+	pipe->set_constant_buffer   = nv40_set_constant_buffer;
+	/* set_feedback_state: not hooked up (nv40_set_feedback_state) */
+	pipe->set_framebuffer_state = nv40_set_framebuffer_state;
+	pipe->set_polygon_stipple   = nv40_set_polygon_stipple;
+	pipe->set_sampler_units     = nv40_set_sampler_units;
+	pipe->set_scissor_state     = nv40_set_scissor_state;
+	pipe->set_texture_state     = nv40_set_texture_state;
+	pipe->set_viewport_state    = nv40_set_viewport_state;
+
+	pipe->set_vertex_buffer  = nv40_set_vertex_buffer;
+	pipe->set_vertex_element = nv40_set_vertex_element;
+
+	/* set_feedback_buffer: not hooked up (nv40_set_feedback_buffer) */
+}
+
diff --git a/src/mesa/pipe/nv40/nv40_state.h b/src/mesa/pipe/nv40/nv40_state.h
new file mode 100644
index 0000000000..1535037f63
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_state.h
@@ -0,0 +1,173 @@
+#ifndef __NV40_STATE_H__
+#define __NV40_STATE_H__
+
+#include "pipe/p_state.h"
+
+/* Hardware form of the alpha-test CSO; the three words are pushed in this
+ * order starting at NV40TCL_ALPHA_TEST_ENABLE.
+ */
+struct nv40_alpha_test_state {
+	uint32_t enabled;	/* 0 or 1 */
+	uint32_t func;		/* translated comparison function */
+	uint32_t ref;		/* reference value converted to a byte */
+};
+
+/* Hardware form of the blend CSO: blending (b_*), logic op (l_*), colour
+ * write mask (c_mask) and dither enable (d_enable).
+ */
+struct nv40_blend_state {
+	uint32_t b_enable;	/* 0 or 1 */
+	uint32_t b_srcfunc;	/* alpha factor << 16 | rgb factor */
+	uint32_t b_dstfunc;	/* alpha factor << 16 | rgb factor */
+	uint32_t b_eqn;		/* alpha equation << 16 | rgb equation */
+
+	uint32_t l_enable;	/* 0 or 1 */
+	uint32_t l_op;		/* translated logic op */
+
+	uint32_t c_mask;	/* A/R/G/B enable bits at bits 24/16/8/0 */
+
+	uint32_t d_enable;	/* 0 or 1 */
+};
+
+/* Hardware form of the sampler CSO, as packed words. */
+struct nv40_sampler_state {
+	uint32_t wrap;	/* wrap modes: R << 16 | T << 8 | S */
+	uint32_t filt;	/* min filter << 16 | mag filter << 24 */
+	uint32_t bcol;	/* border colour, components 3,0,1,2 in bytes 3..0 */
+};
+
+/* Hardware form of the rasterizer CSO.  Fields are pushed one at a time
+ * by the bind function, so their order here is not significant.
+ */
+struct nv40_rasterizer_state {
+	uint32_t shade_model;	/* 0x1d00 when flat shading, else 0x1d01 */
+
+	uint32_t line_width;	/* line width * 8, one byte */
+	uint32_t line_smooth_en;
+	uint32_t line_stipple_en;
+	uint32_t line_stipple;	/* pattern << 16 | factor */
+
+	uint32_t point_size;	/* raw bits of the point size value */
+
+	uint32_t poly_smooth_en;
+	uint32_t poly_stipple_en;
+	
+	uint32_t poly_mode_front;
+	uint32_t poly_mode_back;
+
+	uint32_t front_face;	/* 0x0901 for CCW front winding, else 0x0900 */
+	uint32_t cull_face;	/* 0x0901 when culling CCW, else 0x0900 */
+	uint32_t cull_face_en;
+
+};
+
+/* Driver-side vertex program wrapping a pipe shader CSO. */
+struct nv40_vertex_program {
+	const struct pipe_shader_state *pipe;	/* source shader; not owned */
+
+	boolean translated;	/* translation is skipped once this is set */
+	boolean on_hw;
+	int start_ip;
+
+	uint32_t *insn;		/* NOTE(review): presumably the translated
+				 * instruction words - confirm in the
+				 * translator */
+	int       insn_len;
+
+	/* Constants used by the program.  'value' caches the last data
+	 * uploaded for hardware slot 'hw_id'; 'pipe_id' indexes the bound
+	 * constant buffer in units of four floats.
+	 */
+	struct {
+		int pipe_id;
+		int hw_id;
+		float value[4];
+	} consts[256];
+	int num_consts;
+
+	uint32_t ir;
+	uint32_t or;
+};
+
+/* Driver-side fragment program wrapping a pipe shader CSO. */
+struct nv40_fragment_program {
+	const struct pipe_shader_state *pipe;	/* source shader; not owned */
+
+	boolean translated;	/* translation is skipped once this is set */
+	boolean on_hw;		/* cleared when a constant inside insn[]
+				 * changes */
+
+	uint32_t *insn;		/* program words; constants are stored inline
+				 * at insn[hw_id] */
+	int       insn_len;
+
+	/* 'pipe_id' indexes the bound constant buffer in units of four
+	 * floats (-1 entries are skipped on update); 'hw_id' is the
+	 * position in insn[] where the value is patched.
+	 */
+	struct {
+		int pipe_id;
+		int hw_id;
+	} consts[256];
+	int num_consts;
+
+	struct pipe_buffer_handle *buffer;
+
+	boolean uses_kil;
+	boolean writes_depth;
+	int     num_regs;
+};
+
+/* Depth state laid out exactly as it is pushed: three consecutive words
+ * starting at NV40TCL_DEPTH_FUNC.  Do not reorder the fields.
+ */
+struct nv40_depth_push {
+	uint32_t func;
+	uint32_t write_enable;
+	uint32_t test_enable;
+};
+
+/* Stencil state for one face, laid out exactly as it is pushed: eight
+ * consecutive words.  Do not reorder the fields.
+ */
+struct nv40_stencil_push {
+	uint32_t enable;
+	uint32_t wmask;		/* write mask */
+	uint32_t func;		/* translated comparison function */
+	uint32_t ref;		/* reference value */
+	uint32_t vmask;		/* value mask */
+	uint32_t fail;		/* op on stencil fail */
+	uint32_t zfail;		/* op on depth fail */
+	uint32_t zpass;		/* op on depth pass */
+};
+
+/* Hardware form of the depth/stencil CSO.
+ *
+ * 'back' and 'front' must be distinct storage: the create function fills
+ * both, and the bind function pushes 8 words from each.  With a union the
+ * front values overwrote the back ones, so both faces were programmed
+ * with the front state.
+ */
+struct nv40_depth_stencil_state {
+	struct nv40_depth_push depth;
+	struct {
+		struct nv40_stencil_push back;
+		struct nv40_stencil_push front;
+	} stencil;
+};
+
+/* Map a PIPE_TEX_WRAP_* mode to the matching NV40TCL_TEX_WRAP_S_* value.
+ * Unknown modes map to REPEAT.
+ */
+static INLINE unsigned
+nv40_tex_wrap_mode(unsigned wrap) {
+	unsigned hw = NV40TCL_TEX_WRAP_S_REPEAT;
+
+	switch (wrap) {
+	case PIPE_TEX_WRAP_MIRROR_REPEAT:
+		hw = NV40TCL_TEX_WRAP_S_MIRRORED_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+		hw = NV40TCL_TEX_WRAP_S_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+		hw = NV40TCL_TEX_WRAP_S_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_CLAMP:
+		hw = NV40TCL_TEX_WRAP_S_CLAMP;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+		hw = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+		hw = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP:
+		hw = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP;
+		break;
+	case PIPE_TEX_WRAP_REPEAT:
+	default:
+		/* keep the REPEAT default */
+		break;
+	}
+
+	return hw;
+}
+
+/* Combine an image filter (f0, PIPE_TEX_FILTER_*) and a mip filter (f1,
+ * PIPE_TEX_MIPFILTER_*) into one NV40TCL_TEX_FILTER_MIN_* value.  An
+ * unknown image filter is treated as LINEAR, an unknown mip filter as
+ * NONE.
+ */
+static INLINE unsigned
+nv40_tex_filter(unsigned f0, unsigned f1) {
+	if (f0 == PIPE_TEX_FILTER_NEAREST) {
+		if (f1 == PIPE_TEX_MIPFILTER_NEAREST)
+			return NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_NEAREST;
+		if (f1 == PIPE_TEX_MIPFILTER_LINEAR)
+			return NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_LINEAR;
+		return NV40TCL_TEX_FILTER_MIN_NEAREST;
+	}
+
+	/* PIPE_TEX_FILTER_LINEAR and anything else */
+	if (f1 == PIPE_TEX_MIPFILTER_NEAREST)
+		return NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_NEAREST;
+	if (f1 == PIPE_TEX_MIPFILTER_LINEAR)
+		return NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_LINEAR;
+	return NV40TCL_TEX_FILTER_MIN_LINEAR;
+}
+
+#endif
diff --git a/src/mesa/pipe/nv40/nv40_state_emit.c b/src/mesa/pipe/nv40/nv40_state_emit.c
new file mode 100644
index 0000000000..a29c70538f
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_state_emit.c
@@ -0,0 +1,112 @@
+#include "nv40_context.h"
+#include "nv40_dma.h"
+#include "nv40_state.h"
+
+/*
+ * Make sure the current fragment program is translated, then refresh the
+ * constants embedded in its instruction stream from the bound constant
+ * buffer.  Whenever an embedded constant actually changes, fp->on_hw is
+ * cleared.
+ */
+static INLINE void
+nv40_state_update_fragprog(struct nv40_context *nv40)
+{
+	struct pipe_context *pipe = (struct pipe_context *)nv40;
+	struct nv40_fragment_program *fp = nv40->fragprog.fp;
+	float *cbuf;
+	int c;
+
+	if (!fp->translated)
+		nv40_fragprog_translate(nv40, fp);
+
+	if (!fp->num_consts)
+		return;
+
+	cbuf = pipe->winsys->buffer_map(pipe->winsys,
+					nv40->fragprog.constant_buf,
+					PIPE_BUFFER_FLAG_READ);
+
+	for (c = 0; c < fp->num_consts; c++) {
+		uint pid = fp->consts[c].pipe_id;
+
+		/* pipe_id of -1: entry is not backed by the constant buffer */
+		if (pid != -1 &&
+		    memcmp(&fp->insn[fp->consts[c].hw_id], &cbuf[pid*4],
+			   4 * sizeof(float))) {
+			/* value changed: patch the program copy */
+			memcpy(&fp->insn[fp->consts[c].hw_id], &cbuf[pid*4],
+			       4 * sizeof(float));
+			fp->on_hw = 0;
+		}
+	}
+
+	pipe->winsys->buffer_unmap(pipe->winsys, nv40->fragprog.constant_buf);
+}
+
+/*
+ * Make sure the current vertex program is translated, then upload its
+ * constants.
+ *
+ * Constants with a non-negative pipe_id are sourced from the bound
+ * constant buffer and are only re-uploaded when their value changed.
+ * Constants with a negative pipe_id carry their own value and are
+ * uploaded unconditionally.
+ */
+static INLINE void
+nv40_state_update_vertprog(struct nv40_context *nv40)
+{
+	struct pipe_context *pipe = (struct pipe_context *)nv40;
+	struct nv40_vertex_program *vp = nv40->vertprog.vp;
+	float *map;
+	int i;
+
+	if (!nv40->vertprog.vp->translated)
+		nv40_vertprog_translate(nv40, nv40->vertprog.vp);
+
+	if (vp->num_consts) {
+		map = pipe->winsys->buffer_map(pipe->winsys,
+					       nv40->vertprog.constant_buf,
+					       PIPE_BUFFER_FLAG_READ);
+		for (i = 0; i < vp->num_consts; i++) {
+			/* Must be signed: with an unsigned type the ">= 0"
+			 * test below is always true and a pipe_id of -1
+			 * would index far outside the mapped buffer.
+			 */
+			int pid = vp->consts[i].pipe_id;
+
+			if (pid >= 0) {
+				if (!memcmp(vp->consts[i].value, &map[pid*4],
+					    4 * sizeof(float)))
+					continue;
+				memcpy(vp->consts[i].value, &map[pid*4],
+				       4 * sizeof(float));
+			}
+
+			/* one constant slot id followed by its four values */
+			BEGIN_RING(curie, NV40TCL_VP_UPLOAD_CONST_ID, 5);
+			OUT_RING  (vp->consts[i].hw_id);
+			OUT_RINGp ((uint32_t *)vp->consts[i].value, 4);
+		}
+		pipe->winsys->buffer_unmap(pipe->winsys,
+					   nv40->vertprog.constant_buf);
+	}
+}
+
+/*
+ * Flush pending state changes, as recorded in nv40->dirty, to the
+ * hardware.  Handles fragment program, textures, vertex program and
+ * vertex arrays, in that order, clearing each dirty bit once handled.
+ */
+void
+nv40_emit_hw_state(struct nv40_context *nv40)
+{
+	if (nv40->dirty & NV40_NEW_FRAGPROG) {
+		struct nv40_fragment_program *cur = nv40->fragprog.fp;
+
+		nv40_state_update_fragprog(nv40);
+	
+		/* Clearing the bit here also skips the texture cache
+		 * control writes below unless NV40_NEW_TEXTURE is set.
+		 */
+		if (cur->on_hw)
+			nv40->dirty &= ~NV40_NEW_FRAGPROG;
+
+		if (!cur->on_hw || cur != nv40->fragprog.active_fp)
+			nv40_fragprog_bind(nv40, cur);
+	}
+
+	if (nv40->dirty & NV40_NEW_TEXTURE)
+		nv40_state_tex_update(nv40);
+
+	/* Two NV40TCL_TEX_CACHE_CTL writes (2, then 1) whenever textures or
+	 * an updated fragment program are still marked dirty.
+	 */
+	if (nv40->dirty & (NV40_NEW_TEXTURE | NV40_NEW_FRAGPROG)) {
+		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
+		OUT_RING  (2);
+		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
+		OUT_RING  (1);
+		nv40->dirty &= ~(NV40_NEW_TEXTURE | NV40_NEW_FRAGPROG);
+	}
+
+	if (nv40->dirty & NV40_NEW_VERTPROG) {
+		nv40_state_update_vertprog(nv40);
+		if (nv40->vertprog.vp != nv40->vertprog.active_vp)
+			nv40_vertprog_bind(nv40, nv40->vertprog.vp);
+		nv40->dirty &= ~NV40_NEW_VERTPROG;
+	}
+
+	if (nv40->dirty & NV40_NEW_ARRAYS) {
+		nv40_vbo_arrays_update(nv40);
+		nv40->dirty &= ~NV40_NEW_ARRAYS;
+	}
+}
+
diff --git a/src/mesa/pipe/nv40/nv40_state_tex.c b/src/mesa/pipe/nv40/nv40_state_tex.c
new file mode 100644
index 0000000000..a92d6250a2
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_state_tex.c
@@ -0,0 +1,140 @@
+#include "nv40_context.h"
+#include "nv40_dma.h"
+
+/*
+ * Expands to one brace-enclosed initialiser with four values: TRUE,
+ * PIPE_FORMAT_<m>, NV40TCL_TEX_FORMAT_FORMAT_<tf>, and a swizzle word
+ * built from the four S0 and four S1 selectors.
+ *
+ * Every selector is spelled with the *_S0_X_* / *_S1_X_* value names and
+ * then shifted into its own component's position (X, Y, Z or W).
+ */
+#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w)                        \
+{                                                                              \
+  TRUE,                                                                        \
+  PIPE_FORMAT_##m,                                                             \
+  NV40TCL_TEX_FORMAT_FORMAT_##tf,                                              \
+  (NV40TCL_TEX_SWIZZLE_S0_X_##ts0x << NV40TCL_TEX_SWIZZLE_S0_X_SHIFT) |        \
+  (NV40TCL_TEX_SWIZZLE_S0_X_##ts0y << NV40TCL_TEX_SWIZZLE_S0_Y_SHIFT) |        \
+  (NV40TCL_TEX_SWIZZLE_S0_X_##ts0z << NV40TCL_TEX_SWIZZLE_S0_Z_SHIFT) |        \
+  (NV40TCL_TEX_SWIZZLE_S0_X_##ts0w << NV40TCL_TEX_SWIZZLE_S0_W_SHIFT) |        \
+  (NV40TCL_TEX_SWIZZLE_S1_X_##ts1x << NV40TCL_TEX_SWIZZLE_S1_X_SHIFT) |        \
+  (NV40TCL_TEX_SWIZZLE_S1_X_##ts1y << NV40TCL_TEX_SWIZZLE_S1_Y_SHIFT) |        \
+  (NV40TCL_TEX_SWIZZLE_S1_X_##ts1z << NV40TCL_TEX_SWIZZLE_S1_Z_SHIFT) |        \
+  (NV40TCL_TEX_SWIZZLE_S1_X_##ts1w << NV40TCL_TEX_SWIZZLE_S1_W_SHIFT),         \
+}
+
+/* One entry of the pipe-format -> hardware texture format table. */
+struct nv40_texture_format {
+	boolean defined;	/* TRUE for every entry built by the _() macro */
+	uint	pipe;		/* PIPE_FORMAT_* this entry describes */
+	int     format;		/* NV40TCL_TEX_FORMAT_FORMAT_* value */
+	int     swizzle;	/* packed S0/S1 swizzle selectors */
+};
+
+/*
+ * Supported texture formats.  Columns: pipe format, hardware format,
+ * S0 selectors for x/y/z/w, S1 selectors for x/y/z/w.
+ *
+ * Note that the array contains no terminating entry with
+ * defined == FALSE.  The DXT rows are commented out; they pass twelve
+ * arguments where the _() macro takes ten.
+ */
+static struct nv40_texture_format
+nv40_texture_formats[] = {
+	_(U_A8_R8_G8_B8, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(U_A1_R5_G5_B5, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(U_A4_R4_G4_B4, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(U_R5_G6_B5   , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W),
+	_(U_L8         , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X),
+	_(U_A8         , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X),
+	_(U_I8         , L8      ,   S1,   S1,   S1,   S1, X, X, X, X),
+	_(U_A8_L8      , A8L8    ,   S1,   S1,   S1,   S1, Z, W, X, Y),
+//	_(RGB_DXT1    , 0x86,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0x00, 0x00),
+//	_(RGBA_DXT1   , 0x86,   S1,   S1,   S1,   S1, X, Y, Z, W, 0x00, 0x00),
+//	_(RGBA_DXT3   , 0x87,   S1,   S1,   S1,   S1, X, Y, Z, W, 0x00, 0x00),
+//	_(RGBA_DXT5   , 0x88,   S1,   S1,   S1,   S1, X, Y, Z, W, 0x00, 0x00),
+};
+
+/*
+ * Look up the hardware description of a pipe texture format.
+ *
+ * Returns the matching table entry, or NULL when the format is not
+ * supported.
+ *
+ * The scan is bounded by the array size rather than relying solely on a
+ * terminating "defined == FALSE" entry, so an unknown format can never
+ * walk off the end of nv40_texture_formats[].  An undefined entry, should
+ * one ever be added, still ends the search.
+ */
+static struct nv40_texture_format *
+nv40_tex_format(uint pipe_format)
+{
+	const unsigned nr = sizeof(nv40_texture_formats) /
+			    sizeof(nv40_texture_formats[0]);
+	unsigned i;
+
+	for (i = 0; i < nr; i++) {
+		struct nv40_texture_format *tf = &nv40_texture_formats[i];
+
+		if (!tf->defined)
+			break;
+		if (tf->pipe == pipe_format)
+			return tf;
+	}
+
+	return NULL;
+}
+
+/*
+ * Number of dimensions to program for a PIPE_TEXTURE_* target.  Cube
+ * maps count as 2; unknown targets are reported and treated as 2.
+ */
+static INLINE int
+nv40_tex_dims(uint pipe_target)
+{
+	int dims = 2;
+
+	switch (pipe_target) {
+	case PIPE_TEXTURE_1D:
+		dims = 1;
+		break;
+	case PIPE_TEXTURE_3D:
+		dims = 3;
+		break;
+	case PIPE_TEXTURE_2D:
+	case PIPE_TEXTURE_CUBE:
+		break;
+	default:
+		NOUVEAU_ERR("AII unknown pipe target: %d\n", pipe_target);
+		break;
+	}
+
+	return dims;
+}
+
+/*
+ * Emit the full texture state for one unit from the miptree and sampler
+ * currently bound to it.
+ *
+ * If the miptree's format has no entry in the format table an error is
+ * logged and nothing is emitted for the unit.
+ */
+static void
+nv40_tex_unit_enable(struct nv40_context *nv40, int unit)
+{
+	struct nouveau_winsys *nvws = nv40->nvws;
+	struct nv40_sampler_state *ps = nv40->tex_sampler[unit];
+	struct pipe_mipmap_tree *mt = nv40->tex_miptree[unit];
+	struct nv40_texture_format *tf;
+	uint32_t txf, txs, txp;
+	int swizzled = 0; /*XXX: implement in region code? */
+
+	/* nv40_tex_format() returns NULL for unsupported formats, so the
+	 * pointer itself has to be tested before looking at ->defined.
+	 */
+	tf = nv40_tex_format(mt->format);
+	if (!tf || !tf->defined) {
+		NOUVEAU_ERR("Unsupported texture format: 0x%x\n", mt->format);
+		return;
+	}
+
+	txf  = (tf->format | 0x80) << NV40TCL_TEX_FORMAT_FORMAT_SHIFT;
+	txf |= ((mt->last_level - mt->first_level + 1) <<
+		NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT);
+	if (1) /* XXX */
+		txf |= NV40TCL_TEX_FORMAT_NO_BORDER;
+
+	txf |= (nv40_tex_dims(mt->target) << NV40TCL_TEX_FORMAT_DIMS_SHIFT);
+	if (0) /*XXX*/
+		txf |= NV40TCL_TEX_FORMAT_RECT;
+
+	/* Only the linear path is reachable while swizzled is hard-wired
+	 * to 0: the pitch is programmed in bytes.
+	 */
+	if (swizzled) {
+		txp = 0;
+	} else {
+		txp  = mt->pitch * mt->cpp;
+		txf |= NV40TCL_TEX_FORMAT_LINEAR;
+	}
+
+	txs = tf->swizzle;
+	if (mt->format == PIPE_FORMAT_U_A8_L8)
+		txs |= (1<<16); /*nfi*/
+
+	BEGIN_RING(curie, NV40TCL_TEX_OFFSET(unit), 8);
+	OUT_RELOCl(mt->region->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
+		   NOUVEAU_BO_RD);
+	OUT_RELOCd(mt->region->buffer, txf, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
+		   NOUVEAU_BO_OR | NOUVEAU_BO_RD, NV40TCL_TEX_FORMAT_DMA0,
+		   NV40TCL_TEX_FORMAT_DMA1);
+	OUT_RING  (ps->wrap);
+	OUT_RING  (NV40TCL_TEX_ENABLE_ENABLE |
+		   (0x00078000) /* mipmap related? */);
+	OUT_RING  (txs);
+	OUT_RING  (ps->filt | 0x3fd6 /*voodoo*/);
+	OUT_RING  ((mt->width0 << NV40TCL_TEX_SIZE0_W_SHIFT) | mt->height0);
+	OUT_RING  (ps->bcol);
+	BEGIN_RING(curie, NV40TCL_TEX_SIZE1(unit), 1);
+	OUT_RING  ((mt->depth0 << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp);
+}
+
+/*
+ * Walk the set bits of nv40->tex_dirty, lowest first.  A unit with a
+ * bound miptree is (re)programmed; a unit without one is disabled.  Each
+ * bit is cleared once its unit has been handled.
+ */
+void
+nv40_state_tex_update(struct nv40_context *nv40)
+{
+	for (;;) {
+		int unit;
+
+		if (!nv40->tex_dirty)
+			break;
+
+		unit = ffs(nv40->tex_dirty) - 1;
+
+		if (!nv40->tex_miptree[unit]) {
+			BEGIN_RING(curie, NV40TCL_TEX_ENABLE(unit), 1);
+			OUT_RING  (0);
+		} else {
+			nv40_tex_unit_enable(nv40, unit);
+		}
+
+		nv40->tex_dirty &= ~(1 << unit);
+	}
+}
+
diff --git a/src/mesa/pipe/nv40/nv40_surface.c b/src/mesa/pipe/nv40/nv40_surface.c
new file mode 100644
index 0000000000..84e0d79268
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_surface.c
@@ -0,0 +1,229 @@
+
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "nv40_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_util.h"
+#include "pipe/p_winsys.h"
+#include "pipe/p_inlines.h"
+
+
+/*
+ * Clip the tile (x, y, w, h) against the surface ps.  Uses the variables
+ * x, y, w, h and ps of the function it is expanded in.  Returns from
+ * that function (without a value) when the tile origin lies outside the
+ * surface; otherwise shrinks w and h so the tile ends at the surface
+ * edge.
+ */
+#define CLIP_TILE \
+   do { \
+      if (x >= ps->width) \
+         return; \
+      if (y >= ps->height) \
+         return; \
+      if (x + w > ps->width) \
+         w = ps->width - x; \
+      if (y + h > ps->height) \
+         h = ps->height -y; \
+   } while(0)
+
+
+/**
+ * Read a tile from a mapped surface and convert it to RGBA floats.
+ *
+ * The tile is clipped to the surface; rows of the destination remain
+ * spaced by the requested (unclipped) width.  A8R8G8B8 pixels are
+ * unpacked per channel, S8_Z24 pixels replicate the normalised low 24
+ * bits into all four channels.  Any other format asserts.
+ *
+ * Note: this is exactly like a8r8g8b8_get_tile() in sp_surface.c
+ * Share it someday.
+ */
+static void
+nv40_get_tile_rgba(struct pipe_context *pipe,
+                   struct pipe_surface *ps,
+                   uint x, uint y, uint w, uint h, float *p)
+{
+   const unsigned *src
+      = ((const unsigned *) (ps->region->map + ps->offset))
+      + y * ps->region->pitch + x;
+   const unsigned dst_row = w * 4;   /* floats per output row, pre-clip */
+   unsigned row, col;
+
+   CLIP_TILE;
+
+   if (ps->format == PIPE_FORMAT_U_A8_R8_G8_B8) {
+      for (row = 0; row < h; row++) {
+         for (col = 0; col < w; col++) {
+            const unsigned texel = src[col];
+            float *out = p + col * 4;
+
+            out[0] = UBYTE_TO_FLOAT((texel >> 16) & 0xff);
+            out[1] = UBYTE_TO_FLOAT((texel >>  8) & 0xff);
+            out[2] = UBYTE_TO_FLOAT((texel >>  0) & 0xff);
+            out[3] = UBYTE_TO_FLOAT((texel >> 24) & 0xff);
+         }
+         src += ps->region->pitch;
+         p += dst_row;
+      }
+   }
+   else if (ps->format == PIPE_FORMAT_S8_Z24) {
+      const float scale = 1.0 / (float) 0xffffff;
+
+      for (row = 0; row < h; row++) {
+         for (col = 0; col < w; col++) {
+            const float z = (src[col] & 0xffffff) * scale;
+            float *out = p + col * 4;
+
+            out[3] = z;
+            out[2] = z;
+            out[1] = z;
+            out[0] = z;
+         }
+         src += ps->region->pitch;
+         p += dst_row;
+      }
+   }
+   else {
+      assert(0);
+   }
+}
+
+
+/*
+ * Counterpart of nv40_get_tile_rgba().  Not implemented: the body only
+ * asserts.
+ */
+static void
+nv40_put_tile_rgba(struct pipe_context *pipe,
+                   struct pipe_surface *ps,
+                   uint x, uint y, uint w, uint h, const float *p)
+{
+   /* TODO */
+   assert(0);
+}
+
+
+/*
+ * Copy a tile of raw pixels out of a mapped surface.
+ *
+ * The tile is clipped against the surface first.  dst_stride is the
+ * distance in bytes between destination rows; 0 means "tightly packed
+ * at the requested (unclipped) width".  Only the clipped width is copied
+ * per row, so the source is never read past the surface's right edge.
+ *
+ * XXX note: same as code in sp_surface.c
+ */
+static void
+nv40_get_tile(struct pipe_context *pipe,
+              struct pipe_surface *ps,
+              uint x, uint y, uint w, uint h,
+              void *p, int dst_stride)
+{
+   const uint cpp = ps->region->cpp;
+   const uint w0 = w;
+   const ubyte *pSrc;
+   ubyte *pDest;
+   uint i;
+
+   assert(ps->region->map);
+
+   CLIP_TILE;
+
+   /* The default stride follows the caller's tile width, not the
+    * clipped one, so the layout of p does not depend on clipping.
+    */
+   if (dst_stride == 0) {
+      dst_stride = w0 * cpp;
+   }
+
+   pSrc = ps->region->map + ps->offset + (y * ps->region->pitch + x) * cpp;
+   pDest = (ubyte *) p;
+
+   for (i = 0; i < h; i++) {
+      /* copy the clipped width only */
+      memcpy(pDest, pSrc, w * cpp);
+      pDest += dst_stride;
+      pSrc += ps->region->pitch * cpp;
+   }
+}
+
+
+/*
+ * Copy a tile of raw pixels into a mapped surface.
+ *
+ * The tile is clipped against the surface first.  src_stride is the
+ * distance in bytes between source rows; 0 means "tightly packed at the
+ * requested (unclipped) width".  Only the clipped width is written per
+ * row, so the surface is never written past its right edge.
+ *
+ * XXX note: same as code in sp_surface.c
+ */
+static void
+nv40_put_tile(struct pipe_context *pipe,
+              struct pipe_surface *ps,
+              uint x, uint y, uint w, uint h,
+              const void *p, int src_stride)
+{
+   const uint cpp = ps->region->cpp;
+   const uint w0 = w;
+   const ubyte *pSrc;
+   ubyte *pDest;
+   uint i;
+
+   assert(ps->region->map);
+
+   CLIP_TILE;
+
+   /* The default stride follows the caller's tile width, not the
+    * clipped one, so the layout of p does not depend on clipping.
+    */
+   if (src_stride == 0) {
+      src_stride = w0 * cpp;
+   }
+
+   pSrc = (const ubyte *) p;
+   pDest = ps->region->map + ps->offset + (y * ps->region->pitch + x) * cpp;
+
+   for (i = 0; i < h; i++) {
+      /* write the clipped width only */
+      memcpy(pDest, pSrc, w * cpp);
+      pDest += ps->region->pitch * cpp;
+      pSrc += src_stride;
+   }
+}
+
+
+/*
+ * Create a surface that refers to one image (face / level / z-slice) of
+ * a mipmap tree.  Returns whatever surface_alloc() returned, i.e. NULL
+ * on allocation failure.
+ *
+ * XXX note: same as code in sp_surface.c
+ */
+static struct pipe_surface *
+nv40_get_tex_surface(struct pipe_context *pipe,
+                     struct pipe_mipmap_tree *mt,
+                     unsigned face, unsigned level, unsigned zslice)
+{
+   struct pipe_surface *surf;
+   unsigned byte_offset = mt->level[level].level_offset;
+
+   switch (mt->target) {
+   case PIPE_TEXTURE_CUBE:
+      byte_offset += mt->level[level].image_offset[face] * mt->cpp;
+      break;
+   case PIPE_TEXTURE_3D:
+      byte_offset += mt->level[level].image_offset[zslice] * mt->cpp;
+      break;
+   default:
+      /* single-image targets */
+      assert(face == 0);
+      assert(zslice == 0);
+      break;
+   }
+
+   surf = pipe->winsys->surface_alloc(pipe->winsys, mt->format);
+   if (!surf)
+      return surf;
+
+   assert(surf->format);
+   assert(surf->refcount);
+   pipe_region_reference(&surf->region, mt->region);
+   surf->width = mt->level[level].width;
+   surf->height = mt->level[level].height;
+   surf->offset = byte_offset;
+
+   return surf;
+}
+
+
+/* Install this file's surface and tile entry points into the context. */
+void
+nv40_init_surface_functions(struct nv40_context *nv40)
+{
+   /* raw tile access */
+   nv40->pipe.put_tile = nv40_put_tile;
+   nv40->pipe.get_tile = nv40_get_tile;
+
+   /* float RGBA tile access */
+   nv40->pipe.put_tile_rgba = nv40_put_tile_rgba;
+   nv40->pipe.get_tile_rgba = nv40_get_tile_rgba;
+
+   /* surface creation */
+   nv40->pipe.get_tex_surface = nv40_get_tex_surface;
+}
diff --git a/src/mesa/pipe/nv40/nv40_vbo.c b/src/mesa/pipe/nv40/nv40_vbo.c
new file mode 100644
index 0000000000..aa930476b6
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_vbo.c
@@ -0,0 +1,222 @@
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "pipe/p_util.h"
+
+#include "nv40_context.h"
+#include "nv40_dma.h"
+#include "nv40_state.h"
+#include "nvgl_pipe.h"
+
+/*
+ * Draw `count` consecutive vertices starting at `start`.
+ *
+ * Each NV40TCL_VB_VERTEX_BATCH data word encodes (vertex count - 1) in
+ * its top byte and the first vertex in the low bits.  The remainder of
+ * count / 256 goes out first as one short batch, followed by the full
+ * 256-vertex batches.  The full batches are split into packets of at
+ * most 2047 data words each, so arbitrarily large arrays are handled
+ * instead of tripping an assertion.
+ *
+ * Pending state is emitted first and the pipe is flushed afterwards.
+ * Always returns TRUE.
+ */
+boolean
+nv40_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
+		 unsigned count)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	unsigned nr;
+
+	if (nv40->dirty)
+		nv40_emit_hw_state(nv40);
+
+	BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+	OUT_RING  (nvgl_primitive(mode));
+
+	/* leading partial batch: count % 256 vertices */
+	nr = (count & 0xff);
+	if (nr) {
+		BEGIN_RING(curie, NV40TCL_VB_VERTEX_BATCH, 1);
+		OUT_RING  (((nr - 1) << 24) | start);
+		start += nr;
+	}
+
+	/* full 256-vertex batches, at most 2047 per packet */
+	nr = count >> 8;
+	while (nr) {
+		unsigned push = MIN2(nr, 2047);
+
+		nr -= push;
+
+		BEGIN_RING_NI(curie, NV40TCL_VB_VERTEX_BATCH, push);
+		while (push--) {
+			OUT_RING(((0x100 - 1) << 24) | start);
+			start += 0x100;
+		}
+	}
+
+	BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+	OUT_RING  (0);
+
+	pipe->flush(pipe, PIPE_FLUSH_WAIT);
+	return TRUE;
+}
+
+/*
+ * Emit `count` 8-bit indices starting at element `start` of ib.
+ *
+ * An odd leading index is sent on its own through VB_ELEMENT_U32.  The
+ * rest is sent through VB_ELEMENT_U16 with two indices packed per data
+ * word, so a chunk of `push` indices occupies push / 2 words; the packet
+ * header must announce exactly that many words.
+ */
+static INLINE void
+nv40_draw_elements_u08(struct nv40_context *nv40, void *ib,
+		       unsigned start, unsigned count)
+{
+	uint8_t *elts = (uint8_t *)ib + start;
+	int push, i;
+
+	if (count & 1) {
+		BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1);
+		OUT_RING  (elts[0]);
+		elts++; count--;
+	}
+
+	while (count) {
+		/* count is even here, and so is the 2046 cap */
+		push = MIN2(count, 2046);
+
+		BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
+		for (i = 0; i < push; i+=2)
+			OUT_RING((elts[i+1] << 16) | elts[i]);
+
+		count -= push;
+		elts  += push;
+	}
+}
+
+/*
+ * Emit `count` 16-bit indices starting at element `start` of ib.
+ *
+ * An odd leading index is sent on its own through VB_ELEMENT_U32.  The
+ * rest is sent through VB_ELEMENT_U16 with two indices packed per data
+ * word, so a chunk of `push` indices occupies push / 2 words; the packet
+ * header must announce exactly that many words.
+ */
+static INLINE void
+nv40_draw_elements_u16(struct nv40_context *nv40, void *ib,
+		       unsigned start, unsigned count)
+{
+	uint16_t *elts = (uint16_t *)ib + start;
+	int push, i;
+
+	if (count & 1) {
+		BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1);
+		OUT_RING  (elts[0]);
+		elts++; count--;
+	}
+
+	while (count) {
+		/* count is even here, and so is the 2046 cap */
+		push = MIN2(count, 2046);
+
+		BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
+		for (i = 0; i < push; i+=2)
+			OUT_RING((elts[i+1] << 16) | elts[i]);
+
+		count -= push;
+		elts  += push;
+	}
+}
+
+/*
+ * Emit `count` 32-bit indices starting at element `start` of ib, one
+ * index per data word, in packets of at most 2047 words.
+ */
+static INLINE void
+nv40_draw_elements_u32(struct nv40_context *nv40, void *ib,
+		       unsigned start, unsigned count)
+{
+	uint32_t *cur = (uint32_t *)ib + start;
+	int chunk;
+
+	for (; count; count -= chunk, cur += chunk) {
+		chunk = MIN2(count, 2047);
+
+		BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U32, chunk);
+		OUT_RINGp    (cur, chunk);
+	}
+}
+
+/*
+ * Draw indexed primitives.
+ *
+ * indexBuffer is mapped for reading, its indices (1, 2 or 4 bytes each,
+ * per indexSize) are pushed between a BEGIN/END pair, and the buffer is
+ * unmapped again.  An unsupported index size is reported and draws
+ * nothing.
+ *
+ * Returns FALSE if the index buffer cannot be mapped, TRUE otherwise.
+ */
+boolean
+nv40_draw_elements(struct pipe_context *pipe,
+		   struct pipe_buffer_handle *indexBuffer, unsigned indexSize,
+		   unsigned mode, unsigned start, unsigned count)
+{
+	struct nv40_context *nv40 = (struct nv40_context *)pipe;
+	void *ib;
+
+	if (nv40->dirty)
+		nv40_emit_hw_state(nv40);
+
+	ib = pipe->winsys->buffer_map(pipe->winsys, indexBuffer,
+				      PIPE_BUFFER_FLAG_READ);
+	if (!ib) {
+		NOUVEAU_ERR("Couldn't map index buffer!!\n");
+		return FALSE;
+	}
+
+	BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+	OUT_RING  (nvgl_primitive(mode));
+
+	switch (indexSize) {
+	case 1:
+		nv40_draw_elements_u08(nv40, ib, start, count);
+		break;
+	case 2:
+		nv40_draw_elements_u16(nv40, ib, start, count);
+		break;
+	case 4:
+		nv40_draw_elements_u32(nv40, ib, start, count);
+		break;
+	default:
+		NOUVEAU_ERR("unsupported elt size %d\n", indexSize);
+		break;
+	}
+
+	BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+	OUT_RING  (0);
+
+	/* buffer_unmap() takes the buffer handle, like buffer_map() does,
+	 * not the CPU pointer that the map returned.
+	 */
+	pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer);
+	pipe->flush(pipe, PIPE_FLUSH_WAIT);
+	return TRUE;
+}
+
+/*
+ * Number of float components in a vertex element format.  Only the
+ * R32..R32G32B32A32 float formats are known; anything else is reported
+ * and treated as a single component.
+ */
+static INLINE int
+nv40_vbo_format_to_ncomp(uint format)
+{
+	if (format == PIPE_FORMAT_R32G32B32A32_FLOAT)
+		return 4;
+	if (format == PIPE_FORMAT_R32G32B32_FLOAT)
+		return 3;
+	if (format == PIPE_FORMAT_R32G32_FLOAT)
+		return 2;
+	if (format == PIPE_FORMAT_R32_FLOAT)
+		return 1;
+
+	NOUVEAU_ERR("AII, unknown vbo format %d\n", format);
+	return 1;
+}
+
+/*
+ * Program vertex buffer addresses and formats for every input the
+ * current vertex program reads (vp->ir is a bitmask of inputs).
+ *
+ * Slots 0 .. highest used input are written; unused slots in between
+ * get a zero address and a bare FLOAT format.  When the program reads no
+ * inputs at all there is nothing to program and the function returns
+ * without emitting anything.
+ */
+void
+nv40_vbo_arrays_update(struct nv40_context *nv40)
+{
+	struct nouveau_winsys *nvws = nv40->nvws;
+	struct nv40_vertex_program *vp = nv40->vertprog.vp;
+	uint32_t inputs, vtxfmt[16];
+	int hw, num_hw = 0;
+
+	/* num_hw = index of the highest input in use, plus one.  It starts
+	 * at 0 so that it is well defined when vp->ir has no bits set.
+	 */
+	inputs = vp->ir;
+	for (hw = 0; hw < 16 && inputs; hw++) {
+		if (inputs & (1 << hw)) {
+			num_hw = hw + 1;
+			inputs &= ~(1 << hw);
+		}
+	}
+
+	if (!num_hw)
+		return;
+
+	inputs = vp->ir;
+	BEGIN_RING(curie, NV40TCL_VTXBUF_ADDRESS(0), num_hw);
+	for (hw = 0; hw < num_hw; hw++) {
+		struct pipe_vertex_element *ve;
+		struct pipe_vertex_buffer *vb;
+
+		if (!(inputs & (1 << hw))) {
+			OUT_RING(0);
+			vtxfmt[hw] = NV40TCL_VTXFMT_TYPE_FLOAT;
+			continue;
+		}
+
+		ve = &nv40->vtxelt[hw];
+		vb = &nv40->vtxbuf[ve->vertex_buffer_index];
+
+		OUT_RELOC(vb->buffer, vb->buffer_offset + ve->src_offset,
+			  NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW |
+			  NOUVEAU_BO_OR | NOUVEAU_BO_RD, 0,
+			  NV40TCL_VTXBUF_ADDRESS_DMA1);
+		vtxfmt[hw] = ((vb->pitch << NV40TCL_VTXFMT_STRIDE_SHIFT) |
+			      (nv40_vbo_format_to_ncomp(ve->src_format) <<
+			       NV40TCL_VTXFMT_SIZE_SHIFT) |
+			      NV40TCL_VTXFMT_TYPE_FLOAT);
+	}
+
+	BEGIN_RING(curie, 0x1710, 1);
+	OUT_RING  (0); /* vtx cache flush */
+	BEGIN_RING(curie, NV40TCL_VTXFMT(0), num_hw);
+	OUT_RINGp (vtxfmt, num_hw);
+}
+
diff --git a/src/mesa/pipe/nv40/nv40_vertprog.c b/src/mesa/pipe/nv40/nv40_vertprog.c
new file mode 100644
index 0000000000..be550e4743
--- /dev/null
+++ b/src/mesa/pipe/nv40/nv40_vertprog.c
@@ -0,0 +1,594 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "pipe/tgsi/exec/tgsi_token.h"
+#include "pipe/tgsi/exec/tgsi_parse.h"
+
+#include "nv40_context.h"
+#include "nv40_dma.h"
+#include "nv40_state.h"
+
+/* Swizzle selectors and write-mask bits.  They are defined ahead of the
+ * nv40_shader.h include below (presumably that header consumes them,
+ * together with DEF_SCALE / DEF_CTEST - confirm there).
+ */
+#define SWZ_X 0
+#define SWZ_Y 1
+#define SWZ_Z 2
+#define SWZ_W 3
+#define MASK_X 8
+#define MASK_Y 4
+#define MASK_Z 2
+#define MASK_W 1
+#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W)
+#define DEF_SCALE 0
+#define DEF_CTEST 0
+#include "nv40_shader.h"
+
+/* Shorthands for source-register modifiers.  Note that abs() here is a
+ * macro on struct nv40_sreg and hides any function of the same name.
+ */
+#define swz(s,x,y,z,w) nv40_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w)
+#define neg(s) nv40_sr_neg((s))
+#define abs(s) nv40_sr_abs((s))
+
+/* Pre-encoded instruction words for passthrough_vp: eight words, i.e.
+ * two instructions of four words each.
+ */
+static uint32_t
+passthrough_vp_data[] = {
+	0x40041c6c, 0x0040010d, 0x8106c083, 0x6041ff84,
+	0x40041c6c, 0x0040000d, 0x8106c083, 0x6041ff81,
+};
+
+/* A ready-made vertex program built from the words above.  It is marked
+ * as already translated, has no pipe-level shader attached, and declares
+ * input bits 0-1 and output bit 0 as used.
+ */
+static struct nv40_vertex_program
+passthrough_vp = {
+	.pipe = NULL,
+	.translated = TRUE,
+	
+	.insn     = passthrough_vp_data,
+	.insn_len = sizeof(passthrough_vp_data) / sizeof(uint32_t),
+
+	.ir = 0x00000003,
+	.or = 0x00000001,
+};
+
+/* Working state while translating one TGSI vertex program. */
+struct nv40_vpc {
+	struct nv40_vertex_program *vp;	/* program being built */
+
+	/* TGSI output register index -> hardware destination index */
+	uint output_map[PIPE_MAX_SHADER_OUTPUTS];
+
+	int high_temp;		/* highest TGSI temporary index seen so far */
+	int temp_temp_count;	/* scratch temps handed out for the current
+				 * instruction */
+};
+
+/*
+ * Allocate a scratch temporary for the instruction being translated.
+ *
+ * Scratch temps are numbered upwards from just above the highest
+ * temporary the program itself has used so far.  high_temp is itself a
+ * register in use, hence the "+ 1": without it the first scratch temp
+ * would alias that program temporary.
+ */
+static INLINE struct nv40_sreg
+nv40_sr_temp(struct nv40_vpc *vpc)
+{
+	int idx;
+
+	idx  = vpc->temp_temp_count++;
+	idx += vpc->high_temp + 1;
+	return nv40_sr(0, NV40_VP_SRC_REG_TYPE_TEMP, idx);
+}
+
+/*
+ * Append a constant to the program's constant list and return a source
+ * register that refers to it.  `pipe` is stored as the entry's pipe_id,
+ * (x, y, z, w) as its initial value, and the list position doubles as
+ * its hw_id.
+ */
+static INLINE struct nv40_sreg
+nv40_sr_const(struct nv40_vpc *vpc, int pipe,
+	      float x, float y, float z, float w)
+{
+	struct nv40_vertex_program *vp = vpc->vp;
+	const int slot = vp->num_consts++;
+
+	vp->consts[slot].value[0] = x;
+	vp->consts[slot].value[1] = y;
+	vp->consts[slot].value[2] = z;
+	vp->consts[slot].value[3] = w;
+	vp->consts[slot].hw_id    = slot;
+	vp->consts[slot].pipe_id  = pipe;
+
+	return nv40_sr(0, NV40_VP_SRC_REG_TYPE_CONST, slot);
+}
+
+/* arith(): emit one instruction; `o` is pasted onto NV40_VP_INST_ to
+ * form the opcode, `s` selects the slot (0 or 1, see nv40_vp_arith).
+ * temp() / constant(): shorthands for nv40_sr_temp() / nv40_sr_const().
+ */
+#define arith(cc,s,o,d,m,s0,s1,s2) \
+	nv40_vp_arith((cc), (s), NV40_VP_INST_##o, (d), (m), (s0), (s1), (s2))
+#define temp(vpc) nv40_sr_temp((vpc))
+#define constant(v,p,x,y,z,w) nv40_sr_const((v), (p), (x), (y), (z), (w))
+
+/*
+ * Encode source operand number `pos` (0, 1 or 2) of the four-word
+ * instruction at hw.
+ *
+ * Input and constant indices go into dedicated fields of hw[1]; a
+ * temporary's index goes into the source descriptor itself.  Reading an
+ * input also records it in vp->ir.  The descriptor (type, index, negate,
+ * swizzle) is then stored in the bit range reserved for `pos`; sources 0
+ * and 2 are split across two words.
+ */
+static void
+emit_src(struct nv40_vpc *vpc, uint32_t *hw, int pos, struct nv40_sreg src)
+{
+	struct nv40_vertex_program *vp = vpc->vp;
+	uint32_t sr = 0;
+
+	sr |= (src.type << NV40_VP_SRC_REG_TYPE_SHIFT);
+	if (src.type == NV40_VP_SRC_REG_TYPE_INPUT) {
+		vp->ir |= (1 << src.index);
+		hw[1] |= (src.index << NV40_VP_INST_INPUT_SRC_SHIFT);
+	} else
+	if (src.type == NV40_VP_SRC_REG_TYPE_CONST) {
+		hw[1] |= (src.index << NV40_VP_INST_CONST_SRC_SHIFT);
+	} else {
+		sr |= (src.index << NV40_VP_SRC_TEMP_SRC_SHIFT);
+	}
+
+	if (src.negate)
+		sr |= NV40_VP_SRC_NEGATE;
+
+	/* absolute-value flags sit in hw[0], one bit per source from
+	 * bit 21 upwards
+	 */
+	if (src.abs)
+		hw[0] |= (1 << (21 + pos));
+
+	sr |= ((src.swz[0] << NV40_VP_SRC_SWZ_X_SHIFT) |
+	       (src.swz[1] << NV40_VP_SRC_SWZ_Y_SHIFT) |
+	       (src.swz[2] << NV40_VP_SRC_SWZ_Z_SHIFT) |
+	       (src.swz[3] << NV40_VP_SRC_SWZ_W_SHIFT));
+
+	switch (pos) {
+	case 0:
+		/* source 0 straddles hw[1] (high part) and hw[2] (low part) */
+		hw[1] |= ((sr & NV40_VP_SRC0_HIGH_MASK) >>
+			  NV40_VP_SRC0_HIGH_SHIFT) << NV40_VP_INST_SRC0H_SHIFT;
+		hw[2] |= (sr & NV40_VP_SRC0_LOW_MASK) <<
+			  NV40_VP_INST_SRC0L_SHIFT;
+		break;
+	case 1:
+		hw[2] |= sr << NV40_VP_INST_SRC1_SHIFT;
+		break;
+	case 2:
+		/* source 2 straddles hw[2] (high part) and hw[3] (low part) */
+		hw[2] |= ((sr & NV40_VP_SRC2_HIGH_MASK) >>
+			  NV40_VP_SRC2_HIGH_SHIFT) << NV40_VP_INST_SRC2H_SHIFT;
+		hw[3] |= (sr & NV40_VP_SRC2_LOW_MASK) <<
+			  NV40_VP_INST_SRC2L_SHIFT;
+		break;
+	default:
+		assert(0);
+	}
+}
+
+/*
+ * Encode the destination of the four-word instruction at hw.
+ *
+ * Temporary destination (dst.output == 0): the output-destination field
+ * in hw[3] is set to all ones and the temp index is written to the
+ * vector (slot 0, hw[0]) or scalar (other slots, hw[3]) temp field.
+ *
+ * Output destination: the written output is recorded in vp->or where a
+ * bit is assigned for it, dst.index goes into the destination field of
+ * hw[3], and the slot's result flag is set with its temp field filled
+ * with ones.
+ */
+static void
+emit_dst(struct nv40_vpc *vpc, uint32_t *hw, int slot, struct nv40_sreg dst)
+{
+	struct nv40_vertex_program *vp = vpc->vp;
+
+	if (dst.output == 0) {
+		hw[3] |= NV40_VP_INST_DEST_MASK;
+		if (slot == 0) {
+			hw[0] |= (dst.index <<
+				  NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
+		} else {
+			hw[3] |= (dst.index <<
+				  NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
+		}
+	} else {
+		/* destinations without a case here (e.g. position) leave
+		 * vp->or untouched
+		 */
+		switch (dst.index) {
+		case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
+		case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
+		case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
+		case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
+		case NV40_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break;
+		case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
+		case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
+		case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
+		case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
+		case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
+		case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
+		case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
+		case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
+		case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
+		default:
+		     break;
+		}
+
+		hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT);
+		if (slot == 0) {
+			hw[0] |= NV40_VP_INST_VEC_RESULT;
+			hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
+		} else {
+			hw[3] |= NV40_VP_INST_SCA_RESULT;
+			hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+		}
+	}
+}
+
+/*
+ * Append one instruction (four words) to the program.
+ *
+ * slot 0 issues `op` in the vector unit, any other slot in the scalar
+ * unit.  The unit that is not used has its destination temp field filled
+ * with ones.  The condition is always "true" with an identity swizzle.
+ *
+ * The words are only OR-ed into, so this presumably relies on vp->insn
+ * being zeroed beyond insn_len - confirm at the allocation site.
+ */
+static void
+nv40_vp_arith(struct nv40_vpc *vpc, int slot, int op,
+	      struct nv40_sreg dst, int mask,
+	      struct nv40_sreg s0, struct nv40_sreg s1,
+	      struct nv40_sreg s2)
+{
+	struct nv40_vertex_program *vp = vpc->vp;
+	uint32_t *hw = &vp->insn[vp->insn_len];
+
+	hw[0] |= (NV40_VP_INST_COND_TR << NV40_VP_INST_COND_SHIFT);
+	hw[0] |= ((0 << NV40_VP_INST_COND_SWZ_X_SHIFT) |
+		  (1 << NV40_VP_INST_COND_SWZ_Y_SHIFT) |
+		  (2 << NV40_VP_INST_COND_SWZ_Z_SHIFT) |
+		  (3 << NV40_VP_INST_COND_SWZ_W_SHIFT));
+
+	if (slot == 0) {
+		hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT);
+		hw[3] |= (NV40_VP_INST_SCA_RESULT |
+			  NV40_VP_INST_SCA_DEST_TEMP_MASK);
+		hw[3] |= (mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
+	} else {
+		hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT);
+		/* The vector destination temp field lives in hw[0] (see
+		 * emit_dst()); OR-ing its mask into hw[1] would instead
+		 * set bits among the opcode/source-index fields.
+		 */
+		hw[0] |= (NV40_VP_INST_VEC_DEST_TEMP_MASK | (1 << 20));
+		hw[3] |= (mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
+	}
+
+	emit_dst(vpc, hw, slot, dst);
+	emit_src(vpc, hw, 0, s0);
+	emit_src(vpc, hw, 1, s1);
+	emit_src(vpc, hw, 2, s2);
+
+	vp->insn_len += 4;
+}
+
+/*
+ * Convert a TGSI source operand into an nv40_sreg.
+ *
+ * Inputs and temporaries map by index; every constant reference appends
+ * a new entry to the program's constant list.  Absolute, negate and
+ * swizzle modifiers are copied over.  An unsupported register file is
+ * reported and yields input 0 as a well-defined placeholder instead of
+ * an uninitialised register.
+ */
+static INLINE struct nv40_sreg
+tgsi_src(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
+	struct nv40_sreg src;
+
+	switch (fsrc->SrcRegister.File) {
+	case TGSI_FILE_INPUT:
+		src = nv40_sr(0, NV40_VP_SRC_REG_TYPE_INPUT,
+			      fsrc->SrcRegister.Index);
+		break;
+	case TGSI_FILE_CONSTANT:
+		src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0);
+		break;
+	case TGSI_FILE_TEMPORARY:
+		/* track the highest temp so scratch temps can go above it */
+		if (vpc->high_temp < fsrc->SrcRegister.Index)
+			vpc->high_temp = fsrc->SrcRegister.Index;
+		src = nv40_sr(0, NV40_VP_SRC_REG_TYPE_TEMP,
+			      fsrc->SrcRegister.Index);
+		break;
+	default:
+		NOUVEAU_ERR("bad src file\n");
+		src = nv40_sr(0, NV40_VP_SRC_REG_TYPE_INPUT, 0);
+		break;
+	}
+
+	src.abs = fsrc->SrcRegisterExtMod.Absolute;
+	src.negate = fsrc->SrcRegister.Negate;
+	src.swz[0] = fsrc->SrcRegister.SwizzleX;
+	src.swz[1] = fsrc->SrcRegister.SwizzleY;
+	src.swz[2] = fsrc->SrcRegister.SwizzleZ;
+	src.swz[3] = fsrc->SrcRegister.SwizzleW;
+	return src;
+}
+
+/*
+ * Convert a TGSI destination operand into an nv40_sreg.
+ *
+ * Outputs are translated through vpc->output_map; temporaries map by
+ * index and update high_temp.  An unsupported register file is reported
+ * and yields temporary 0 rather than a register built from
+ * uninitialised values.
+ */
+static INLINE struct nv40_sreg
+tgsi_dst(struct nv40_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
+	uint out = 0, idx = 0;
+
+	switch (fdst->DstRegister.File) {
+	case TGSI_FILE_OUTPUT:
+		out = 1;
+		idx = vpc->output_map[fdst->DstRegister.Index];
+		break;
+	case TGSI_FILE_TEMPORARY:
+		out = 0;
+		idx = fdst->DstRegister.Index;
+		if (vpc->high_temp < idx)
+			vpc->high_temp = idx;
+		break;
+	default:
+		NOUVEAU_ERR("bad dst file\n");
+		break;
+	}
+
+	return nv40_sr(out, NV40_VP_SRC_REG_TYPE_TEMP, idx);
+}
+
+/* Convert a TGSI_WRITEMASK_* bit set into the MASK_* bit set. */
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	return ((tgsi & TGSI_WRITEMASK_X) ? MASK_X : 0) |
+	       ((tgsi & TGSI_WRITEMASK_Y) ? MASK_Y : 0) |
+	       ((tgsi & TGSI_WRITEMASK_Z) ? MASK_Z : 0) |
+	       ((tgsi & TGSI_WRITEMASK_W) ? MASK_W : 0);
+}
+
+/*
+ * Translate one TGSI instruction into hardware instruction(s).
+ *
+ * Sources are gathered in two passes: temporaries first, then inputs and
+ * constants.  Only one distinct input index and one distinct constant
+ * index are used directly per instruction; any further one is first
+ * copied into a scratch temporary with a MOV.
+ *
+ * Returns TRUE on success (RET is accepted and emits nothing), FALSE for
+ * an unsupported source file or opcode.
+ */
+static boolean
+nv40_vertprog_parse_instruction(struct nv40_vpc *vpc,
+				const struct tgsi_full_instruction *finst)
+{
+	struct nv40_sreg src[3], dst, tmp;
+	/* placeholder passed for operand slots an opcode does not use */
+	struct nv40_sreg none = nv40_sr(0, NV40_VP_SRC_REG_TYPE_INPUT, 0);
+	int mask;
+	/* input / constant index already used directly, -1 if none yet */
+	int ai = -1, ci = -1;
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_RET)
+		return TRUE;
+
+	/* scratch temps are handed out afresh for every instruction */
+	vpc->temp_temp_count = 0;
+	/* pass 1: temporaries */
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+			src[i] = tgsi_src(vpc, fsrc);
+		}
+	}
+
+	/* pass 2: inputs and constants */
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+			if (ai == -1 || ai == fsrc->SrcRegister.Index) {
+				ai = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				NOUVEAU_MSG("extra src attr %d\n",
+					 fsrc->SrcRegister.Index);
+				src[i] = temp(vpc);
+				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_CONSTANT:
+			if (ci == -1 || ci == fsrc->SrcRegister.Index) {
+				ci = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+			/* handled above */
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(vpc, &finst->FullDstRegisters[0]);
+	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
+
+	/* Slot 0 opcodes take their operands in source positions 0/1 (ADD
+	 * and SUB use 0 and 2); slot 1 opcodes take theirs in position 2.
+	 */
+	switch (finst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]);
+		break;
+	case TGSI_OPCODE_ARL:
+		arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_EXP:
+		arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LIT:
+		arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LOG:
+		arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_POW:
+		/* pow(a, b) as EX2(b * LG2(a)), using the x components */
+		tmp = temp(vpc);
+		arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none,
+		      swz(src[0], X, X, X, X));
+		arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X),
+		      swz(src[1], X, X, X, X), none);
+		arith(vpc, 1, OP_EX2, dst, mask, none, none,
+		      swz(tmp, X, X, X, X));
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_RSQ:
+		arith(vpc, 1, OP_RSQ, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB:
+		/* a - b as a + (-b) */
+		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1]));
+		break;
+	case TGSI_OPCODE_XPD:
+		/* cross product in two steps: tmp = a.zxy * b.yzx, then
+		 * dst.xyz = a.yzx * b.zxy - tmp; w is never written
+		 */
+		tmp = temp(vpc);
+		arith(vpc, 0, OP_MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+/*
+ * Resolve a TGSI output declaration to an NV40 vertex program destination.
+ *
+ * The declaration's semantic name/index pair selects one of the
+ * NV40_VP_INST_DEST_* values, which is then stored in vpc->output_map at the
+ * slot given by the first register of the declaration's range.  Only that
+ * first register is mapped here.
+ *
+ * Returns TRUE on success, or FALSE (after logging) when the semantic name or
+ * index is not one this function recognises.
+ */
+static boolean
+nv40_vertprog_parse_decl_output(struct nv40_vpc *vpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	const unsigned sem_index = fdec->Semantic.SemanticIndex;
+	int dest;
+
+	switch (fdec->Semantic.SemanticName) {
+	case TGSI_SEMANTIC_POSITION:
+		dest = NV40_VP_INST_DEST_POS;
+		break;
+	case TGSI_SEMANTIC_FOG:
+		dest = NV40_VP_INST_DEST_FOGC;
+		break;
+	case TGSI_SEMANTIC_PSIZE:
+		dest = NV40_VP_INST_DEST_PSZ;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		/* Front-facing colours: only indices 0 and 1 are accepted. */
+		switch (sem_index) {
+		case 0:
+			dest = NV40_VP_INST_DEST_COL0;
+			break;
+		case 1:
+			dest = NV40_VP_INST_DEST_COL1;
+			break;
+		default:
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_BCOLOR:
+		/* Back-facing colours: only indices 0 and 1 are accepted. */
+		switch (sem_index) {
+		case 0:
+			dest = NV40_VP_INST_DEST_BFC0;
+			break;
+		case 1:
+			dest = NV40_VP_INST_DEST_BFC1;
+			break;
+		default:
+			NOUVEAU_ERR("bad bcolour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_GENERIC:
+		/* Generic outputs go to the texcoord destinations 0..7. */
+		if (sem_index > 7) {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		dest = NV40_VP_INST_DEST_TC(sem_index);
+		break;
+	default:
+		NOUVEAU_ERR("bad output semantic\n");
+		return FALSE;
+	}
+
+	vpc->output_map[fdec->u.DeclarationRange.First] = dest;
+	return TRUE;
+}
+
+/*
+ * Translate the TGSI token stream of a vertex program into NV40 hardware
+ * instructions.
+ *
+ * A fresh instruction buffer is allocated into vp->insn and filled by walking
+ * the tokens in vp->pipe->tokens: output declarations are mapped to hardware
+ * destinations, instructions are emitted, and immediates/other tokens are
+ * currently ignored.  On success the final instruction is tagged with
+ * NV40_VP_INST_LAST and vp->translated is set to TRUE.
+ *
+ * On any failure (allocation failure, unsupported declaration or opcode, or a
+ * program that produced no instructions) the function returns without setting
+ * vp->translated; callers are expected to test that flag.
+ *
+ * nv40 is not used by this function.
+ */
+void
+nv40_vertprog_translate(struct nv40_context *nv40,
+			struct nv40_vertex_program *vp)
+{
+	struct tgsi_parse_context parse;
+	struct nv40_vpc *vpc;
+
+	vpc = calloc(1, sizeof(*vpc));
+	if (!vpc)
+		return;
+
+	/* Fixed-size buffer: 128 groups of 4 dwords (nv40_vertprog_bind
+	 * uploads vp->insn in groups of 4 dwords).
+	 */
+	vp->insn = calloc(1, 128*4*sizeof(uint32_t));
+	if (!vp->insn) {
+		/* The parser has not been initialised yet, so only the
+		 * compile state needs releasing.
+		 */
+		free(vpc);
+		return;
+	}
+	vpc->vp = vp;
+	vpc->high_temp = -1;
+
+	tgsi_parse_init(&parse, vp->pipe->tokens);
+
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		switch (parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+			fdec = &parse.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_OUTPUT:
+				if (!nv40_vertprog_parse_decl_output(vpc, fdec))
+					goto out_err;
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+			break;
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+
+			finst = &parse.FullToken.FullInstruction;
+			if (!nv40_vertprog_parse_instruction(vpc, finst))
+				goto out_err;
+		}
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* Without at least one emitted word there is no final instruction to
+	 * tag, and indexing insn_len - 1 would fall outside the buffer.
+	 */
+	if (vp->insn_len <= 0) {
+		NOUVEAU_ERR("vertex program contains no instructions\n");
+		goto out_err;
+	}
+
+	vp->insn[vp->insn_len - 1] |= NV40_VP_INST_LAST;
+#if 0
+	{
+		int i;
+		for (i = 0; i < vp->insn_len; i++)
+			NOUVEAU_ERR("inst[%d] = 0x%08x\n", i, vp->insn[i]);
+	}
+#endif
+	vp->translated = TRUE;
+out_err:
+	tgsi_parse_free(&parse);
+	free(vpc);
+}
+
+/*
+ * Make a vertex program the active one on the hardware.
+ *
+ * If vp was never successfully translated, the file-level passthrough_vp is
+ * bound in its place.  When the chosen program is not flagged as resident
+ * (on_hw), its instruction words are uploaded starting at instruction id 0
+ * and the previously active program, if any, is flagged as no longer
+ * resident.  The start id and the vp->ir / vp->or values are emitted on every
+ * call, and nv40->vertprog.active_vp is updated to the bound program.
+ */
+void
+nv40_vertprog_bind(struct nv40_context *nv40, struct nv40_vertex_program *vp)
+{ 
+	int i;
+
+	if (!vp->translated) {
+		NOUVEAU_ERR("vtxprog invalid, using passthrough shader\n");
+		vp = &passthrough_vp;
+	}
+
+	if (!vp->on_hw) {
+		/* Every upload starts at id 0, so the program that was
+		 * resident before is overwritten and must be re-uploaded if
+		 * it is bound again.
+		 */
+		if (nv40->vertprog.active_vp)
+			nv40->vertprog.active_vp->on_hw = FALSE;
+		vp->on_hw    = TRUE;
+		vp->start_ip = 0;
+
+		BEGIN_RING(curie, NV40TCL_VP_UPLOAD_FROM_ID, 1);
+		OUT_RING  (vp->start_ip);
+		/* Push the program 4 dwords at a time.  The same
+		 * UPLOAD_INST(0) method is used for each group --
+		 * NOTE(review): presumably the hardware advances its upload
+		 * position automatically; confirm against the method docs.
+		 */
+		for (i = 0; i < vp->insn_len; i += 4) {
+			BEGIN_RING(curie, NV40TCL_VP_UPLOAD_INST(0), 4);
+			OUT_RINGp (&vp->insn[i], 4);
+		}
+	}
+
+	BEGIN_RING(curie, NV40TCL_VP_START_FROM_ID, 1);
+	OUT_RING  (vp->start_ip);
+	/* NOTE(review): ir/or look like input/output attribute masks, based
+	 * on the VP_ATTRIB_EN method name -- confirm.
+	 */
+	BEGIN_RING(curie, NV40TCL_VP_ATTRIB_EN, 2);
+	OUT_RING  (vp->ir);
+	OUT_RING  (vp->or);
+
+	nv40->vertprog.active_vp = vp;
+}
diff --git a/src/mesa/pipe/nv40/nvgl_pipe.h b/src/mesa/pipe/nv40/nvgl_pipe.h
new file mode 100644
index 0000000000..15ff318023
--- /dev/null
+++ b/src/mesa/pipe/nv40/nvgl_pipe.h
@@ -0,0 +1,198 @@
+#ifndef __NVGL_PIPE_H__
+#define __NVGL_PIPE_H__
+
+#include <GL/gl.h>
+
+/*
+ * Convert a PIPE_BLENDFACTOR_* value to the matching GL blend factor enum.
+ * Unrecognised factors map to GL_ONE.
+ */
+static INLINE unsigned
+nvgl_blend_func(unsigned factor)
+{
+	switch (factor) {
+	case PIPE_BLENDFACTOR_ONE:                return GL_ONE;
+	case PIPE_BLENDFACTOR_SRC_COLOR:          return GL_SRC_COLOR;
+	case PIPE_BLENDFACTOR_SRC_ALPHA:          return GL_SRC_ALPHA;
+	case PIPE_BLENDFACTOR_DST_ALPHA:          return GL_DST_ALPHA;
+	case PIPE_BLENDFACTOR_DST_COLOR:          return GL_DST_COLOR;
+	case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: return GL_SRC_ALPHA_SATURATE;
+	case PIPE_BLENDFACTOR_CONST_COLOR:        return GL_CONSTANT_COLOR;
+	case PIPE_BLENDFACTOR_CONST_ALPHA:        return GL_CONSTANT_ALPHA;
+	case PIPE_BLENDFACTOR_ZERO:               return GL_ZERO;
+	case PIPE_BLENDFACTOR_INV_SRC_COLOR:      return GL_ONE_MINUS_SRC_COLOR;
+	case PIPE_BLENDFACTOR_INV_SRC_ALPHA:      return GL_ONE_MINUS_SRC_ALPHA;
+	case PIPE_BLENDFACTOR_INV_DST_ALPHA:      return GL_ONE_MINUS_DST_ALPHA;
+	case PIPE_BLENDFACTOR_INV_DST_COLOR:      return GL_ONE_MINUS_DST_COLOR;
+	case PIPE_BLENDFACTOR_INV_CONST_COLOR:    return GL_ONE_MINUS_CONSTANT_COLOR;
+	case PIPE_BLENDFACTOR_INV_CONST_ALPHA:    return GL_ONE_MINUS_CONSTANT_ALPHA;
+	default:                                  return GL_ONE;
+	}
+}
+
+/*
+ * Convert a PIPE_BLEND_* equation to the matching GL blend equation enum.
+ * Unrecognised equations map to GL_FUNC_ADD.
+ */
+static INLINE unsigned
+nvgl_blend_eqn(unsigned func)
+{
+	switch (func) {
+	case PIPE_BLEND_ADD:              return GL_FUNC_ADD;
+	case PIPE_BLEND_SUBTRACT:         return GL_FUNC_SUBTRACT;
+	case PIPE_BLEND_REVERSE_SUBTRACT: return GL_FUNC_REVERSE_SUBTRACT;
+	case PIPE_BLEND_MIN:              return GL_MIN;
+	case PIPE_BLEND_MAX:              return GL_MAX;
+	default:                          return GL_FUNC_ADD;
+	}
+}
+
+/*
+ * Convert a PIPE_LOGICOP_* value to the matching GL logic op enum.
+ * Unrecognised operations map to GL_CLEAR.
+ */
+static INLINE unsigned
+nvgl_logicop_func(unsigned func)
+{
+	switch (func) {
+	case PIPE_LOGICOP_CLEAR:         return GL_CLEAR;
+	case PIPE_LOGICOP_NOR:           return GL_NOR;
+	case PIPE_LOGICOP_AND_INVERTED:  return GL_AND_INVERTED;
+	case PIPE_LOGICOP_COPY_INVERTED: return GL_COPY_INVERTED;
+	case PIPE_LOGICOP_AND_REVERSE:   return GL_AND_REVERSE;
+	case PIPE_LOGICOP_INVERT:        return GL_INVERT;
+	case PIPE_LOGICOP_XOR:           return GL_XOR;
+	case PIPE_LOGICOP_NAND:          return GL_NAND;
+	case PIPE_LOGICOP_AND:           return GL_AND;
+	case PIPE_LOGICOP_EQUIV:         return GL_EQUIV;
+	case PIPE_LOGICOP_NOOP:          return GL_NOOP;
+	case PIPE_LOGICOP_OR_INVERTED:   return GL_OR_INVERTED;
+	case PIPE_LOGICOP_COPY:          return GL_COPY;
+	case PIPE_LOGICOP_OR_REVERSE:    return GL_OR_REVERSE;
+	case PIPE_LOGICOP_OR:            return GL_OR;
+	case PIPE_LOGICOP_SET:           return GL_SET;
+	default:                         return GL_CLEAR;
+	}
+}
+
+/*
+ * Convert a PIPE_FUNC_* comparison to the matching GL comparison enum.
+ * Unrecognised comparisons map to GL_NEVER.
+ */
+static INLINE unsigned
+nvgl_comparison_op(unsigned op)
+{
+	switch (op) {
+	case PIPE_FUNC_NEVER:    return GL_NEVER;
+	case PIPE_FUNC_LESS:     return GL_LESS;
+	case PIPE_FUNC_EQUAL:    return GL_EQUAL;
+	case PIPE_FUNC_LEQUAL:   return GL_LEQUAL;
+	case PIPE_FUNC_GREATER:  return GL_GREATER;
+	case PIPE_FUNC_NOTEQUAL: return GL_NOTEQUAL;
+	case PIPE_FUNC_GEQUAL:   return GL_GEQUAL;
+	case PIPE_FUNC_ALWAYS:   return GL_ALWAYS;
+	default:                 return GL_NEVER;
+	}
+}
+
+/*
+ * Convert a PIPE_POLYGON_MODE_* value to the matching GL polygon mode enum.
+ * Unrecognised modes map to GL_FILL.
+ */
+static INLINE unsigned
+nvgl_polygon_mode(unsigned mode)
+{
+	switch (mode) {
+	case PIPE_POLYGON_MODE_FILL:  return GL_FILL;
+	case PIPE_POLYGON_MODE_LINE:  return GL_LINE;
+	case PIPE_POLYGON_MODE_POINT: return GL_POINT;
+	default:                      return GL_FILL;
+	}
+}
+
+/*
+ * Convert a PIPE_STENCIL_OP_* value to the matching GL stencil op enum.
+ * Unrecognised operations map to GL_KEEP.
+ */
+static INLINE unsigned
+nvgl_stencil_op(unsigned op)
+{
+	switch (op) {
+	case PIPE_STENCIL_OP_KEEP:      return GL_KEEP;
+	case PIPE_STENCIL_OP_ZERO:      return GL_ZERO;
+	case PIPE_STENCIL_OP_REPLACE:   return GL_REPLACE;
+	case PIPE_STENCIL_OP_INCR:      return GL_INCR;
+	case PIPE_STENCIL_OP_DECR:      return GL_DECR;
+	case PIPE_STENCIL_OP_INCR_WRAP: return GL_INCR_WRAP;
+	case PIPE_STENCIL_OP_DECR_WRAP: return GL_DECR_WRAP;
+	case PIPE_STENCIL_OP_INVERT:    return GL_INVERT;
+	default:                        return GL_KEEP;
+	}
+}
+
+/*
+ * Convert a PIPE_PRIM_* primitive type to the value this driver emits for it:
+ * the matching GL primitive enum plus one.  Unrecognised primitives map to
+ * GL_POINTS + 1.
+ *
+ * NOTE(review): the +1 presumably reflects the hardware's primitive numbering
+ * being offset by one from GL's -- confirm against the hardware method docs.
+ */
+static INLINE unsigned
+nvgl_primitive(unsigned prim)
+{
+	switch (prim) {
+	case PIPE_PRIM_POINTS:         return GL_POINTS + 1;
+	case PIPE_PRIM_LINES:          return GL_LINES + 1;
+	case PIPE_PRIM_LINE_LOOP:      return GL_LINE_LOOP + 1;
+	case PIPE_PRIM_LINE_STRIP:     return GL_LINE_STRIP + 1;
+	case PIPE_PRIM_TRIANGLES:      return GL_TRIANGLES + 1;
+	case PIPE_PRIM_TRIANGLE_STRIP: return GL_TRIANGLE_STRIP + 1;
+	case PIPE_PRIM_TRIANGLE_FAN:   return GL_TRIANGLE_FAN + 1;
+	case PIPE_PRIM_QUADS:          return GL_QUADS + 1;
+	case PIPE_PRIM_QUAD_STRIP:     return GL_QUAD_STRIP + 1;
+	case PIPE_PRIM_POLYGON:        return GL_POLYGON + 1;
+	default:                       return GL_POINTS + 1;
+	}
+}
+
+#endif
author	Ben Skeggs <skeggsb@gmail.com>	2007-11-18 17:08:06 +1100
committer	Ben Skeggs <skeggsb@gmail.com>	2007-11-18 17:34:06 +1100
commit	2f33b5b56e9221f2613b34cd1a1a9d82d5ed4303 (patch)
tree	9bcdd27b60eaf4c3d608b4dd2f582fcee7c39f11 /src/mesa/pipe/nv40
parent	193c85ec7a1aec44eebc67c6224fb6ecbb4607a5 (diff)