From 5351e7270f71dbcea9f790d4904327da87f931ba Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Sun, 3 Feb 2008 23:08:06 +0100
Subject: Add NV63.

---
 src/mesa/pipe/nv40/nv40_context.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/pipe/nv40/nv40_context.c b/src/mesa/pipe/nv40/nv40_context.c
index d0d9583e32..1351a79fe0 100644
--- a/src/mesa/pipe/nv40/nv40_context.c
+++ b/src/mesa/pipe/nv40/nv40_context.c
@@ -190,7 +190,7 @@ nv40_init_hwctx(struct nv40_context *nv40, int curie_class)
 
 #define NV4X_GRCLASS4097_CHIPSETS 0x00000baf
 #define NV4X_GRCLASS4497_CHIPSETS 0x00005450
-#define NV6X_GRCLASS4497_CHIPSETS 0x00000080
+#define NV6X_GRCLASS4497_CHIPSETS 0x00000088
 
 struct pipe_context *
 nv40_create(struct pipe_winsys *pipe_winsys, struct nouveau_winsys *nvws,
-- 
cgit v1.2.3


From f71400876b1469ce7b080cbddb5dde4f61ed78b7 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Mon, 4 Feb 2008 17:31:45 +0100
Subject: nouveau: update to latest header.

---
 src/mesa/pipe/nouveau/nouveau_class.h | 53 +++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/src/mesa/pipe/nouveau/nouveau_class.h b/src/mesa/pipe/nouveau/nouveau_class.h
index e3c284095d..95f646a991 100644
--- a/src/mesa/pipe/nouveau/nouveau_class.h
+++ b/src/mesa/pipe/nouveau/nouveau_class.h
@@ -15,10 +15,10 @@
    Stephane Marchesin,
    Serge Martin,
    Sylvain Munaut,
+   Simon Raffeiner,
    Ben Skeggs,
    Erik Waling,
    koala_br,
-   sturmflut.
 
 All Rights Reserved.
 
@@ -1390,10 +1390,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define   NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_MASK				0x70000000
 #define    NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_NEAREST				0x10000000
 #define    NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_LINEAR				0x20000000
-#define    NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_NEAREST_MIPMAP_NEAREST		0x30000000
-#define    NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_LINEAR_MIPMAP_NEAREST		0x40000000
-#define    NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_NEAREST_MIPMAP_LINEAR		0x50000000
-#define    NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_LINEAR_MIPMAP_LINEAR		0x60000000
 #define   NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MAGNIFY_ENABLE			(1 << 31)
 #define  NV04_DX5_TEXTURED_TRIANGLE_BLEND						0x00000310
 #define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_TEXTURE_MAP_SHIFT				0
@@ -1853,10 +1849,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define   NV10TCL_TX_FILTER_MAGNIFY_MASK						0xf0000000
 #define    NV10TCL_TX_FILTER_MAGNIFY_NEAREST						0x10000000
 #define    NV10TCL_TX_FILTER_MAGNIFY_LINEAR						0x20000000
-#define    NV10TCL_TX_FILTER_MAGNIFY_NEAREST_MIPMAP_NEAREST				0x30000000
-#define    NV10TCL_TX_FILTER_MAGNIFY_LINEAR_MIPMAP_NEAREST				0x40000000
-#define    NV10TCL_TX_FILTER_MAGNIFY_NEAREST_MIPMAP_LINEAR				0x50000000
-#define    NV10TCL_TX_FILTER_MAGNIFY_LINEAR_MIPMAP_LINEAR				0x60000000
 #define  NV10TCL_TX_PALETTE_OFFSET(x)							(0x00000250+((x)*4))
 #define  NV10TCL_TX_PALETTE_OFFSET__SIZE						0x00000002
 #define  NV10TCL_RC_IN_ALPHA(x)								(0x00000260+((x)*4))
@@ -4033,6 +4025,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV34TCL_FOG_EQUATION_LINEAR							0x000008d4
 #define  NV34TCL_FOG_EQUATION_QUADRATIC							0x000008d8
 #define  NV34TCL_FP_ACTIVE_PROGRAM							0x000008e4
+#define   NV34TCL_FP_ACTIVE_PROGRAM_DMA0						(1 <<  0)
+#define   NV34TCL_FP_ACTIVE_PROGRAM_DMA1						(1 <<  1)
+#define   NV34TCL_FP_ACTIVE_PROGRAM_OFFSET_SHIFT					2
+#define   NV34TCL_FP_ACTIVE_PROGRAM_OFFSET_MASK						0xfffffffc
 #define  NV34TCL_RC_COLOR0								0x000008ec
 #define   NV34TCL_RC_COLOR0_B_SHIFT							0
 #define   NV34TCL_RC_COLOR0_B_MASK							0x000000ff
@@ -4187,8 +4183,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV34TCL_FRONT_MATERIAL_SHININESS(x)						(0x00001400+((x)*4))
 #define  NV34TCL_FRONT_MATERIAL_SHININESS__SIZE						0x00000006
 #define  NV34TCL_FP_REG_CONTROL								0x00001450
-#define   NV34TCL_FP_REG_CONTROL_USED_REGS_SHIFT					16
-#define   NV34TCL_FP_REG_CONTROL_USED_REGS_MASK						0xffff0000
+#define   NV34TCL_FP_REG_CONTROL_UNK1_SHIFT						16
+#define   NV34TCL_FP_REG_CONTROL_UNK1_MASK						0xffff0000
+#define   NV34TCL_FP_REG_CONTROL_UNK0_SHIFT						0
+#define   NV34TCL_FP_REG_CONTROL_UNK0_MASK						0x0000ffff
 #define  NV34TCL_VP_CLIP_PLANES_ENABLE							0x00001478
 #define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0						(1 <<  1)
 #define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1						(1 <<  5)
@@ -4331,9 +4329,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV34TCL_TX_OFFSET__SIZE							0x00000004
 #define  NV34TCL_TX_FORMAT(x)								(0x00001a04+((x)*32))
 #define  NV34TCL_TX_FORMAT__SIZE							0x00000004
+#define   NV34TCL_TX_FORMAT_DMA0							(1 <<  0)
+#define   NV34TCL_TX_FORMAT_DMA1							(1 <<  1)
 #define   NV34TCL_TX_FORMAT_CUBE_MAP							(1 <<  2)
-#define   NV34TCL_TX_FORMAT_COMPONENTS_SHIFT						4
-#define   NV34TCL_TX_FORMAT_COMPONENTS_MASK						0x000000f0
+#define   NV34TCL_TX_FORMAT_DIMS_SHIFT							4
+#define   NV34TCL_TX_FORMAT_DIMS_MASK							0x000000f0
+#define    NV34TCL_TX_FORMAT_DIMS_1D							0x00000010
+#define    NV34TCL_TX_FORMAT_DIMS_2D							0x00000020
+#define    NV34TCL_TX_FORMAT_DIMS_3D							0x00000030
 #define   NV34TCL_TX_FORMAT_FORMAT_SHIFT						8
 #define   NV34TCL_TX_FORMAT_FORMAT_MASK							0x0000ff00
 #define    NV34TCL_TX_FORMAT_FORMAT_L8							0x00000000
@@ -4458,10 +4461,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define   NV34TCL_TX_FILTER_MAGNIFY_MASK						0x0f000000
 #define    NV34TCL_TX_FILTER_MAGNIFY_NEAREST						0x01000000
 #define    NV34TCL_TX_FILTER_MAGNIFY_LINEAR						0x02000000
-#define    NV34TCL_TX_FILTER_MAGNIFY_NEAREST_MIPMAP_NEAREST				0x03000000
-#define    NV34TCL_TX_FILTER_MAGNIFY_LINEAR_MIPMAP_NEAREST				0x04000000
-#define    NV34TCL_TX_FILTER_MAGNIFY_NEAREST_MIPMAP_LINEAR				0x05000000
-#define    NV34TCL_TX_FILTER_MAGNIFY_LINEAR_MIPMAP_LINEAR				0x06000000
+#define   NV34TCL_TX_FILTER_SIGNED_BLUE							(1 << 28)
+#define   NV34TCL_TX_FILTER_SIGNED_GREEN						(1 << 29)
+#define   NV34TCL_TX_FILTER_SIGNED_RED							(1 << 30)
+#define   NV34TCL_TX_FILTER_SIGNED_ALPHA						(1 << 31)
 #define  NV34TCL_TX_NPOT_SIZE(x)							(0x00001a18+((x)*32))
 #define  NV34TCL_TX_NPOT_SIZE__SIZE							0x00000004
 #define   NV34TCL_TX_NPOT_SIZE_H_SHIFT							0
@@ -4488,6 +4491,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV34TCL_VERTEX_ATTR_4F_W__SIZE							0x00000010
 #define  NV34TCL_FP_CONTROL								0x00001d60
 #define   NV34TCL_FP_CONTROL_USES_KIL							(1 <<  7)
+#define   NV34TCL_FP_CONTROL_USED_REGS_MINUS1_DIV2_SHIFT				0
+#define   NV34TCL_FP_CONTROL_USED_REGS_MINUS1_DIV2_MASK					0x0000000f
 #define  NV34TCL_MULTISAMPLE_CONTROL							0x00001d7c
 #define  NV34TCL_CLEAR_DEPTH_VALUE							0x00001d8c
 #define  NV34TCL_CLEAR_COLOR_VALUE							0x00001d90
@@ -5074,6 +5079,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define    NV40TCL_TEX_FORMAT_FORMAT_Z24						0x00001000
 #define    NV40TCL_TEX_FORMAT_FORMAT_Z16						0x00001200
 #define    NV40TCL_TEX_FORMAT_FORMAT_HILO8						0x00001800
+#define    NV40TCL_TEX_FORMAT_FORMAT_RGBA16F						0x00001a00
+#define    NV40TCL_TEX_FORMAT_FORMAT_RGBA32F						0x00001b00
 #define   NV40TCL_TEX_FORMAT_DIMS_SHIFT							4
 #define   NV40TCL_TEX_FORMAT_DIMS_MASK							0x000000f0
 #define    NV40TCL_TEX_FORMAT_DIMS_1D							0x00000010
@@ -5202,10 +5209,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define   NV40TCL_TEX_FILTER_MAG_MASK							0x0f000000
 #define    NV40TCL_TEX_FILTER_MAG_NEAREST						0x01000000
 #define    NV40TCL_TEX_FILTER_MAG_LINEAR						0x02000000
-#define    NV40TCL_TEX_FILTER_MAG_NEAREST_MIPMAP_NEAREST				0x03000000
-#define    NV40TCL_TEX_FILTER_MAG_LINEAR_MIPMAP_NEAREST					0x04000000
-#define    NV40TCL_TEX_FILTER_MAG_NEAREST_MIPMAP_LINEAR					0x05000000
-#define    NV40TCL_TEX_FILTER_MAG_LINEAR_MIPMAP_LINEAR					0x06000000
 #define  NV40TCL_TEX_SIZE0(x)								(0x00001a18+((x)*32))
 #define  NV40TCL_TEX_SIZE0__SIZE							0x00000010
 #define   NV40TCL_TEX_SIZE0_H_SHIFT							0
@@ -6064,6 +6067,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV50TCL_CLEAR_BUFFERS								0x000019d0
 #define  NV50TCL_COLOR_MASK(x)								(0x00001a00+((x)*4))
 #define  NV50TCL_COLOR_MASK__SIZE							0x00000008
+#define   NV50TCL_COLOR_MASK_R_SHIFT							0
+#define   NV50TCL_COLOR_MASK_R_MASK							0x0000000f
+#define   NV50TCL_COLOR_MASK_G_SHIFT							4
+#define   NV50TCL_COLOR_MASK_G_MASK							0x000000f0
+#define   NV50TCL_COLOR_MASK_B_SHIFT							8
+#define   NV50TCL_COLOR_MASK_B_MASK							0x00000f00
+#define   NV50TCL_COLOR_MASK_A_SHIFT							12
+#define   NV50TCL_COLOR_MASK_A_MASK							0x0000f000
 
 
 #define NV50_COMPUTE									0x000050c0
-- 
cgit v1.2.3


From 2effa9b36cc47ca3fc0acc21908f5bb132eca3e7 Mon Sep 17 00:00:00 2001
From: Maarten Maathuis <madman2003@gmail.com>
Date: Wed, 13 Feb 2008 22:57:09 +0100
Subject: nv40: Avoid a nasty array overflow leading to a corrupt memory
 pointer.

---
 src/mesa/pipe/nv40/nv40_fragprog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/pipe/nv40/nv40_fragprog.c b/src/mesa/pipe/nv40/nv40_fragprog.c
index 714634396d..14897f9798 100644
--- a/src/mesa/pipe/nv40/nv40_fragprog.c
+++ b/src/mesa/pipe/nv40/nv40_fragprog.c
@@ -716,7 +716,7 @@ nv40_fragprog_translate(struct nv40_context *nv40,
 			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
 			assert(fpc->nr_imm < MAX_IMM);
 
-			for (i = 0; i < imm->Immediate.Size; i++)
+			for (i = 0; i < (imm->Immediate.Size - 1); i++)
 				vals[i] = imm->u.ImmediateFloat32[i].Float;
 			fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals);
 		}
-- 
cgit v1.2.3


From 73b3a29b16fe5d798026db4eeabb8d33bb6c2cb0 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Sun, 10 Feb 2008 19:14:02 +0100
Subject: Hook nv30 into the build.

---
 src/mesa/pipe/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/pipe/Makefile b/src/mesa/pipe/Makefile
index 6012b2bcea..2b03b5c244 100644
--- a/src/mesa/pipe/Makefile
+++ b/src/mesa/pipe/Makefile
@@ -10,7 +10,7 @@ ifeq ($(CONFIG_NAME), linux-llvm)
 LLVM_DIR = llvm
 endif
 
-SUBDIRS = softpipe i915simple i965simple nv40 nv50 failover pipebuffer \
+SUBDIRS = softpipe i915simple i965simple nv30 nv40 nv50 failover pipebuffer \
 	  $(CELL_DIR) $(LLVM_DIR)
 
 
-- 
cgit v1.2.3


From e713cb26c9adeff4e35a9b2cac35e7c025ef72b4 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Fri, 15 Feb 2008 02:23:56 +0100
Subject: nouveau: Update to latest header.

---
 src/mesa/pipe/nouveau/nouveau_class.h | 46 +++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/src/mesa/pipe/nouveau/nouveau_class.h b/src/mesa/pipe/nouveau/nouveau_class.h
index 95f646a991..5998945677 100644
--- a/src/mesa/pipe/nouveau/nouveau_class.h
+++ b/src/mesa/pipe/nouveau/nouveau_class.h
@@ -3692,16 +3692,16 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV34TCL_DMA_QUERY								0x000001a8
 #define  NV34TCL_DMA_IN_MEMORY7								0x000001ac
 #define  NV34TCL_DMA_IN_MEMORY8								0x000001b0
-#define  NV34TCL_VIEWPORT_HORIZ								0x00000200
-#define   NV34TCL_VIEWPORT_HORIZ_X_SHIFT						0
-#define   NV34TCL_VIEWPORT_HORIZ_X_MASK							0x0000ffff
-#define   NV34TCL_VIEWPORT_HORIZ_W_SHIFT						16
-#define   NV34TCL_VIEWPORT_HORIZ_W_MASK							0xffff0000
-#define  NV34TCL_VIEWPORT_VERT								0x00000204
-#define   NV34TCL_VIEWPORT_VERT_Y_SHIFT							0
-#define   NV34TCL_VIEWPORT_VERT_Y_MASK							0x0000ffff
-#define   NV34TCL_VIEWPORT_VERT_H_SHIFT							16
-#define   NV34TCL_VIEWPORT_VERT_H_MASK							0xffff0000
+#define  NV34TCL_RT_HORIZ								0x00000200
+#define   NV34TCL_RT_HORIZ_X_SHIFT							0
+#define   NV34TCL_RT_HORIZ_X_MASK							0x0000ffff
+#define   NV34TCL_RT_HORIZ_W_SHIFT							16
+#define   NV34TCL_RT_HORIZ_W_MASK							0xffff0000
+#define  NV34TCL_RT_VERT								0x00000204
+#define   NV34TCL_RT_VERT_Y_SHIFT							0
+#define   NV34TCL_RT_VERT_Y_MASK							0x0000ffff
+#define   NV34TCL_RT_VERT_H_SHIFT							16
+#define   NV34TCL_RT_VERT_H_MASK							0xffff0000
 #define  NV34TCL_RT_FORMAT								0x00000208
 #define   NV34TCL_RT_FORMAT_TYPE_SHIFT							8
 #define   NV34TCL_RT_FORMAT_TYPE_MASK							0x00000f00
@@ -4078,17 +4078,27 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV34TCL_RC_OUT_ALPHA__SIZE							0x00000008
 #define  NV34TCL_RC_OUT_RGB(x)								(0x00000914+((x)*32))
 #define  NV34TCL_RC_OUT_RGB__SIZE							0x00000008
+#define  NV34TCL_VIEWPORT_HORIZ								0x00000a00
+#define   NV34TCL_VIEWPORT_HORIZ_X_SHIFT						0
+#define   NV34TCL_VIEWPORT_HORIZ_X_MASK							0x0000ffff
+#define   NV34TCL_VIEWPORT_HORIZ_W_SHIFT						16
+#define   NV34TCL_VIEWPORT_HORIZ_W_MASK							0xffff0000
+#define  NV34TCL_VIEWPORT_VERT								0x00000a04
+#define   NV34TCL_VIEWPORT_VERT_Y_SHIFT							0
+#define   NV34TCL_VIEWPORT_VERT_Y_MASK							0x0000ffff
+#define   NV34TCL_VIEWPORT_VERT_H_SHIFT							16
+#define   NV34TCL_VIEWPORT_VERT_H_MASK							0xffff0000
 #define  NV34TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_R			0x00000a10
 #define  NV34TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_G			0x00000a14
 #define  NV34TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_B			0x00000a18
-#define  NV34TCL_VIEWPORT_SCALE0_X							0x00000a20
-#define  NV34TCL_VIEWPORT_SCALE0_Y							0x00000a24
-#define  NV34TCL_VIEWPORT_SCALE0_Z							0x00000a28
-#define  NV34TCL_VIEWPORT_SCALE0_W							0x00000a2c
-#define  NV34TCL_VIEWPORT_SCALE1_X							0x00000a30
-#define  NV34TCL_VIEWPORT_SCALE1_Y							0x00000a34
-#define  NV34TCL_VIEWPORT_SCALE1_Z							0x00000a38
-#define  NV34TCL_VIEWPORT_SCALE1_W							0x00000a3c
+#define  NV34TCL_VIEWPORT_TRANSLATE_X							0x00000a20
+#define  NV34TCL_VIEWPORT_TRANSLATE_Y							0x00000a24
+#define  NV34TCL_VIEWPORT_TRANSLATE_Z							0x00000a28
+#define  NV34TCL_VIEWPORT_TRANSLATE_W							0x00000a2c
+#define  NV34TCL_VIEWPORT_SCALE_X							0x00000a30
+#define  NV34TCL_VIEWPORT_SCALE_Y							0x00000a34
+#define  NV34TCL_VIEWPORT_SCALE_Z							0x00000a38
+#define  NV34TCL_VIEWPORT_SCALE_W							0x00000a3c
 #define  NV34TCL_POLYGON_OFFSET_FILL_ENABLE						0x00000a60
 #define  NV34TCL_POLYGON_OFFSET_LINE_ENABLE						0x00000a64
 #define  NV34TCL_POLYGON_OFFSET_POINT_ENABLE						0x00000a68
-- 
cgit v1.2.3


From e538dc52c13eb9225afd8cb6c1099e97e723a558 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Fri, 15 Feb 2008 02:25:17 +0100
Subject: nouveau: hook in nv30.

---
 src/mesa/drivers/dri/nouveau_winsys/Makefile         | 1 +
 src/mesa/drivers/dri/nouveau_winsys/nouveau_winsys.c | 3 +++
 src/mesa/pipe/nouveau/nouveau_winsys.h               | 3 +++
 3 files changed, 7 insertions(+)

diff --git a/src/mesa/drivers/dri/nouveau_winsys/Makefile b/src/mesa/drivers/dri/nouveau_winsys/Makefile
index f547ec4376..59ba561eb9 100644
--- a/src/mesa/drivers/dri/nouveau_winsys/Makefile
+++ b/src/mesa/drivers/dri/nouveau_winsys/Makefile
@@ -8,6 +8,7 @@ MINIGLX_SOURCES =
 
 PIPE_DRIVERS = \
 	$(TOP)/src/mesa/pipe/softpipe/libsoftpipe.a \
+	$(TOP)/src/mesa/pipe/nv30/libnv30.a \
 	$(TOP)/src/mesa/pipe/nv40/libnv40.a \
 	$(TOP)/src/mesa/pipe/nv50/libnv50.a
 
diff --git a/src/mesa/drivers/dri/nouveau_winsys/nouveau_winsys.c b/src/mesa/drivers/dri/nouveau_winsys/nouveau_winsys.c
index 8ffe89feec..1494bd48dd 100644
--- a/src/mesa/drivers/dri/nouveau_winsys/nouveau_winsys.c
+++ b/src/mesa/drivers/dri/nouveau_winsys/nouveau_winsys.c
@@ -80,6 +80,9 @@ nouveau_pipe_create(struct nouveau_context *nv)
 		return NULL;
 
 	switch (nv->chipset & 0xf0) {
+	case 0x30:
+		hw_create = nv30_create;
+		break;
 	case 0x40:
 	case 0x60:
 		hw_create = nv40_create;
diff --git a/src/mesa/pipe/nouveau/nouveau_winsys.h b/src/mesa/pipe/nouveau/nouveau_winsys.h
index 0b394cfce6..818ae9afae 100644
--- a/src/mesa/pipe/nouveau/nouveau_winsys.h
+++ b/src/mesa/pipe/nouveau/nouveau_winsys.h
@@ -49,6 +49,9 @@ struct nouveau_winsys {
 			    unsigned, unsigned, unsigned, unsigned, unsigned);
 };
 
+extern struct pipe_context *
+nv30_create(struct pipe_winsys *, struct nouveau_winsys *, unsigned chipset);
+
 extern struct pipe_context *
 nv40_create(struct pipe_winsys *, struct nouveau_winsys *, unsigned chipset);
 
-- 
cgit v1.2.3


From 583f424d61d8080079a55f3c962f647b795d9337 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Fri, 15 Feb 2008 02:36:28 +0100
Subject: nouveau: add nv30.

---
 src/mesa/pipe/nv30/Makefile          |  29 ++
 src/mesa/pipe/nv30/nv30_clear.c      |  12 +
 src/mesa/pipe/nv30/nv30_context.c    | 429 ++++++++++++++++++
 src/mesa/pipe/nv30/nv30_context.h    | 136 ++++++
 src/mesa/pipe/nv30/nv30_dma.h        |  66 +++
 src/mesa/pipe/nv30/nv30_draw.c       |  62 +++
 src/mesa/pipe/nv30/nv30_fragprog.c   | 834 +++++++++++++++++++++++++++++++++++
 src/mesa/pipe/nv30/nv30_fragtex.c    | 160 +++++++
 src/mesa/pipe/nv30/nv30_miptree.c    | 105 +++++
 src/mesa/pipe/nv30/nv30_query.c      | 112 +++++
 src/mesa/pipe/nv30/nv30_shader.h     | 490 ++++++++++++++++++++
 src/mesa/pipe/nv30/nv30_state.c      | 740 +++++++++++++++++++++++++++++++
 src/mesa/pipe/nv30/nv30_state.h      | 147 ++++++
 src/mesa/pipe/nv30/nv30_state_emit.c |  83 ++++
 src/mesa/pipe/nv30/nv30_surface.c    | 136 ++++++
 src/mesa/pipe/nv30/nv30_vbo.c        | 406 +++++++++++++++++
 src/mesa/pipe/nv30/nv30_vertprog.c   | 778 ++++++++++++++++++++++++++++++++
 17 files changed, 4725 insertions(+)
 create mode 100644 src/mesa/pipe/nv30/Makefile
 create mode 100644 src/mesa/pipe/nv30/nv30_clear.c
 create mode 100644 src/mesa/pipe/nv30/nv30_context.c
 create mode 100644 src/mesa/pipe/nv30/nv30_context.h
 create mode 100644 src/mesa/pipe/nv30/nv30_dma.h
 create mode 100644 src/mesa/pipe/nv30/nv30_draw.c
 create mode 100644 src/mesa/pipe/nv30/nv30_fragprog.c
 create mode 100644 src/mesa/pipe/nv30/nv30_fragtex.c
 create mode 100644 src/mesa/pipe/nv30/nv30_miptree.c
 create mode 100644 src/mesa/pipe/nv30/nv30_query.c
 create mode 100644 src/mesa/pipe/nv30/nv30_shader.h
 create mode 100644 src/mesa/pipe/nv30/nv30_state.c
 create mode 100644 src/mesa/pipe/nv30/nv30_state.h
 create mode 100644 src/mesa/pipe/nv30/nv30_state_emit.c
 create mode 100644 src/mesa/pipe/nv30/nv30_surface.c
 create mode 100644 src/mesa/pipe/nv30/nv30_vbo.c
 create mode 100644 src/mesa/pipe/nv30/nv30_vertprog.c

diff --git a/src/mesa/pipe/nv30/Makefile b/src/mesa/pipe/nv30/Makefile
new file mode 100644
index 0000000000..dd4b7e73cd
--- /dev/null
+++ b/src/mesa/pipe/nv30/Makefile
@@ -0,0 +1,29 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = nv30
+
+DRIVER_SOURCES = \
+	nv30_clear.c \
+	nv30_context.c \
+	nv30_draw.c \
+	nv30_fragprog.c \
+	nv30_fragtex.c \
+	nv30_miptree.c \
+	nv30_query.c \
+	nv30_state.c \
+	nv30_state_emit.c \
+	nv30_surface.c \
+	nv30_vbo.c \
+	nv30_vertprog.c
+
+C_SOURCES = \
+	$(COMMON_SOURCES) \
+	$(DRIVER_SOURCES)
+
+ASM_SOURCES = 
+
+include ../Makefile.template
+
+symlinks:
+
diff --git a/src/mesa/pipe/nv30/nv30_clear.c b/src/mesa/pipe/nv30/nv30_clear.c
new file mode 100644
index 0000000000..71f413588e
--- /dev/null
+++ b/src/mesa/pipe/nv30/nv30_clear.c
@@ -0,0 +1,12 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "nv30_context.h"
+
+void
+nv30_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+	   unsigned clearValue)
+{
+	pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue);
+}
diff --git a/src/mesa/pipe/nv30/nv30_context.c b/src/mesa/pipe/nv30/nv30_context.c
new file mode 100644
index 0000000000..c56f918ad9
--- /dev/null
+++ b/src/mesa/pipe/nv30/nv30_context.c
@@ -0,0 +1,429 @@
+#include "pipe/draw/draw_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+#include "pipe/p_util.h"
+
+#include "nv30_context.h"
+
+static const char *
+nv30_get_name(struct pipe_context *pipe)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	static char buffer[128];
+
+	snprintf(buffer, sizeof(buffer), "NV%02X", nv30->chipset);
+	return buffer;
+}
+
+static const char *
+nv30_get_vendor(struct pipe_context *pipe)
+{
+	return "nouveau";
+}
+
+static int
+nv30_get_param(struct pipe_context *pipe, int param)
+{
+	switch (param) {
+	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+		return 16;
+	case PIPE_CAP_NPOT_TEXTURES:
+		return 0;
+	case PIPE_CAP_TWO_SIDED_STENCIL:
+		return 1;
+	case PIPE_CAP_GLSL:
+		return 0;
+	case PIPE_CAP_S3TC:
+		return 0;
+	case PIPE_CAP_ANISOTROPIC_FILTER:
+		return 1;
+	case PIPE_CAP_POINT_SPRITE:
+		return 1;
+	case PIPE_CAP_MAX_RENDER_TARGETS:
+		return 2;
+	case PIPE_CAP_OCCLUSION_QUERY:
+		return 1;
+	case PIPE_CAP_TEXTURE_SHADOW_MAP:
+		return 1;
+	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+		return 13;
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+		return 10;
+	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+		return 13;
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0;
+	}
+}
+
+static float
+nv30_get_paramf(struct pipe_context *pipe, int param)
+{
+	switch (param) {
+	case PIPE_CAP_MAX_LINE_WIDTH:
+	case PIPE_CAP_MAX_LINE_WIDTH_AA:
+		return 10.0;
+	case PIPE_CAP_MAX_POINT_WIDTH:
+	case PIPE_CAP_MAX_POINT_WIDTH_AA:
+		return 64.0;
+	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+		return 16.0;
+	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+		return 4.0;
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0.0;
+	}
+}
+
+static void
+nv30_flush(struct pipe_context *pipe, unsigned flags)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nouveau_winsys *nvws = nv30->nvws;
+	
+	if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
+		BEGIN_RING(rankine, 0x1fd8, 1);
+		OUT_RING  (2);
+		BEGIN_RING(rankine, 0x1fd8, 1);
+		OUT_RING  (1);
+	}
+
+	if (flags & PIPE_FLUSH_WAIT) {
+		nvws->notifier_reset(nv30->sync, 0);
+		BEGIN_RING(rankine, 0x104, 1);
+		OUT_RING  (0);
+		BEGIN_RING(rankine, 0x100, 1);
+		OUT_RING  (0);
+	}
+
+	FIRE_RING();
+
+	if (flags & PIPE_FLUSH_WAIT)
+		nvws->notifier_wait(nv30->sync, 0, 0, 2000);
+}
+
+static void
+nv30_destroy(struct pipe_context *pipe)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nouveau_winsys *nvws = nv30->nvws;
+
+	if (nv30->draw)
+		draw_destroy(nv30->draw);
+
+	nvws->res_free(&nv30->vertprog.exec_heap);
+	nvws->res_free(&nv30->vertprog.data_heap);
+
+	nvws->res_free(&nv30->query_heap);
+	nvws->notifier_free(&nv30->query);
+
+	nvws->notifier_free(&nv30->sync);
+
+	nvws->grobj_free(&nv30->rankine);
+
+	free(nv30);
+}
+
+static boolean
+nv30_init_hwctx(struct nv30_context *nv30, int rankine_class)
+{
+	struct nouveau_winsys *nvws = nv30->nvws;
+	int ret;
+	int i;
+
+	ret = nvws->grobj_alloc(nvws, rankine_class, &nv30->rankine);
+	if (ret) {
+		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
+		return FALSE;
+	}
+
+	BEGIN_RING(rankine, NV34TCL_DMA_NOTIFY, 1);
+	OUT_RING  (nv30->sync->handle);
+	BEGIN_RING(rankine, NV34TCL_DMA_TEXTURE0, 2);
+	OUT_RING  (nvws->channel->vram->handle);
+	OUT_RING  (nvws->channel->gart->handle);
+	BEGIN_RING(rankine, NV34TCL_DMA_COLOR1, 1);
+	OUT_RING  (nvws->channel->vram->handle);
+	BEGIN_RING(rankine, NV34TCL_DMA_COLOR0, 2);
+	OUT_RING  (nvws->channel->vram->handle);
+	OUT_RING  (nvws->channel->vram->handle);
+	BEGIN_RING(rankine, NV34TCL_DMA_VTXBUF0, 2);
+	OUT_RING  (nvws->channel->vram->handle);
+	OUT_RING  (nvws->channel->gart->handle);
+/*	BEGIN_RING(rankine, NV34TCL_DMA_FENCE, 2);
+	OUT_RING  (0);
+	OUT_RING  (nv30->query->handle);*/
+	BEGIN_RING(rankine, NV34TCL_DMA_IN_MEMORY7, 1);
+	OUT_RING  (nvws->channel->vram->handle);
+	BEGIN_RING(rankine, NV34TCL_DMA_IN_MEMORY8, 1);
+	OUT_RING  (nvws->channel->vram->handle);
+
+	for (i=1; i<8; i++) {
+		BEGIN_RING(rankine, NV34TCL_VIEWPORT_CLIP_HORIZ(i), 1);
+		OUT_RING  (0);
+		BEGIN_RING(rankine, NV34TCL_VIEWPORT_CLIP_VERT(i), 1);
+		OUT_RING  (0);
+	}
+
+	BEGIN_RING(rankine, 0x220, 1);
+	OUT_RING  (1);
+
+	BEGIN_RING(rankine, 0x03b0, 1);
+	OUT_RING  (0x00100000);
+	BEGIN_RING(rankine, 0x1454, 1);
+	OUT_RING  (0);
+	BEGIN_RING(rankine, 0x1d80, 1);
+	OUT_RING  (3);
+	BEGIN_RING(rankine, 0x1450, 1);
+	OUT_RING  (0x00030004);
+	
+	/* NEW */
+	BEGIN_RING(rankine, 0x1e98, 1);
+	OUT_RING  (0);
+	BEGIN_RING(rankine, 0x17e0, 3);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	OUT_RING  (0x3f800000);
+	BEGIN_RING(rankine, 0x1f80, 16);
+	OUT_RING  (0); OUT_RING  (0); OUT_RING  (0); OUT_RING  (0); 
+	OUT_RING  (0); OUT_RING  (0); OUT_RING  (0); OUT_RING  (0); 
+	OUT_RING  (0x0000ffff);
+	OUT_RING  (0); OUT_RING  (0); OUT_RING  (0); OUT_RING  (0); 
+	OUT_RING  (0); OUT_RING  (0); OUT_RING  (0); 
+
+	BEGIN_RING(rankine, 0x120, 3);
+	OUT_RING  (0);
+	OUT_RING  (1);
+	OUT_RING  (2);
+
+	BEGIN_RING(rankine, 0x1d88, 1);
+	OUT_RING  (0x00001200);
+
+	BEGIN_RING(rankine, NV34TCL_RC_ENABLE, 1);
+	OUT_RING  (0);
+
+	/* Attempt to setup a known state.. Probably missing a heap of
+	 * stuff here..
+	 */
+	BEGIN_RING(rankine, NV34TCL_STENCIL_FRONT_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(rankine, NV34TCL_STENCIL_BACK_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(rankine, NV34TCL_ALPHA_FUNC_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(rankine, NV34TCL_DEPTH_WRITE_ENABLE, 2);
+	OUT_RING  (0); /* wr disable */
+	OUT_RING  (0); /* test disable */
+	BEGIN_RING(rankine, NV34TCL_COLOR_MASK, 1);
+	OUT_RING  (0x01010101); /* TR,TR,TR,TR */
+	BEGIN_RING(rankine, NV34TCL_CULL_FACE_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(rankine, NV34TCL_BLEND_FUNC_ENABLE, 5);
+	OUT_RING  (0);				/* Blend enable */
+	OUT_RING  (0);				/* Blend src */
+	OUT_RING  (0);				/* Blend dst */
+	OUT_RING  (0x00000000);			/* Blend colour */
+	OUT_RING  (0x8006);			/* FUNC_ADD */
+	BEGIN_RING(rankine, NV34TCL_COLOR_LOGIC_OP_ENABLE, 2);
+	OUT_RING  (0);
+	OUT_RING  (0x1503 /*GL_COPY*/);
+	BEGIN_RING(rankine, NV34TCL_DITHER_ENABLE, 1);
+	OUT_RING  (1);
+	BEGIN_RING(rankine, NV34TCL_SHADE_MODEL, 1);
+	OUT_RING  (0x1d01 /*GL_SMOOTH*/);
+	BEGIN_RING(rankine, NV34TCL_POLYGON_OFFSET_FACTOR,2);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	BEGIN_RING(rankine, NV34TCL_POLYGON_MODE_FRONT, 2);
+	OUT_RING  (0x1b02 /*GL_FILL*/);
+	OUT_RING  (0x1b02 /*GL_FILL*/);
+	/* - Disable texture units
+	 * - Set fragprog to MOVR result.color, fragment.color */
+	for (i=0;i<16;i++) {
+		BEGIN_RING(rankine,
+				NV34TCL_TX_ENABLE(i), 1);
+		OUT_RING  (0);
+	}
+	/* Polygon stipple */
+	BEGIN_RING(rankine,
+			NV34TCL_POLYGON_STIPPLE_PATTERN(0), 0x20);
+	for (i=0;i<0x20;i++)
+		OUT_RING  (0xFFFFFFFF);
+	/* Default 4096x4096 render target, viewport, clip and scissor */
+	int w=4096;
+	int h=4096;
+	int pitch=4096*4;
+	BEGIN_RING(rankine, NV34TCL_RT_HORIZ, 5); /* RT regs at 0x200, not the viewport at 0xa00 */
+	OUT_RING  (w<<16);
+	OUT_RING  (h<<16);
+	OUT_RING  (0x148); /* format */
+	OUT_RING  (pitch << 16 | pitch);
+	OUT_RING  (0x0);
+	BEGIN_RING(rankine, 0x0a00, 2); /* same offset as NV34TCL_VIEWPORT_HORIZ */
+	OUT_RING  ((w<<16) | 0);
+	OUT_RING  ((h<<16) | 0);
+	BEGIN_RING(rankine, NV34TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	OUT_RING  ((w-1)<<16);
+	OUT_RING  ((h-1)<<16);
+	BEGIN_RING(rankine, NV34TCL_SCISSOR_HORIZ, 2);
+	OUT_RING  (w<<16);
+	OUT_RING  (h<<16);
+	BEGIN_RING(rankine, NV34TCL_VIEWPORT_HORIZ, 2);
+	OUT_RING  (w<<16);
+	OUT_RING  (h<<16);
+
+	BEGIN_RING(rankine, NV34TCL_VIEWPORT_TRANSLATE_X, 8);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (1.0);
+	OUT_RINGf (1.0);
+	OUT_RINGf (1.0);
+	OUT_RINGf (0.0);
+
+	BEGIN_RING(rankine, NV34TCL_MODELVIEW_MATRIX(0), 16);
+	OUT_RINGf (1.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (1.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (1.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (1.0);
+
+	BEGIN_RING(rankine, NV34TCL_PROJECTION_MATRIX(0), 16);
+	OUT_RINGf (1.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (1.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (1.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (1.0);
+
+	BEGIN_RING(rankine, NV34TCL_SCISSOR_HORIZ, 2);
+	OUT_RING  (4096<<16);
+	OUT_RING  (4096<<16);
+
+	BEGIN_RING(rankine, NV34TCL_MULTISAMPLE_CONTROL, 1);
+	OUT_RING  (0xffff0000);
+
+	FIRE_RING ();
+	return TRUE;
+}
+
+#define NV30TCL_CHIPSET_3X_MASK 0x00000003
+#define NV34TCL_CHIPSET_3X_MASK 0x00000010
+#define NV35TCL_CHIPSET_3X_MASK 0x000001e0
+
+struct pipe_context *
+nv30_create(struct pipe_winsys *pipe_winsys, struct nouveau_winsys *nvws,
+	    unsigned chipset)
+{
+	struct nv30_context *nv30;
+	int rankine_class = 0, ret;
+
+	if ((chipset & 0xf0) != 0x30) {
+		NOUVEAU_ERR("Not a NV3X chipset\n");
+		return NULL;
+	}
+
+	if (NV30TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f))) {
+		rankine_class = 0x0397;
+	} else if (NV34TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f))) {
+		rankine_class = 0x0697;
+	} else if (NV35TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f))) {
+		rankine_class = 0x0497;
+	} else {
+		NOUVEAU_ERR("Unknown NV3X chipset: NV%02x\n", chipset);
+		return NULL;
+	}
+
+	nv30 = CALLOC_STRUCT(nv30_context);
+	if (!nv30)
+		return NULL;
+	nv30->chipset = chipset;
+	nv30->nvws = nvws;
+
+	/* Notifier for sync purposes */
+	ret = nvws->notifier_alloc(nvws, 1, &nv30->sync);
+	if (ret) {
+		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
+		nv30_destroy(&nv30->pipe);
+		return NULL;
+	}
+
+	/* Query objects */
+	ret = nvws->notifier_alloc(nvws, 32, &nv30->query);
+	if (ret) {
+		NOUVEAU_ERR("Error initialising query objects: %d\n", ret);
+		nv30_destroy(&nv30->pipe);
+		return NULL;
+	}
+
+	ret = nvws->res_init(&nv30->query_heap, 0, 32);
+	if (ret) {
+		NOUVEAU_ERR("Error initialising query object heap: %d\n", ret);
+		nv30_destroy(&nv30->pipe);
+		return NULL;
+	}
+
+	/* Vtxprog resources */
+	if (nvws->res_init(&nv30->vertprog.exec_heap, 0, 512) ||
+	    nvws->res_init(&nv30->vertprog.data_heap, 0, 256)) {
+		nv30_destroy(&nv30->pipe);
+		return NULL;
+	}
+
+	/* Static rankine initialisation */
+	if (!nv30_init_hwctx(nv30, rankine_class)) {
+		nv30_destroy(&nv30->pipe);
+		return NULL;
+	}
+
+	/* Pipe context setup */
+	nv30->pipe.winsys = pipe_winsys;
+
+	nv30->pipe.destroy = nv30_destroy;
+	nv30->pipe.get_name = nv30_get_name;
+	nv30->pipe.get_vendor = nv30_get_vendor;
+	nv30->pipe.get_param = nv30_get_param;
+	nv30->pipe.get_paramf = nv30_get_paramf;
+
+	nv30->pipe.draw_arrays = nv30_draw_arrays;
+	nv30->pipe.draw_elements = nv30_draw_elements;
+	nv30->pipe.clear = nv30_clear;
+
+	nv30->pipe.flush = nv30_flush;
+
+	nv30_init_query_functions(nv30);
+	nv30_init_surface_functions(nv30);
+	nv30_init_state_functions(nv30);
+	nv30_init_miptree_functions(nv30);
+
+	nv30->draw = draw_create();
+	assert(nv30->draw);
+	draw_set_rasterize_stage(nv30->draw, nv30_draw_render_stage(nv30));
+
+	return &nv30->pipe;
+}
+	
diff --git a/src/mesa/pipe/nv30/nv30_context.h b/src/mesa/pipe/nv30/nv30_context.h
new file mode 100644
index 0000000000..d2262c5065
--- /dev/null
+++ b/src/mesa/pipe/nv30/nv30_context.h
@@ -0,0 +1,136 @@
+#ifndef __NV30_CONTEXT_H__
+#define __NV30_CONTEXT_H__
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "pipe/draw/draw_vertex.h"
+
+#include "pipe/nouveau/nouveau_winsys.h"
+#include "pipe/nouveau/nouveau_gldefs.h"
+
+#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
+	struct nv30_context *ctx = nv30
+#include "pipe/nouveau/nouveau_push.h"
+
+#include "nv30_state.h"
+/* Diagnostics on stderr; expansions carry no ';' so calls nest in if/else. */
+#define NOUVEAU_ERR(fmt, args...) \
+	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args)
+#define NOUVEAU_MSG(fmt, args...) \
+	fprintf(stderr, "nouveau: "fmt, ##args)
+
+#define NV30_NEW_VERTPROG	(1 << 1)
+#define NV30_NEW_FRAGPROG	(1 << 2)
+#define NV30_NEW_ARRAYS		(1 << 3)
+
+struct nv30_context {
+	struct pipe_context pipe;	/* kept first: nv30_context() casts a pipe_context * back to this struct */
+	struct nouveau_winsys *nvws;
+
+	struct draw_context *draw;
+
+	int chipset;
+	struct nouveau_grobj *rankine;	/* object named in BEGIN_RING(rankine, ...) */
+	struct nouveau_notifier *sync;	/* notifier for sync purposes */
+
+	/* query objects */
+	struct nouveau_notifier *query;
+	struct nouveau_resource *query_heap;
+
+	uint32_t dirty;	/* presumably a mask of NV30_NEW_* bits - confirm against users */
+
+	struct nv30_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS];
+	struct nv30_miptree *tex_miptree[PIPE_MAX_SAMPLERS];
+	unsigned dirty_samplers;
+	unsigned fp_samplers;
+	unsigned vp_samplers;
+
+	uint32_t rt_enable;
+	struct pipe_buffer *rt[4];
+	struct pipe_buffer *zeta;
+
+	struct {
+		struct pipe_buffer *buffer;
+		uint32_t format;
+	} tex[16];
+
+	unsigned vb_enable;
+	struct {
+		struct pipe_buffer *buffer;
+		unsigned delta;
+	} vb[16];
+
+	struct {
+		struct nouveau_resource *exec_heap;
+		struct nouveau_resource *data_heap;
+
+		struct nv30_vertex_program *active;
+
+		struct nv30_vertex_program *current;
+		struct pipe_buffer *constant_buf;
+	} vertprog;
+
+	struct {
+		struct nv30_fragment_program *active;	/* set by nv30_fragprog_bind() */
+
+		struct nv30_fragment_program *current;
+		struct pipe_buffer *constant_buf;	/* mapped by nv30_fragprog_bind() to refresh constants */
+	} fragprog;
+
+	struct pipe_vertex_buffer  vtxbuf[PIPE_ATTRIB_MAX];
+	struct pipe_vertex_element vtxelt[PIPE_ATTRIB_MAX];
+};
+
+static inline struct nv30_context *
+nv30_context(struct pipe_context *pipe)
+{	/* Downcast; relies on 'pipe' being the first member of struct nv30_context. */
+	return (struct nv30_context *)pipe;
+}
+
+extern void nv30_init_state_functions(struct nv30_context *nv30);
+extern void nv30_init_surface_functions(struct nv30_context *nv30);
+extern void nv30_init_miptree_functions(struct nv30_context *nv30);
+extern void nv30_init_query_functions(struct nv30_context *nv30);
+
+/* nv30_draw.c */
+extern struct draw_stage *nv30_draw_render_stage(struct nv30_context *nv30);
+
+/* nv30_vertprog.c */
+extern void nv30_vertprog_translate(struct nv30_context *,
+				    struct nv30_vertex_program *);
+extern void nv30_vertprog_bind(struct nv30_context *,
+			       struct nv30_vertex_program *);
+extern void nv30_vertprog_destroy(struct nv30_context *,
+				  struct nv30_vertex_program *);
+
+/* nv30_fragprog.c */
+extern void nv30_fragprog_translate(struct nv30_context *,
+				    struct nv30_fragment_program *);
+extern void nv30_fragprog_bind(struct nv30_context *,
+			       struct nv30_fragment_program *);
+extern void nv30_fragprog_destroy(struct nv30_context *,
+				  struct nv30_fragment_program *);
+
+/* nv30_fragtex.c */
+extern void nv30_fragtex_bind(struct nv30_context *);
+
+/* nv30_state.c and friends */
+extern void nv30_emit_hw_state(struct nv30_context *nv30);
+extern void nv30_state_tex_update(struct nv30_context *nv30);
+
+/* nv30_vbo.c */
+extern boolean nv30_draw_arrays(struct pipe_context *, unsigned mode,
+				unsigned start, unsigned count);
+extern boolean nv30_draw_elements(struct pipe_context *pipe,
+				  struct pipe_buffer *indexBuffer,
+				  unsigned indexSize,
+				  unsigned mode, unsigned start,
+				  unsigned count);
+
+/* nv30_clear.c */
+extern void nv30_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+		       unsigned clearValue);
+
+#endif
diff --git a/src/mesa/pipe/nv30/nv30_dma.h b/src/mesa/pipe/nv30/nv30_dma.h
new file mode 100644
index 0000000000..6eff6b4290
--- /dev/null
+++ b/src/mesa/pipe/nv30/nv30_dma.h
@@ -0,0 +1,66 @@
+#ifndef __NV30_DMA_H__
+#define __NV30_DMA_H__
+
+#include "pipe/nouveau/nouveau_winsys.h"
+
+#define OUT_RING(data) do { /* write one word at the cursor, then advance */   \
+	(*nv30->nvws->channel->pushbuf->cur++) = (data);                       \
+} while(0)
+
+#define OUT_RINGp(src,size) do { /* bulk copy of (size) * 4 bytes from src */  \
+	memcpy(nv30->nvws->channel->pushbuf->cur, (src), (size) * 4);          \
+	nv30->nvws->channel->pushbuf->cur += (size);                           \
+} while(0)
+
+#define OUT_RINGf(data) do { /* emit the raw bit pattern of a float */         \
+	union { float v; uint32_t u; } c;                                      \
+	c.v = (data);                                                          \
+	OUT_RING(c.u);                                                         \
+} while(0)
+
+#define BEGIN_RING(obj,mthd,size) do { /* header word; reserves size + 1 */    \
+	if (nv30->nvws->channel->pushbuf->remaining < ((size) + 1))            \
+		nv30->nvws->push_flush(nv30->nvws->channel, ((size) + 1));     \
+	OUT_RING((nv30->obj->subc << 13) | ((size) << 18) | (mthd));           \
+	nv30->nvws->channel->pushbuf->remaining -= ((size) + 1);               \
+} while(0)
+
+#define BEGIN_RING_NI(obj,mthd,size) do { /* BEGIN_RING with bit 30 set */     \
+	BEGIN_RING(obj, (mthd) | 0x40000000, (size));                          \
+} while(0)
+
+#define FIRE_RING() do { /* push_flush() on the channel, size argument 0 */    \
+	nv30->nvws->push_flush(nv30->nvws->channel, 0);                        \
+} while(0)
+
+#define OUT_RELOC(bo,data,flags,vor,tor) do { /* reloc at cursor, then a 0 */  \
+	nv30->nvws->push_reloc(nv30->nvws->channel,                            \
+			       nv30->nvws->channel->pushbuf->cur,              \
+			       (struct nouveau_bo *)(bo),                      \
+			       (data), (flags), (vor), (tor));                 \
+	OUT_RING(0);                                                           \
+} while(0)
+
+/* Raw data + flags depending on FB/TT buffer */
+#define OUT_RELOCd(bo,data,flags,vor,tor) do {                                 \
+	OUT_RELOC((bo), (data), (flags) | NOUVEAU_BO_OR, (vor), (tor));        \
+} while(0)
+
+/* FB/TT object handle */
+#define OUT_RELOCo(bo,flags) do {                                              \
+	OUT_RELOC((bo), 0, (flags) | NOUVEAU_BO_OR,                            \
+		  nv30->nvws->channel->vram->handle,                           \
+		  nv30->nvws->channel->gart->handle);                          \
+} while(0)
+
+/* Low 32-bits of offset */
+#define OUT_RELOCl(bo,delta,flags) do {                                        \
+	OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_LOW, 0, 0);              \
+} while(0)
+
+/* High 32-bits of offset */
+#define OUT_RELOCh(bo,delta,flags) do {                                        \
+	OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_HIGH, 0, 0);             \
+} while(0)
+
+#endif
diff --git a/src/mesa/pipe/nv30/nv30_draw.c b/src/mesa/pipe/nv30/nv30_draw.c
new file mode 100644
index 0000000000..bdeb975ca1
--- /dev/null
+++ b/src/mesa/pipe/nv30/nv30_draw.c
@@ -0,0 +1,62 @@
+#include "pipe/draw/draw_private.h"
+#include "pipe/p_util.h"
+
+#include "nv30_context.h"
+
+struct nv30_draw_stage {
+	struct draw_stage draw;		/* first member: &draw is handed out and later passed to free() */
+	struct nv30_context *nv30;	/* context this stage was created for */
+};
+
+static void
+nv30_draw_point(struct draw_stage *draw, struct prim_header *prim)
+{	/* Stub: draws nothing, only logs the call site via NOUVEAU_ERR. */
+	NOUVEAU_ERR("\n");
+}
+
+static void
+nv30_draw_line(struct draw_stage *draw, struct prim_header *prim)
+{	/* Stub: draws nothing, only logs the call site via NOUVEAU_ERR. */
+	NOUVEAU_ERR("\n");
+}
+
+static void
+nv30_draw_tri(struct draw_stage *draw, struct prim_header *prim)
+{	/* Stub: draws nothing, only logs the call site via NOUVEAU_ERR. */
+	NOUVEAU_ERR("\n");
+}
+
+static void
+nv30_draw_flush(struct draw_stage *draw, unsigned flags)
+{	/* No-op: both arguments are ignored. */
+}
+
+static void
+nv30_draw_reset_stipple_counter(struct draw_stage *draw)
+{	/* Stub: resets nothing, only logs the call site via NOUVEAU_ERR. */
+	NOUVEAU_ERR("\n");
+}
+
+static void
+nv30_draw_destroy(struct draw_stage *draw)
+{	/* 'draw' is the first member of nv30_draw_stage, so this frees the whole stage. */
+	free(draw);
+}
+
+struct draw_stage *
+nv30_draw_render_stage(struct nv30_context *nv30)
+{	/* Build the stub rasterize stage for nv30->draw; returns NULL if allocation fails. */
+	struct nv30_draw_stage *nv30draw = CALLOC_STRUCT(nv30_draw_stage);
+	if (!nv30draw)
+		return NULL;
+	nv30draw->nv30 = nv30;
+	nv30draw->draw.draw = nv30->draw;
+	nv30draw->draw.point = nv30_draw_point;
+	nv30draw->draw.line = nv30_draw_line;
+	nv30draw->draw.tri = nv30_draw_tri;
+	nv30draw->draw.flush = nv30_draw_flush;
+	nv30draw->draw.reset_stipple_counter = nv30_draw_reset_stipple_counter;
+	nv30draw->draw.destroy = nv30_draw_destroy;
+	return &nv30draw->draw;
+}
+
diff --git a/src/mesa/pipe/nv30/nv30_fragprog.c b/src/mesa/pipe/nv30/nv30_fragprog.c
new file mode 100644
index 0000000000..0233873d92
--- /dev/null
+++ b/src/mesa/pipe/nv30/nv30_fragprog.c
@@ -0,0 +1,834 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "pipe/tgsi/util/tgsi_parse.h"
+#include "pipe/tgsi/util/tgsi_util.h"
+
+#include "nv30_context.h"
+
+#define SWZ_X 0
+#define SWZ_Y 1
+#define SWZ_Z 2
+#define SWZ_W 3
+#define MASK_X 1
+#define MASK_Y 2
+#define MASK_Z 4
+#define MASK_W 8
+#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W)
+#define DEF_SCALE NV30_FP_OP_DST_SCALE_1X
+#define DEF_CTEST NV30_FP_OP_COND_TR
+#include "nv30_shader.h"
+
+#define swz(s,x,y,z,w) nv30_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w)
+#define neg(s) nv30_sr_neg((s))
+#define abs(s) nv30_sr_abs((s))
+#define scale(s,v) nv30_sr_scale((s), NV30_FP_OP_DST_SCALE_##v)
+
+#define MAX_CONSTS 128
+#define MAX_IMM 32
+struct nv30_fpc {
+	struct nv30_fragment_program *fp;	/* program being assembled */
+
+	uint attrib_map[PIPE_MAX_SHADER_INPUTS];	/* TGSI input index -> NV30_FP_OP_INPUT_SRC_* */
+
+	int high_temp;		/* highest hw temp index referenced so far (-1 = none) */
+	int temp_temp_count;	/* scratch temps handed out for the current instruction */
+	int num_regs;		/* 1 + highest temp index written, never below its initial 2 */
+
+	uint depth_id;		/* TGSI output index declared with POSITION semantic */
+	uint colour_id;		/* TGSI output index declared with COLOR semantic */
+
+	unsigned inst_offset;	/* word offset in fp->insn of the instruction being emitted */
+
+	struct {
+		int pipe;	/* constant-buffer index, or -1 for an inline immediate */
+		float vals[4];	/* immediate value; only filled when pipe == -1 */
+	} consts[MAX_CONSTS];
+	int nr_consts;
+
+	struct nv30_sreg imm[MAX_IMM];	/* CONST sregs for TGSI immediates, in parse order */
+	unsigned nr_imm;
+};
+
+static INLINE struct nv30_sreg
+temp(struct nv30_fpc *fpc)
+{
+	/* Hand out the next scratch temp: scratch registers are numbered
+	 * directly above the highest temp index seen so far. */
+	const int slot = fpc->temp_temp_count++;
+
+	return nv30_sr(NV30SR_TEMP, fpc->high_temp + 1 + slot);
+}
+
+static INLINE struct nv30_sreg
+constant(struct nv30_fpc *fpc, int pipe, float vals[4])
+{	/* pipe == -1: inline immediate (vals copied); otherwise a constant-buffer index */
+	const int slot = fpc->nr_consts;
+
+	if (slot == MAX_CONSTS)
+		assert(0);
+	fpc->nr_consts = slot + 1;
+
+	fpc->consts[slot].pipe = pipe;
+	if (pipe == -1)
+		memcpy(fpc->consts[slot].vals, vals, sizeof(float) * 4);
+	return nv30_sr(NV30SR_CONST, slot);
+}
+
+#define arith(cc,s,o,d,m,s0,s1,s2) /* cc = fpc, s = saturate flag, o = opcode suffix */ \
+	nv30_fp_arith((cc), (s), NV30_FP_OP_OPCODE_##o, \
+			(d), (m), (s0), (s1), (s2))
+#define tex(cc,s,o,u,d,m,s0,s1,s2) /* s1/s2 are ignored; needs a local named 'none' */ \
+	nv30_fp_tex((cc), (s), NV30_FP_OP_OPCODE_##o, (u), \
+		    (d), (m), (s0), none, none)
+
+static void
+grow_insns(struct nv30_fpc *fpc, int size)
+{	/* Extend fp->insn by 'size' 32-bit words; insn_len is counted in words. */
+	struct nv30_fragment_program *fp = fpc->fp;
+	/* NOTE(review): the realloc() result below is not checked; on failure NULL is stored and the old buffer is lost. */
+	fp->insn_len += size;
+	fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len);
+}
+
+static void
+emit_src(struct nv30_fpc *fpc, int pos, struct nv30_sreg src)
+{	/* Encode source operand 'pos' (0..2) into word pos + 1 of the current instruction. */
+	struct nv30_fragment_program *fp = fpc->fp;
+	uint32_t *hw = &fp->insn[fpc->inst_offset];
+	uint32_t sr = 0;
+
+	switch (src.type) {
+	case NV30SR_INPUT:
+		sr |= (NV30_FP_REG_TYPE_INPUT << NV30_FP_REG_TYPE_SHIFT);
+		hw[0] |= (src.index << NV30_FP_OP_INPUT_SRC_SHIFT);
+		break;
+	case NV30SR_OUTPUT:
+		sr |= NV30_FP_REG_SRC_HALF;
+		/* fall-through */
+	case NV30SR_TEMP:
+		sr |= (NV30_FP_REG_TYPE_TEMP << NV30_FP_REG_TYPE_SHIFT);
+		sr |= (src.index << NV30_FP_REG_SRC_SHIFT);
+		break;
+	case NV30SR_CONST:
+		grow_insns(fpc, 4);	/* the constant lives in 4 extra words after the instruction */
+		hw = &fp->insn[fpc->inst_offset];	/* re-derive: the buffer may have moved */
+		if (fpc->consts[src.index].pipe >= 0) {
+			struct nv30_fragment_program_data *fpd;
+			/* remember where nv30_fragprog_bind() must patch in the buffer value */
+			fp->consts = realloc(fp->consts, ++fp->nr_consts *
+					     sizeof(*fpd));
+			fpd = &fp->consts[fp->nr_consts - 1];
+			fpd->offset = fpc->inst_offset + 4;
+			fpd->index = fpc->consts[src.index].pipe;
+			memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4);
+		} else {
+			memcpy(&fp->insn[fpc->inst_offset + 4],
+				fpc->consts[src.index].vals,
+				sizeof(uint32_t) * 4);
+		}
+
+		sr |= (NV30_FP_REG_TYPE_CONST << NV30_FP_REG_TYPE_SHIFT);
+		break;
+	case NV30SR_NONE:
+		sr |= (NV30_FP_REG_TYPE_INPUT << NV30_FP_REG_TYPE_SHIFT);
+		break;
+	default:
+		assert(0);
+	}
+
+	if (src.negate)
+		sr |= NV30_FP_REG_NEGATE;
+
+	if (src.abs)
+		hw[1] |= (1u << (29 + pos));	/* unsigned: pos == 2 reaches bit 31 */
+
+	sr |= ((src.swz[0] << NV30_FP_REG_SWZ_X_SHIFT) |
+	       (src.swz[1] << NV30_FP_REG_SWZ_Y_SHIFT) |
+	       (src.swz[2] << NV30_FP_REG_SWZ_Z_SHIFT) |
+	       (src.swz[3] << NV30_FP_REG_SWZ_W_SHIFT));
+
+	hw[pos + 1] |= sr;
+}
+
+static void
+emit_dst(struct nv30_fpc *fpc, struct nv30_sreg dst)
+{
+	/* Encode the destination register into word 0 of the instruction
+	 * currently being assembled at fpc->inst_offset.
+	 */
+	struct nv30_fragment_program *prog = fpc->fp;
+	uint32_t *word0 = &prog->insn[fpc->inst_offset];
+
+	if (dst.type == NV30SR_TEMP) {
+		/* keep track of how many temp registers the program needs */
+		if (fpc->num_regs < (dst.index + 1))
+			fpc->num_regs = dst.index + 1;
+	} else if (dst.type == NV30SR_OUTPUT) {
+		/* tgsi_dst() uses index 0 for colour and 1 for any other output */
+		if (dst.index == 1)
+			prog->fp_control |= 0xe;
+		else
+			*word0 |= NV30_FP_OP_OUT_REG_HALF;
+	} else if (dst.type == NV30SR_NONE) {
+		*word0 |= (1 << 30);
+	} else {
+		assert(0);
+	}
+
+	*word0 |= (dst.index << NV30_FP_OP_OUT_REG_SHIFT);
+}
+
+static void
+nv30_fp_arith(struct nv30_fpc *fpc, int sat, int op,
+	      struct nv30_sreg dst, int mask,
+	      struct nv30_sreg s0, struct nv30_sreg s1, struct nv30_sreg s2)
+{	/* Append one 4-word instruction to fp->insn and encode opcode, dst and three sources. */
+	struct nv30_fragment_program *fp = fpc->fp;
+	uint32_t *hw;
+
+	fpc->inst_offset = fp->insn_len;	/* the new instruction starts at the current end */
+	grow_insns(fpc, 4);
+	hw = &fp->insn[fpc->inst_offset];
+	memset(hw, 0, sizeof(uint32_t) * 4);
+
+	if (op == NV30_FP_OP_OPCODE_KIL)
+		fp->fp_control |= NV34TCL_FP_CONTROL_USES_KIL;
+	hw[0] |= (op << NV30_FP_OP_OPCODE_SHIFT);
+	hw[0] |= (mask << NV30_FP_OP_OUTMASK_SHIFT);
+	hw[2] |= (dst.dst_scale << NV30_FP_OP_DST_SCALE_SHIFT);
+
+	if (sat)
+		hw[0] |= NV30_FP_OP_OUT_SAT;
+
+	if (dst.cc_update)
+		hw[0] |= NV30_FP_OP_COND_WRITE_ENABLE;
+	hw[1] |= (dst.cc_test << NV30_FP_OP_COND_SHIFT);
+	hw[1] |= ((dst.cc_swz[0] << NV30_FP_OP_COND_SWZ_X_SHIFT) |
+		  (dst.cc_swz[1] << NV30_FP_OP_COND_SWZ_Y_SHIFT) |
+		  (dst.cc_swz[2] << NV30_FP_OP_COND_SWZ_Z_SHIFT) |
+		  (dst.cc_swz[3] << NV30_FP_OP_COND_SWZ_W_SHIFT));
+	/* emit_src() can realloc fp->insn for constants, so 'hw' is not used past this point */
+	emit_dst(fpc, dst);
+	emit_src(fpc, 0, s0);
+	emit_src(fpc, 1, s1);
+	emit_src(fpc, 2, s2);
+}
+
+static void
+nv30_fp_tex(struct nv30_fpc *fpc, int sat, int op, int unit,
+	    struct nv30_sreg dst, int mask,
+	    struct nv30_sreg s0, struct nv30_sreg s1, struct nv30_sreg s2)
+{
+	/* A texture op is an arithmetic op plus the sampler unit in word 0;
+	 * the unit is also recorded in the program's sampler bitmask.
+	 */
+	nv30_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2);
+	fpc->fp->insn[fpc->inst_offset] |= (unit << NV30_FP_OP_TEX_UNIT_SHIFT);
+	fpc->fp->samplers |= (1 << unit);
+}
+
+static INLINE struct nv30_sreg
+tgsi_src(struct nv30_fpc *fpc, const struct tgsi_full_src_register *fsrc)
+{	/* Translate a TGSI source operand; unknown register files yield an NV30SR_NONE operand. */
+	struct nv30_sreg src;
+
+	switch (fsrc->SrcRegister.File) {
+	case TGSI_FILE_INPUT:
+		src = nv30_sr(NV30SR_INPUT,
+			      fpc->attrib_map[fsrc->SrcRegister.Index]);
+		break;
+	case TGSI_FILE_CONSTANT:
+		src = constant(fpc, fsrc->SrcRegister.Index, NULL);
+		break;
+	case TGSI_FILE_IMMEDIATE:
+		assert(fsrc->SrcRegister.Index < fpc->nr_imm);
+		src = fpc->imm[fsrc->SrcRegister.Index];
+		break;
+	case TGSI_FILE_TEMPORARY:
+		src = nv30_sr(NV30SR_TEMP, fsrc->SrcRegister.Index + 1);
+		if (fpc->high_temp < src.index)
+			fpc->high_temp = src.index;
+		break;
+	/* This is clearly insane, but gallium hands us shaders like this.
+	 * Luckily fragprog results are just temp regs..
+	 */
+	case TGSI_FILE_OUTPUT:
+		if (fsrc->SrcRegister.Index == fpc->colour_id)
+			return nv30_sr(NV30SR_OUTPUT, 0);
+		else
+			return nv30_sr(NV30SR_OUTPUT, 1);
+		break;
+	default:
+		NOUVEAU_ERR("bad src file\n");
+		return nv30_sr(NV30SR_NONE, 0);	/* do not touch the uninitialised 'src' */
+	}
+
+	src.abs = fsrc->SrcRegisterExtMod.Absolute;
+	src.negate = fsrc->SrcRegister.Negate;
+	src.swz[0] = fsrc->SrcRegister.SwizzleX;
+	src.swz[1] = fsrc->SrcRegister.SwizzleY;
+	src.swz[2] = fsrc->SrcRegister.SwizzleZ;
+	src.swz[3] = fsrc->SrcRegister.SwizzleW;
+	return src;
+}
+
+static INLINE struct nv30_sreg
+tgsi_dst(struct nv30_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
+	/* Map a TGSI destination onto a hardware register description:
+	 * outputs become OUTPUT 0 (colour) or 1 (anything else), temporaries
+	 * are shifted up by one, and NULL or unknown files yield NONE.
+	 */
+	if (fdst->DstRegister.File == TGSI_FILE_OUTPUT) {
+		const int is_colour =
+			(fdst->DstRegister.Index == fpc->colour_id);
+		return nv30_sr(NV30SR_OUTPUT, is_colour ? 0 : 1);
+	}
+	if (fdst->DstRegister.File == TGSI_FILE_TEMPORARY) {
+		const int hw_index = fdst->DstRegister.Index + 1;
+		if (fpc->high_temp < hw_index)
+			fpc->high_temp = hw_index;
+		return nv30_sr(NV30SR_TEMP, hw_index);
+	}
+
+	if (fdst->DstRegister.File != TGSI_FILE_NULL) {
+		NOUVEAU_ERR("bad dst file %d\n", fdst->DstRegister.File);
+	}
+	return nv30_sr(NV30SR_NONE, 0);
+}
+
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	/* Translate each TGSI writemask bit into the matching hardware
+	 * MASK_* bit; bits outside X/Y/Z/W are ignored.
+	 */
+	return ((tgsi & TGSI_WRITEMASK_X) ? MASK_X : 0) |
+	       ((tgsi & TGSI_WRITEMASK_Y) ? MASK_Y : 0) |
+	       ((tgsi & TGSI_WRITEMASK_Z) ? MASK_Z : 0) |
+	       ((tgsi & TGSI_WRITEMASK_W) ? MASK_W : 0);
+}
+
+static boolean
+src_native_swz(struct nv30_fpc *fpc, const struct tgsi_full_src_register *fsrc,
+	       struct nv30_sreg *src)
+{	/* TRUE: operand needs no fix-up and *src is left untouched. FALSE: *src is a scratch temp holding it. */
+	const struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0);
+	struct nv30_sreg tgsi = tgsi_src(fpc, fsrc);
+	uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0;
+	uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX,
+			fsrc->SrcRegisterExtSwz.NegateY,
+			fsrc->SrcRegisterExtSwz.NegateZ,
+			fsrc->SrcRegisterExtSwz.NegateW };
+	uint c;
+	/* classify each component: plain X/Y/Z/W selection, constant 0, or constant 1 */
+	for (c = 0; c < 4; c++) {
+		switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) {
+		case TGSI_EXTSWIZZLE_X:
+		case TGSI_EXTSWIZZLE_Y:
+		case TGSI_EXTSWIZZLE_Z:
+		case TGSI_EXTSWIZZLE_W:
+			mask |= (1 << c);
+			break;
+		case TGSI_EXTSWIZZLE_ZERO:
+			zero_mask |= (1 << c);
+			tgsi.swz[c] = SWZ_X;
+			break;
+		case TGSI_EXTSWIZZLE_ONE:
+			one_mask |= (1 << c);
+			tgsi.swz[c] = SWZ_X;
+			break;
+		default:
+			assert(0);
+		}
+		/* per-component negates only count when the whole-register negate is off */
+		if (!tgsi.negate && neg[c])
+			neg_mask |= (1 << c);
+	}
+
+	if (mask == MASK_ALL && !neg_mask)
+		return TRUE;
+	/* otherwise assemble the operand piecewise in a scratch temp */
+	*src = temp(fpc);
+
+	if (mask)
+		arith(fpc, 0, MOV, *src, mask, tgsi, none, none);
+	/* SFL / STR presumably write 0.0 / 1.0 to the masked components - confirm against hw docs */
+	if (zero_mask)
+		arith(fpc, 0, SFL, *src, zero_mask, *src, none, none);
+
+	if (one_mask)
+		arith(fpc, 0, STR, *src, one_mask, *src, none, none);
+
+	if (neg_mask) {
+		struct nv30_sreg one = temp(fpc);
+		arith(fpc, 0, STR, one, neg_mask, one, none, none);
+		arith(fpc, 0, MUL, *src, neg_mask, *src, neg(one), none);
+	}
+
+	return FALSE;
+}
+
+static boolean
+nv30_fragprog_parse_instruction(struct nv30_fpc *fpc,
+				const struct tgsi_full_instruction *finst)
+{	/* Emit hw instruction(s) for one TGSI instruction; FALSE on an unsupported opcode or operand file. */
+	const struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0);
+	struct nv30_sreg src[3], dst, tmp;
+	int mask, sat, unit = 0;	/* unit stays 0 if no SAMPLER operand is present */
+	int ai = -1, ci = -1;
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
+		return TRUE;
+	/* scratch temps are allocated afresh for every instruction */
+	fpc->temp_temp_count = 0;
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+			src[i] = tgsi_src(fpc, fsrc);
+		}
+	}
+	/* one distinct input and one distinct constant are used directly; extras are MOVed to a temp */
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+		case TGSI_FILE_CONSTANT:
+		case TGSI_FILE_TEMPORARY:
+			if (!src_native_swz(fpc, fsrc, &src[i]))
+				continue;
+			break;
+		default:
+			break;
+		}
+
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+			if (ai == -1 || ai == fsrc->SrcRegister.Index) {
+				ai = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				NOUVEAU_MSG("extra src attr %d\n",
+					 fsrc->SrcRegister.Index);
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_CONSTANT:
+		case TGSI_FILE_IMMEDIATE:
+			if (ci == -1 || ci == fsrc->SrcRegister.Index) {
+				ci = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+			/* handled above */
+			break;
+		case TGSI_FILE_SAMPLER:
+			unit = fsrc->SrcRegister.Index;
+			break;
+		case TGSI_FILE_OUTPUT:
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(fpc, &finst->FullDstRegisters[0]);
+	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
+	sat  = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
+
+	switch (finst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_CMP:
+		tmp = temp(fpc);
+		arith(fpc, sat, MOV, dst, mask, src[2], none, none);
+		tmp.cc_update = 1;
+		arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
+		dst.cc_test = NV30_FP_OP_COND_LT;	/* fragment-program constant, as in KILP */
+		arith(fpc, sat, MOV, dst, mask, src[1], none, none);
+		break;
+	case TGSI_OPCODE_COS:
+		arith(fpc, sat, COS, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(fpc, sat, DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(fpc, sat, DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		tmp = temp(fpc);
+		arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[1], none);
+		arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X),
+		      swz(src[1], W, W, W, W), none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(fpc, sat, DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(fpc, sat, EX2, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(fpc, sat, FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(fpc, sat, FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_KIL:
+		arith(fpc, 0, KIL, none, 0, none, none, none);
+		break;
+	case TGSI_OPCODE_KILP:
+		dst = nv30_sr(NV30SR_NONE, 0);
+		dst.cc_update = 1;
+		arith(fpc, 0, MOV, dst, MASK_ALL, src[0], none, none);
+		dst.cc_update = 0; dst.cc_test = NV30_FP_OP_COND_LT;
+		arith(fpc, 0, KIL, dst, 0, none, none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(fpc, sat, LG2, dst, mask, src[0], none, none);
+		break;
+//	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_LRP:
+		tmp = temp(fpc);
+		arith(fpc, 0, MAD, tmp, mask, neg(src[0]), src[2], src[2]);
+		arith(fpc, sat, MAD, dst, mask, src[0], src[1], tmp);
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(fpc, sat, MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(fpc, sat, MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(fpc, sat, MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(fpc, sat, MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_POW:
+		tmp = temp(fpc);
+		arith(fpc, 0, LG2, tmp, MASK_X,
+		      swz(src[0], X, X, X, X), none, none);
+		arith(fpc, 0, MUL, tmp, MASK_X, swz(tmp, X, X, X, X),
+		      swz(src[1], X, X, X, X), none);
+		arith(fpc, sat, EX2, dst, mask,
+		      swz(tmp, X, X, X, X), none, none);
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(fpc, sat, RCP, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_RET:
+		assert(0);
+		break;
+	case TGSI_OPCODE_RFL:
+		tmp = temp(fpc);
+		arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[0], none);
+		arith(fpc, 0, DP3, tmp, MASK_Y, src[0], src[1], none);
+		arith(fpc, 0, DIV, scale(tmp, 2X), MASK_Z,
+		      swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none);
+		arith(fpc, sat, MAD, dst, mask,
+		      swz(tmp, Z, Z, Z, Z), src[0], neg(src[1]));
+		break;
+	case TGSI_OPCODE_RSQ:
+		tmp = temp(fpc);
+		arith(fpc, 0, LG2, scale(tmp, INV_2X), MASK_X,
+		      abs(swz(src[0], X, X, X, X)), none, none);
+		arith(fpc, sat, EX2, dst, mask,
+		      neg(swz(tmp, X, X, X, X)), none, none);
+		break;
+	case TGSI_OPCODE_SCS:
+		if (mask & MASK_X) {
+			arith(fpc, sat, COS, dst, MASK_X,
+			      swz(src[0], X, X, X, X), none, none);
+		}
+		if (mask & MASK_Y) {
+			arith(fpc, sat, SIN, dst, MASK_Y,
+			      swz(src[0], X, X, X, X), none, none);
+		}
+		break;
+	case TGSI_OPCODE_SIN:
+		arith(fpc, sat, SIN, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(fpc, sat, SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(fpc, sat, SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB:
+		arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none);
+		break;
+	case TGSI_OPCODE_TEX:
+		if (finst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide ==
+				TGSI_EXTSWIZZLE_W) {
+			tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none);
+		} else
+			tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_TXB:
+		tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_XPD:
+		tmp = temp(fpc);
+		arith(fpc, 0, MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(fpc, sat, MAD, dst, (mask & ~MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+static boolean
+nv30_fragprog_parse_decl_attrib(struct nv30_fpc *fpc,
+				const struct tgsi_full_declaration *fdec)
+{	/* Record the hw input source for a declared fragment input; FALSE if the semantic is unsupported. */
+	int hw;
+
+	switch (fdec->Semantic.SemanticName) {
+	case TGSI_SEMANTIC_POSITION:
+		hw = NV30_FP_OP_INPUT_SRC_POSITION;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		if (fdec->Semantic.SemanticIndex == 0) {
+			hw = NV30_FP_OP_INPUT_SRC_COL0;
+		} else
+		if (fdec->Semantic.SemanticIndex == 1) {
+			hw = NV30_FP_OP_INPUT_SRC_COL1;
+		} else {
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_FOG:
+		hw = NV30_FP_OP_INPUT_SRC_FOGC;
+		break;
+	case TGSI_SEMANTIC_GENERIC:
+		if (fdec->Semantic.SemanticIndex <= 7) {
+			hw = NV30_FP_OP_INPUT_SRC_TC(fdec->Semantic.
+						     SemanticIndex);
+		} else {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		break;
+	default:
+		NOUVEAU_ERR("bad input semantic\n");
+		return FALSE;
+	}
+	/* NOTE(review): only the range's First index is recorded, and it is not checked against the array size */
+	fpc->attrib_map[fdec->u.DeclarationRange.First] = hw;
+	return TRUE;
+}
+
+static boolean
+nv30_fragprog_parse_decl_output(struct nv30_fpc *fpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	/* Note which TGSI output register is depth (POSITION semantic) and
+	 * which is colour (COLOR semantic); every other semantic is
+	 * rejected with FALSE.
+	 */
+	const uint sem = fdec->Semantic.SemanticName;
+	const uint reg = fdec->u.DeclarationRange.First;
+
+	if (sem != TGSI_SEMANTIC_POSITION && sem != TGSI_SEMANTIC_COLOR) {
+		NOUVEAU_ERR("bad output semantic\n");
+		return FALSE;
+	}
+	*(sem == TGSI_SEMANTIC_POSITION ? &fpc->depth_id : &fpc->colour_id) = reg;
+	return TRUE;
+}
+
+void
+nv30_fragprog_translate(struct nv30_context *nv30,
+			struct nv30_fragment_program *fp)
+{	/* Translate fp->pipe->tokens into fp->insn; fp->translated is set only on success. */
+	struct tgsi_parse_context parse;
+	struct nv30_fpc *fpc = NULL;
+
+	fpc = calloc(1, sizeof(struct nv30_fpc));
+	if (!fpc)
+		return;
+	fpc->fp = fp;
+	fpc->high_temp = -1;
+	fpc->num_regs = 2;
+
+	tgsi_parse_init(&parse, fp->pipe->tokens);
+
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		switch (parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+			fdec = &parse.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_INPUT:
+				if (!nv30_fragprog_parse_decl_attrib(fpc, fdec))
+					goto out_err;
+				break;
+			case TGSI_FILE_OUTPUT:
+				if (!nv30_fragprog_parse_decl_output(fpc, fdec))
+					goto out_err;
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			struct tgsi_full_immediate *imm;
+			float vals[4] = { 0.0f, 0.0f, 0.0f, 0.0f };	/* constant() copies all four */
+			int i;
+
+			imm = &parse.FullToken.FullImmediate;
+			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+			assert(fpc->nr_imm < MAX_IMM);
+
+			for (i = 0; i < 4 && i < imm->Immediate.Size; i++)	/* never write past vals[3] */
+				vals[i] = imm->u.ImmediateFloat32[i].Float;
+			fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals);
+		}
+			break;
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+
+			finst = &parse.FullToken.FullInstruction;
+			if (!nv30_fragprog_parse_instruction(fpc, finst))
+				goto out_err;
+		}
+			break;
+		default:
+			break;
+		}
+	}
+
+	fp->fp_control |= (fpc->num_regs-1)/2;
+	fp->fp_reg_control = (1<<16)|0x4;
+
+	/* Terminate final instruction */
+	if (fp->insn_len)	/* a program that emitted nothing has no buffer yet */
+		fp->insn[fpc->inst_offset] |= 0x00000001;
+	/* Append NOP + END instruction, may or may not be necessary. */
+	fpc->inst_offset = fp->insn_len;
+	grow_insns(fpc, 4);
+	fp->insn[fpc->inst_offset + 0] = 0x00000001;
+	fp->insn[fpc->inst_offset + 1] = 0x00000000;
+	fp->insn[fpc->inst_offset + 2] = 0x00000000;
+	fp->insn[fpc->inst_offset + 3] = 0x00000000;
+
+	fp->translated = TRUE;
+	fp->on_hw = FALSE;
+out_err:
+	tgsi_parse_free(&parse);
+	free(fpc);
+}
+
+void
+nv30_fragprog_bind(struct nv30_context *nv30, struct nv30_fragment_program *fp)
+{	/* Make 'fp' the active fragment program: translate if needed, refresh constants, upload, emit control words. */
+	struct pipe_winsys *ws = nv30->pipe.winsys;
+	int i;
+	/* translation is done lazily on first bind */
+	if (!fp->translated) {
+		nv30_fragprog_translate(nv30, fp);
+		if (!fp->translated)
+			assert(0);
+	}
+	/* constants live inside the instruction stream; any changed value forces a re-upload */
+	if (fp->nr_consts) {
+		float *map = ws->buffer_map(ws, nv30->fragprog.constant_buf,
+					    PIPE_BUFFER_USAGE_CPU_READ);
+		for (i = 0; i < fp->nr_consts; i++) {
+			struct nv30_fragment_program_data *fpd = &fp->consts[i];
+			uint32_t *p = &fp->insn[fpd->offset];
+			uint32_t *cb = (uint32_t *)&map[fpd->index * 4];
+
+			if (!memcmp(p, cb, 4 * sizeof(float)))
+				continue;
+			memcpy(p, cb, 4 * sizeof(float));
+			fp->on_hw = 0;
+		}
+		ws->buffer_unmap(ws, nv30->fragprog.constant_buf);
+	}
+	/* NOTE(review): the buffer_create()/buffer_map() results below are used unchecked */
+	if (!fp->on_hw) {
+		const uint32_t le = 1;	/* first byte is 1 only on a little-endian host */
+		uint32_t *map;
+
+		if (!fp->buffer)
+			fp->buffer = ws->buffer_create(ws, 0x100, 0,
+						       fp->insn_len * 4);
+		map = ws->buffer_map(ws, fp->buffer,
+				     PIPE_BUFFER_USAGE_CPU_WRITE);
+
+#if 0
+		for (i = 0; i < fp->insn_len; i++) {
+			NOUVEAU_ERR("%d 0x%08x\n", i, fp->insn[i]);
+		}
+#endif
+
+		if ((*(const uint8_t *)&le)) {
+			for (i = 0; i < fp->insn_len; i++) {
+				map[i] = fp->insn[i];
+			}
+		} else {
+			/* Weird swapping for big-endian chips */
+			for (i = 0; i < fp->insn_len; i++) {
+				map[i] = ((fp->insn[i] & 0xffff) << 16) |
+					  ((fp->insn[i] >> 16) & 0xffff);
+			}
+		}
+
+		ws->buffer_unmap(ws, fp->buffer);
+		fp->on_hw = TRUE;
+	}
+	/* emit the two control words computed at translation time */
+	BEGIN_RING(rankine, NV34TCL_FP_CONTROL, 1);
+	OUT_RING  (fp->fp_control);
+	BEGIN_RING(rankine, NV34TCL_FP_REG_CONTROL, 1);
+	OUT_RING  (fp->fp_reg_control);
+
+	nv30->fragprog.active = fp;
+}
+
+void
+nv30_fragprog_destroy(struct nv30_context *nv30,
+		      struct nv30_fragment_program *fp)
+{	/* Free the CPU-side arrays built during translation; fp->buffer is not released here. */
+	if (fp->insn_len) free(fp->insn);
+	if (fp->nr_consts) free(fp->consts);	/* patch table from emit_src(), previously leaked */
+}
+
diff --git a/src/mesa/pipe/nv30/nv30_fragtex.c b/src/mesa/pipe/nv30/nv30_fragtex.c
new file mode 100644
index 0000000000..e75b1f7f28
--- /dev/null
+++ b/src/mesa/pipe/nv30/nv30_fragtex.c
@@ -0,0 +1,160 @@
+#include "nv30_context.h"
+
+/*
+ * Integer log2: returns the index of the most significant set bit of i,
+ * i.e. floor(log2(i)) for positive i, and 0 when i is 0.
+ *
+ * The search is a binary one: each round tests whether anything is set in
+ * the upper half of the remaining window (16, 8, 4, 2, then 1 bits wide)
+ * and, if so, discards the lower half and credits its width to the result.
+ */
+static inline int log2i(int i)
+{
+	int result = 0;
+	int width;
+
+	for (width = 16; width >= 1; width >>= 1) {
+		/* 0xffff0000, 0x0000ff00, 0x000000f0, 0x0000000c, 0x00000002 */
+		const unsigned upper = ((1u << width) - 1u) << width;
+
+		if (i & upper) {
+			i >>= width;
+			result += width;
+		}
+	}
+
+	return result;
+}
+
+/* One row of the texture format table. */
+struct nv30_texture_format {
+	boolean defined;	/* TRUE for real rows, FALSE for the terminator */
+	uint	pipe;		/* PIPE_FORMAT_* value the row matches */
+	int     format;		/* NV34TCL_TX_FORMAT_FORMAT_* value */
+	int     swizzle;	/* OR of the NV34TCL_TX_SWIZZLE_S0/S1 selectors */
+};
+
+/* Row helper: m/tf are the PIPE_FORMAT_ and NV34TCL_TX_FORMAT_FORMAT_ suffixes,
+ * ts0x..ts0w the S0 selectors and ts1x..ts1w the S1 selectors per component. */
+#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w)                        \
+{ TRUE, PIPE_FORMAT_##m, NV34TCL_TX_FORMAT_FORMAT_##tf,                        \
+  (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y |           \
+   NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w |           \
+   NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y |           \
+   NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w),           \
+}
+
+/* Known formats; the scan in nv30_fragtex_format() stops at defined == FALSE. */
+static struct nv30_texture_format nv30_texture_formats[] = {
+	_(A8R8G8B8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(A1R5G5B5_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(A4R4G4B4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W),
+//	_(R5G6B5_UNORM  , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W),
+	_(U_L8          , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X),
+	_(U_A8          , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X),
+	_(U_I8          , L8      ,   S1,   S1,   S1,   S1, X, X, X, X),
+	_(U_A8_L8       , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y),
+//	_(Z16_UNORM     , Z16     ,   S1,   S1,   S1,  ONE, X, X, X, X),
+//	_(Z24S8_UNORM   , Z24     ,   S1,   S1,   S1,  ONE, X, X, X, X),
+//	_(RGB_DXT1      , 0x86,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0x00, 0x00),
+//	_(RGBA_DXT1     , 0x86,   S1,   S1,   S1,   S1, X, Y, Z, W, 0x00, 0x00),
+//	_(RGBA_DXT3     , 0x87,   S1,   S1,   S1,   S1, X, Y, Z, W, 0x00, 0x00),
+//	_(RGBA_DXT5     , 0x88,   S1,   S1,   S1,   S1, X, Y, Z, W, 0x00, 0x00),
+	{ FALSE }, /* terminator; "{}" is not valid ISO C before C23 */
+};
+
+/* Returns the nv30_texture_formats row whose pipe format equals pipe_format,
+ * or NULL when the terminating entry is reached without a match. */
+static struct nv30_texture_format *
+nv30_fragtex_format(uint pipe_format)
+{
+	struct nv30_texture_format *entry;
+
+	for (entry = nv30_texture_formats; entry->defined; entry++) {
+		if (entry->pipe == pipe_format)
+			return entry;
+	}
+	return NULL;
+}
+
+
+/* Emit the texture state for sampler `unit` from its bound miptree and sampler
+ * state object; nothing is emitted for an unsupported format or target. */
+static void
+nv30_fragtex_build(struct nv30_context *nv30, int unit)
+{
+	struct nv30_sampler_state *ps = nv30->tex_sampler[unit];
+	struct nv30_miptree *nv30mt = nv30->tex_miptree[unit];
+	struct pipe_texture *pt = &nv30mt->base;
+	struct nv30_texture_format *tf;
+	uint32_t txf, txs;
+
+	tf = nv30_fragtex_format(pt->format);
+	if (!tf || !tf->defined) {
+		NOUVEAU_ERR("Unsupported texture format: 0x%x\n", pt->format);
+		return;
+	}
+	txs = tf->swizzle;
+
+	txf  = tf->format << 8;					/* hw format */
+	txf |= (pt->last_level - pt->first_level + 1) << 16;	/* mip levels */
+	txf |= log2i(pt->width[0]) << 20;			/* log2 sizes */
+	txf |= log2i(pt->height[0]) << 24;
+	txf |= log2i(pt->depth[0]) << 28;
+	txf |= 8;
+
+	switch (pt->target) {
+/*	case PIPE_TEXTURE_CUBE:
+		txf |= NV34TCL_TEX_FORMAT_CUBIC;*/
+		/* fall-through */
+	case PIPE_TEXTURE_2D:
+		txf |= (2<<4);
+		break;
+	case PIPE_TEXTURE_3D:
+		txf |= (3<<4);
+		break;
+	case PIPE_TEXTURE_1D:
+		txf |= (1<<4);
+		break;
+	default:
+		NOUVEAU_ERR("Unknown target %d\n", pt->target);
+		return;
+	}
+
+	BEGIN_RING(rankine, NV34TCL_TX_OFFSET(unit), 8);
+	OUT_RELOCl(nv30mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RELOCd(nv30mt->buffer,txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+	OUT_RING  (ps->wrap);
+	OUT_RING  (0x40000000); /* enable */
+	OUT_RING  (txs);
+	OUT_RING  (ps->filt | 0x2000 /* magic */);
+	OUT_RING  ((pt->width[0] << 16) | pt->height[0]);
+	OUT_RING  (ps->bcol);
+}
+
+/* Disable the texture units the active fragment program no longer samples and
+ * rebuild those it does sample that are set in nv30->dirty_samplers. */
+void
+nv30_fragtex_bind(struct nv30_context *nv30)
+{
+	struct nv30_fragment_program *fp = nv30->fragprog.active;
+	unsigned pending, unit;
+
+	/* Units enabled for the previous program but unused by this one. */
+	for (pending = nv30->fp_samplers & ~fp->samplers; pending;
+	     pending &= ~(1 << unit)) {
+		unit = ffs(pending) - 1;
+		BEGIN_RING(rankine, NV34TCL_TX_ENABLE(unit), 1);
+		OUT_RING  (0);
+	}
+
+	/* Units this program samples that are flagged in dirty_samplers. */
+	for (pending = nv30->dirty_samplers & fp->samplers; pending;
+	     pending &= ~(1 << unit)) {
+		unit = ffs(pending) - 1;
+		nv30_fragtex_build(nv30, unit);
+	}
+
+	nv30->fp_samplers = fp->samplers;
+}
+
diff --git a/src/mesa/pipe/nv30/nv30_miptree.c b/src/mesa/pipe/nv30/nv30_miptree.c
new file mode 100644
index 0000000000..75e9b993c1
--- /dev/null
+++ b/src/mesa/pipe/nv30/nv30_miptree.c
@@ -0,0 +1,105 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_util.h"
+#include "pipe/p_inlines.h"
+
+#include "nv30_context.h"
+
+/*
+ * Fill in the dimensions of every mip level of the texture, the pitch and
+ * per-face offset array of each level, and the total size of the miptree.
+ * Images are placed face by face, each face holding all of its levels.
+ */
+static void
+nv30_miptree_layout(struct nv30_miptree *nv30mt)
+{
+	struct pipe_texture *pt = &nv30mt->base;
+	boolean swizzled = FALSE;
+	uint w = pt->width[0], h = pt->height[0], d = pt->depth[0];
+	uint offset = 0;
+	int nr_faces, l, f;
+
+	/* Cube maps have six faces; each slice of a 3D texture counts as one. */
+	nr_faces = (pt->target == PIPE_TEXTURE_CUBE) ? 6 :
+		   (pt->target == PIPE_TEXTURE_3D) ? pt->depth[0] : 1;
+
+	for (l = pt->first_level; l <= pt->last_level; l++) {
+		pt->width[l] = w;
+		pt->height[l] = h;
+		pt->depth[l] = d;
+
+		/* Linear layout reuses the base width for every level. */
+		nv30mt->level[l].pitch =
+			(swizzled ? pt->width[l] : pt->width[0]) * pt->cpp;
+		/* Round the pitch up to a multiple of 64. */
+		nv30mt->level[l].pitch = (nv30mt->level[l].pitch + 63) & ~63;
+
+		/* One offset per face, filled in below (result is not checked). */
+		nv30mt->level[l].image_offset =
+			calloc(nr_faces, sizeof(unsigned));
+
+		w = MAX2(1, w >> 1);
+		h = MAX2(1, h >> 1);
+		d = MAX2(1, d >> 1);
+	}
+
+	for (f = 0; f < nr_faces; f++) {
+		for (l = pt->first_level; l <= pt->last_level; l++) {
+			nv30mt->level[l].image_offset[f] = offset;
+			offset += nv30mt->level[l].pitch * pt->height[l];
+		}
+	}
+
+	nv30mt->total_size = offset;
+}
+
+/* Grow *pt into an nv30_miptree and create its buffer.  *pt is left untouched
+ * if realloc() fails and is set to NULL if the buffer cannot be created. */
+static void
+nv30_miptree_create(struct pipe_context *pipe, struct pipe_texture **pt)
+{
+	struct nv30_miptree *nv30mt = realloc(*pt, sizeof(struct nv30_miptree));
+	int l;
+
+	if (!nv30mt)
+		return;
+	*pt = NULL;
+	nv30_miptree_layout(nv30mt);
+	nv30mt->buffer = pipe->winsys->buffer_create(pipe->winsys, 256,
+				PIPE_BUFFER_USAGE_PIXEL, nv30mt->total_size);
+	if (nv30mt->buffer) {
+		*pt = &nv30mt->base;
+		return;
+	}
+	for (l = nv30mt->base.first_level; l <= nv30mt->base.last_level; l++)
+		free(nv30mt->level[l].image_offset); /* allocated by the layout */
+	free(nv30mt);
+}
+
+/* Drop one reference to *pt and clear the caller's pointer.  Once the count is
+ * <= 0 the buffer reference, the offset arrays and the miptree are released. */
+static void
+nv30_miptree_release(struct pipe_context *pipe, struct pipe_texture **pt)
+{
+	struct pipe_winsys *ws = pipe->winsys;
+	struct pipe_texture *mt = *pt;
+	struct nv30_miptree *nv30mt = (struct nv30_miptree *)mt;
+	int level;
+
+	*pt = NULL;
+	if (--mt->refcount > 0)
+		return;
+
+	pipe_buffer_reference(ws, &nv30mt->buffer, NULL);
+	for (level = mt->first_level; level <= mt->last_level; level++)
+		free(nv30mt->level[level].image_offset);
+	free(nv30mt);
+}
+
+/* Install the miptree create/release hooks on the pipe context. */
+void nv30_init_miptree_functions(struct nv30_context *nv30)
+{
+	nv30->pipe.texture_release = nv30_miptree_release;
+	nv30->pipe.texture_create = nv30_miptree_create;
+}
+
diff --git a/src/mesa/pipe/nv30/nv30_query.c b/src/mesa/pipe/nv30/nv30_query.c
new file mode 100644
index 0000000000..ea74c0f5f1
--- /dev/null
+++ b/src/mesa/pipe/nv30/nv30_query.c
@@ -0,0 +1,112 @@
+#include "pipe/p_context.h"
+
+#include "nv30_context.h"
+
+struct nv30_query {
+	struct nouveau_resource *object; /* query_heap slot, allocated in begin and freed once the result is read */
+	unsigned type; /* query_type given to nv30_query_create() */
+	boolean ready; /* TRUE once nv30_query_result() has fetched the value */
+	uint64_t result; /* value cached by nv30_query_result() */
+};
+
+static inline struct nv30_query *
+nv30_query(struct pipe_query *pipe)
+{
+	return (struct nv30_query *)pipe; /* plain cast: nv30_query_create() hands out nv30_query objects as pipe_query */
+}
+
+/* Allocate a zeroed query of the given type; returns NULL if out of memory. */
+static struct pipe_query *
+nv30_query_create(struct pipe_context *pipe, unsigned query_type)
+{
+	struct nv30_query *q = calloc(1, sizeof(*q));
+
+	if (q)
+		q->type = query_type;
+	return (struct pipe_query *)q;
+}
+
+/* Free a query, releasing its heap resource first if one is still held. */
+static void
+nv30_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nv30_query *q = nv30_query(pq);
+
+	if (q->object)
+		nv30_context(pipe)->nvws->res_free(&q->object);
+	free(q);
+}
+
+/* Begin an occlusion query: take a query-heap slot and reset its notifier. */
+static void
+nv30_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_query *q = nv30_query(pq);
+
+	assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
+	if (nv30->nvws->res_alloc(nv30->query_heap, 1, NULL, &q->object)) {
+		assert(0);
+		return; /* with asserts compiled out, q->object must not be used */
+	}
+	nv30->nvws->notifier_reset(nv30->query, q->object->start);
+	BEGIN_RING(rankine, NV34TCL_QUERY_RESET, 1);
+	OUT_RING  (1);
+	BEGIN_RING(rankine, NV34TCL_QUERY_UNK17CC, 1);
+	OUT_RING  (1);
+	q->ready = FALSE;
+}
+
+static void
+nv30_query_end(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_query *q = nv30_query(pq);
+	/* Emit QUERY_GET aimed at this query's slot (offset = start * 32), then fire the ring. */
+	BEGIN_RING(rankine, NV34TCL_QUERY_GET, 1);
+	OUT_RING  ((0x01 << NV34TCL_QUERY_GET_UNK24_SHIFT) |
+		   ((q->object->start * 32) << NV34TCL_QUERY_GET_OFFSET_SHIFT));
+	FIRE_RING();
+}
+
+/* Return an occlusion query's value in *result.  If it is not available yet,
+ * return FALSE when wait is FALSE, otherwise block on the notifier first. */
+static boolean
+nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq,
+		  boolean wait, uint64 *result)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nouveau_winsys *nvws = nv30->nvws;
+	struct nv30_query *q = nv30_query(pq);
+
+	assert(q->object && q->type == PIPE_QUERY_OCCLUSION_COUNTER);
+
+	if (!q->ready) {
+		unsigned status = nvws->notifier_status(nv30->query,
+							q->object->start);
+
+		if (status != NV_NOTIFY_STATE_STATUS_COMPLETED) {
+			if (!wait)
+				return FALSE;
+			nvws->notifier_wait(nv30->query, q->object->start,
+					    NV_NOTIFY_STATE_STATUS_COMPLETED, 0);
+		}
+		q->result = nvws->notifier_retval(nv30->query,
+						  q->object->start);
+		q->ready = TRUE;
+		nvws->res_free(&q->object);
+	}
+
+	*result = q->result;
+	return TRUE;
+}
+
+/* Install the query entry points on the pipe context. */
+void nv30_init_query_functions(struct nv30_context *nv30)
+{
+	nv30->pipe.get_query_result = nv30_query_result;
+	nv30->pipe.end_query = nv30_query_end;
+	nv30->pipe.begin_query = nv30_query_begin;
+	nv30->pipe.destroy_query = nv30_query_destroy;
+	nv30->pipe.create_query = nv30_query_create;
+}
diff --git a/src/mesa/pipe/nv30/nv30_shader.h b/src/mesa/pipe/nv30/nv30_shader.h
new file mode 100644
index 0000000000..dd3a36f78f
--- /dev/null
+++ b/src/mesa/pipe/nv30/nv30_shader.h
@@ -0,0 +1,490 @@
+#ifndef __NV30_SHADER_H__
+#define __NV30_SHADER_H__
+
+/* Vertex programs instruction set
+ *
+ * 128bit opcodes, split into 4 32-bit ones for ease of use.
+ *
+ * Non-native instructions
+ *   ABS - MOV + NV40_VP_INST0_DEST_ABS
+ *   POW - EX2 + MUL + LG2
+ *   SUB - ADD, second source negated
+ *   SWZ - MOV
+ *   XPD -  
+ *
+ * Register access
+ *   - Only one INPUT can be accessed per-instruction (move extras into TEMPs)
+ *   - Only one CONST can be accessed per-instruction (move extras into TEMPs)
+ *
+ * Relative Addressing
+ *   According to the value returned for
+ *   MAX_PROGRAM_NATIVE_ADDRESS_REGISTERS_ARB
+ *
+ *   there are only two address registers available.  The destination in the
+ *   ARL instruction is set to TEMP <n> (The temp isn't actually written).
+ *
+ *   When using vanilla ARB_v_p, the proprietary driver will squish both the
+ *   available ADDRESS regs into the first hardware reg in the X and Y
+ *   components.
+ *
+ *   To use an address reg as an index into consts, the CONST_SRC is set to
+ *   (const_base + offset) and INDEX_CONST is set.
+ *
+ *   To access the second address reg use ADDR_REG_SELECT_1. A particular
+ *   component of the address regs is selected with ADDR_SWZ.
+ *
+ *   Only one address register can be accessed per instruction.
+ *
+ * Conditional execution (see NV_vertex_program{2,3} for details):
+ *   Enabled per instruction by setting COND_TEST_ENABLE and selecting the
+ *   condition that lets the test pass with COND_{FL,LT,...}.  The values in
+ *   the condition register can be swizzled, which allows testing against an
+ *   individual component.
+ *
+ * Branching:
+ *
+ *   The BRA/CAL instructions seem to follow a slightly different opcode
+ *   layout.  The destination instruction ID (IADDR) overlaps a source field.
+ *   Instruction ID's seem to be numbered based on the UPLOAD_FROM_ID FIFO
+ *   command, and is incremented automatically on each UPLOAD_INST FIFO
+ *   command.
+ *
+ *   Conditional branching is achieved by using the condition tests described
+ *   above.  There doesn't appear to be dedicated looping instructions, but
+ *   this can be done using a temp reg + conditional branching.
+ *
+ *   Subroutines may be uploaded before the main program itself, but the first
+ *   executed instruction is determined by the PROGRAM_START_ID FIFO command.
+ *
+ */
+
+/* DWORD 0 */
+
+#define NV30_VP_INST_ADDR_REG_SELECT_1        (1 << 24)
+#define NV30_VP_INST_SRC2_ABS           (1 << 23) /* guess */
+#define NV30_VP_INST_SRC1_ABS           (1 << 22) /* guess */
+#define NV30_VP_INST_SRC0_ABS           (1 << 21) /* guess */
+#define NV30_VP_INST_VEC_RESULT         (1 << 20)
+#define NV30_VP_INST_DEST_TEMP_ID_SHIFT        16
+#define NV30_VP_INST_DEST_TEMP_ID_MASK        (0x0F << 16)
+#define NV30_VP_INST_COND_UPDATE_ENABLE        (1<<15)
+#define NV30_VP_INST_VEC_DEST_TEMP_MASK      (0xF << 16)
+#define NV30_VP_INST_COND_TEST_ENABLE        (1<<14)
+#define NV30_VP_INST_COND_SHIFT          11
+#define NV30_VP_INST_COND_MASK          (0x07 << 11)
+#  define NV30_VP_INST_COND_FL  0 /* guess */  
+#  define NV30_VP_INST_COND_LT  1  
+#  define NV30_VP_INST_COND_EQ  2
+#  define NV30_VP_INST_COND_LE  3
+#  define NV30_VP_INST_COND_GT  4
+#  define NV30_VP_INST_COND_NE  5
+#  define NV30_VP_INST_COND_GE  6
+#  define NV30_VP_INST_COND_TR  7 /* guess */
+#define NV30_VP_INST_COND_SWZ_X_SHIFT        9
+#define NV30_VP_INST_COND_SWZ_X_MASK        (0x03 <<  9)
+#define NV30_VP_INST_COND_SWZ_Y_SHIFT        7
+#define NV30_VP_INST_COND_SWZ_Y_MASK        (0x03 <<  7)
+#define NV30_VP_INST_COND_SWZ_Z_SHIFT        5
+#define NV30_VP_INST_COND_SWZ_Z_MASK        (0x03 <<  5)
+#define NV30_VP_INST_COND_SWZ_W_SHIFT        3
+#define NV30_VP_INST_COND_SWZ_W_MASK        (0x03 <<  3)
+#define NV30_VP_INST_COND_SWZ_ALL_SHIFT        3
+#define NV30_VP_INST_COND_SWZ_ALL_MASK        (0xFF <<  3)
+#define NV30_VP_INST_ADDR_SWZ_SHIFT        1
+#define NV30_VP_INST_ADDR_SWZ_MASK        (0x03 <<  1)
+#define NV30_VP_INST_SCA_OPCODEH_SHIFT        0
+#define NV30_VP_INST_SCA_OPCODEH_MASK        (0x01 <<  0)
+
+/* DWORD 1 */
+#define NV30_VP_INST_SCA_OPCODEL_SHIFT        28
+#define NV30_VP_INST_SCA_OPCODEL_MASK        (0x0F << 28)
+#  define NV30_VP_INST_OP_NOP  0x00
+#  define NV30_VP_INST_OP_RCP  0x02
+#  define NV30_VP_INST_OP_RCC  0x03
+#  define NV30_VP_INST_OP_RSQ  0x04
+#  define NV30_VP_INST_OP_EXP  0x05
+#  define NV30_VP_INST_OP_LOG  0x06
+#  define NV30_VP_INST_OP_LIT  0x07
+#  define NV30_VP_INST_OP_BRA  0x09
+#  define NV30_VP_INST_OP_CAL  0x0B
+#  define NV30_VP_INST_OP_RET  0x0C
+#  define NV30_VP_INST_OP_LG2  0x0D
+#  define NV30_VP_INST_OP_EX2  0x0E
+#  define NV30_VP_INST_OP_SIN  0x0F
+#  define NV30_VP_INST_OP_COS  0x10
+#define NV30_VP_INST_VEC_OPCODE_SHIFT        23
+#define NV30_VP_INST_VEC_OPCODE_MASK        (0x1F << 23)
+#  define NV30_VP_INST_OP_NOPV  0x00
+#  define NV30_VP_INST_OP_MOV  0x01
+#  define NV30_VP_INST_OP_MUL  0x02
+#  define NV30_VP_INST_OP_ADD  0x03
+#  define NV30_VP_INST_OP_MAD  0x04
+#  define NV30_VP_INST_OP_DP3  0x05
+#  define NV30_VP_INST_OP_DP4  0x07
+#  define NV30_VP_INST_OP_DPH  0x06
+#  define NV30_VP_INST_OP_DST  0x08
+#  define NV30_VP_INST_OP_MIN  0x09
+#  define NV30_VP_INST_OP_MAX  0x0A
+#  define NV30_VP_INST_OP_SLT  0x0B
+#  define NV30_VP_INST_OP_SGE  0x0C
+#  define NV30_VP_INST_OP_ARL  0x0D
+#  define NV30_VP_INST_OP_FRC  0x0E
+#  define NV30_VP_INST_OP_FLR  0x0F
+#  define NV30_VP_INST_OP_SEQ  0x10
+#  define NV30_VP_INST_OP_SFL  0x11
+#  define NV30_VP_INST_OP_SGT  0x12
+#  define NV30_VP_INST_OP_SLE  0x13
+#  define NV30_VP_INST_OP_SNE  0x14
+#  define NV30_VP_INST_OP_STR  0x15
+#  define NV30_VP_INST_OP_SSG  0x16
+#  define NV30_VP_INST_OP_ARR  0x17
+#  define NV30_VP_INST_OP_ARA  0x18
+#define NV30_VP_INST_CONST_SRC_SHIFT        14
+#define NV30_VP_INST_CONST_SRC_MASK        (0xFF << 14)
+#define NV30_VP_INST_INPUT_SRC_SHIFT        9    /*NV20*/
+#define NV30_VP_INST_INPUT_SRC_MASK        (0x0F <<  9)  /*NV20*/
+#  define NV30_VP_INST_IN_POS  0    /* These seem to match the bindings specified in */
+#  define NV30_VP_INST_IN_WEIGHT  1    /* the ARB_v_p spec (2.14.3.1) */
+#  define NV30_VP_INST_IN_NORMAL  2    
+#  define NV30_VP_INST_IN_COL0  3    /* Should probably confirm them all though */
+#  define NV30_VP_INST_IN_COL1  4
+#  define NV30_VP_INST_IN_FOGC  5
+#  define NV30_VP_INST_IN_TC0  8
+#  define NV30_VP_INST_IN_TC(n)  (8+(n)) /* TC0 is 8; n is parenthesised so expression arguments expand correctly */
+#define NV30_VP_INST_SRC0H_SHIFT        0    /*NV20*/
+#define NV30_VP_INST_SRC0H_MASK          (0x1FF << 0)  /*NV20*/
+
+/* Please note: the IADDR fields overlap other fields because they are used
+ * only for branch instructions.  See Branching: label above
+ *
+ * DWORD 2
+ */
+#define NV30_VP_INST_SRC0L_SHIFT        26    /*NV20*/
+#define NV30_VP_INST_SRC0L_MASK         (0x3F  <<26)  /* NV30_VP_SRC0_LOW_MASK << 26 */
+#define NV30_VP_INST_SRC1_SHIFT         11    /*NV20*/
+#define NV30_VP_INST_SRC1_MASK          (0x7FFF<<11)  /*NV20*/
+#define NV30_VP_INST_SRC2H_SHIFT        0    /*NV20*/
+#define NV30_VP_INST_SRC2H_MASK          (0x7FF << 0)  /* NV30_VP_SRC2_HIGH_MASK >> 4*/
+#define NV30_VP_INST_IADDR_SHIFT        2
+#define NV30_VP_INST_IADDR_MASK          (0xF <<  28)   /* NV30_VP_SRC2_LOW_MASK << 28 */ /* NOTE(review): mask does not line up with IADDR_SHIFT (2) and has the same value as SRC2L_MASK — confirm */
+
+/* DWORD 3 */
+#define NV30_VP_INST_SRC2L_SHIFT        28    /*NV20*/
+#define NV30_VP_INST_SRC2L_MASK          (0x0F  <<28)  /*NV20*/
+#define NV30_VP_INST_STEMP_WRITEMASK_SHIFT      24
+#define NV30_VP_INST_STEMP_WRITEMASK_MASK      (0x0F << 24)
+#define NV30_VP_INST_VTEMP_WRITEMASK_SHIFT      20
+#define NV30_VP_INST_VTEMP_WRITEMASK_MASK      (0x0F << 20)
+#define NV30_VP_INST_SDEST_WRITEMASK_SHIFT      16
+#define NV30_VP_INST_SDEST_WRITEMASK_MASK      (0x0F << 16)
+#define NV30_VP_INST_VDEST_WRITEMASK_SHIFT      12    /*NV20*/
+#define NV30_VP_INST_VDEST_WRITEMASK_MASK      (0x0F << 12)  /*NV20*/
+#define NV30_VP_INST_DEST_SHIFT        2
+#define NV30_VP_INST_DEST_MASK        (0x0F <<  2)
+#  define NV30_VP_INST_DEST_POS  0
+#  define NV30_VP_INST_DEST_BFC0  1
+#  define NV30_VP_INST_DEST_BFC1  2
+#  define NV30_VP_INST_DEST_COL0  3
+#  define NV30_VP_INST_DEST_COL1  4
+#  define NV30_VP_INST_DEST_FOGC  5
+#  define NV30_VP_INST_DEST_PSZ   6
+#  define NV30_VP_INST_DEST_TC(n)  (8+(n)) /* n is parenthesised so expression arguments expand correctly */
+
+#define NV30_VP_INST_LAST                           (1 << 0)
+
+/* Useful to split the source selection regs into their pieces */
+#define NV30_VP_SRC0_HIGH_SHIFT                                                6
+#define NV30_VP_SRC0_HIGH_MASK                                        0x00007FC0
+#define NV30_VP_SRC0_LOW_MASK                                         0x0000003F
+#define NV30_VP_SRC2_HIGH_SHIFT                                                4
+#define NV30_VP_SRC2_HIGH_MASK                                        0x00007FF0
+#define NV30_VP_SRC2_LOW_MASK                                         0x0000000F
+
+
+/* Source-register definition - matches NV20 exactly */
+#define NV30_VP_SRC_NEGATE          (1<<14)
+#define NV30_VP_SRC_SWZ_X_SHIFT        12
+#define NV30_VP_SRC_REG_SWZ_X_MASK        (0x03  <<12)
+#define NV30_VP_SRC_SWZ_Y_SHIFT        10
+#define NV30_VP_SRC_REG_SWZ_Y_MASK        (0x03  <<10)
+#define NV30_VP_SRC_SWZ_Z_SHIFT        8
+#define NV30_VP_SRC_REG_SWZ_Z_MASK        (0x03  << 8)
+#define NV30_VP_SRC_SWZ_W_SHIFT        6
+#define NV30_VP_SRC_REG_SWZ_W_MASK        (0x03  << 6)
+#define NV30_VP_SRC_REG_SWZ_ALL_SHIFT        6
+#define NV30_VP_SRC_REG_SWZ_ALL_MASK        (0xFF  << 6)
+#define NV30_VP_SRC_TEMP_SRC_SHIFT        2
+#define NV30_VP_SRC_REG_TEMP_ID_MASK        (0x0F  << 0) /* NOTE(review): the shift above is 2 but this mask starts at bit 0, overlapping REG_TYPE_MASK — confirm */
+#define NV30_VP_SRC_REG_TYPE_SHIFT        0
+#define NV30_VP_SRC_REG_TYPE_MASK        (0x03  << 0)
+#define NV30_VP_SRC_REG_TYPE_TEMP  1
+#define NV30_VP_SRC_REG_TYPE_INPUT  2
+#define NV30_VP_SRC_REG_TYPE_CONST  3 /* guess */
+
+/*
+ * Each fragment program opcode appears to be comprised of 4 32-bit values.
+ *
+ *   0 - Opcode, output reg/mask, ATTRIB source
+ *   1 - Source 0
+ *   2 - Source 1
+ *   3 - Source 2
+ *
+ * There appears to be no special difference between result regs and temp regs.
+ *     result.color == R0.xyzw
+ *     result.depth == R1.z
+ * When the fragprog contains instructions to write depth, NV30_TCL_PRIMITIVE_3D_UNK1D78=0
+ * otherwise it is set to 1.
+ *
+ * Constants are inserted directly after the instruction that uses them.
+ * 
+ * It appears that it's not possible to use two input registers in one
+ * instruction as the input sourcing is done in the instruction dword
+ * and not the source selection dwords.  As such instructions such as:
+ * 
+ *     ADD result.color, fragment.color, fragment.texcoord[0];
+ *
+ * must be split into two MOVs and then an ADD (nvidia does this), but
+ * I'm not sure why it's not just one MOV and then sourcing the second input
+ * in the ADD instruction.
+ *
+ * Negation of the full source is done with NV30_FP_REG_NEGATE, arbitrary
+ * negation requires multiplication with a const.
+ *
+ * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO/SWIZZLE_ONE
+ * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as SWIZZLE_ZERO
+ * is implemented simply by not writing to the relevant components of the destination.
+ *
+ * Conditional execution
+ *   TODO
+ * 
+ * Non-native instructions:
+ *   LIT
+ *   LRP - MAD+MAD
+ *   SUB - ADD, negate second source
+ *   RSQ - LG2 + EX2
+ *   POW - LG2 + MUL + EX2
+ *   SCS - COS + SIN
+ *   XPD
+ */
+
+//== Opcode / Destination selection ==
+#define NV30_FP_OP_PROGRAM_END          (1 << 0)
+#define NV30_FP_OP_OUT_REG_SHIFT        1
+#define NV30_FP_OP_OUT_REG_MASK          (31 << 1)  /* uncertain */
+/* Needs to be set when writing outputs to get expected result.. */
+#define NV30_FP_OP_OUT_REG_HALF          (1 << 7)
+#define NV30_FP_OP_COND_WRITE_ENABLE        (1 << 8)
+#define NV30_FP_OP_OUTMASK_SHIFT        9
+#define NV30_FP_OP_OUTMASK_MASK          (0xF << 9)
+#  define NV30_FP_OP_OUT_X  (1<<9)
+#  define NV30_FP_OP_OUT_Y  (1<<10)
+#  define NV30_FP_OP_OUT_Z  (1<<11)
+#  define NV30_FP_OP_OUT_W  (1<<12)
+/* Uncertain about these, especially the input_src values.. it's possible that
+ * they can be dynamically changed.
+ */
+#define NV30_FP_OP_INPUT_SRC_SHIFT        13
+#define NV30_FP_OP_INPUT_SRC_MASK        (15 << 13)
+#  define NV30_FP_OP_INPUT_SRC_POSITION  0x0
+#  define NV30_FP_OP_INPUT_SRC_COL0  0x1
+#  define NV30_FP_OP_INPUT_SRC_COL1  0x2
+#  define NV30_FP_OP_INPUT_SRC_FOGC  0x3
+#  define NV30_FP_OP_INPUT_SRC_TC0    0x4
+#  define NV30_FP_OP_INPUT_SRC_TC(n)  (0x4 + (n)) /* TC0 is 0x4; n is parenthesised so expression arguments expand correctly */
+#define NV30_FP_OP_TEX_UNIT_SHIFT        17
+#define NV30_FP_OP_TEX_UNIT_MASK        (0xF << 17) /* guess */
+#define NV30_FP_OP_PRECISION_SHIFT        22
+#define NV30_FP_OP_PRECISION_MASK        (3 << 22)
+#   define NV30_FP_PRECISION_FP32  0
+#   define NV30_FP_PRECISION_FP16  1
+#   define NV30_FP_PRECISION_FX12  2
+#define NV30_FP_OP_OPCODE_SHIFT          24
+#define NV30_FP_OP_OPCODE_MASK          (0x3F << 24)
+#  define NV30_FP_OP_OPCODE_NOP  0x00
+#  define NV30_FP_OP_OPCODE_MOV  0x01
+#  define NV30_FP_OP_OPCODE_MUL  0x02
+#  define NV30_FP_OP_OPCODE_ADD  0x03
+#  define NV30_FP_OP_OPCODE_MAD  0x04
+#  define NV30_FP_OP_OPCODE_DP3  0x05
+#  define NV30_FP_OP_OPCODE_DP4  0x06
+#  define NV30_FP_OP_OPCODE_DST  0x07
+#  define NV30_FP_OP_OPCODE_MIN  0x08
+#  define NV30_FP_OP_OPCODE_MAX  0x09
+#  define NV30_FP_OP_OPCODE_SLT  0x0A
+#  define NV30_FP_OP_OPCODE_SGE  0x0B
+#  define NV30_FP_OP_OPCODE_SLE  0x0C
+#  define NV30_FP_OP_OPCODE_SGT  0x0D
+#  define NV30_FP_OP_OPCODE_SNE  0x0E
+#  define NV30_FP_OP_OPCODE_SEQ  0x0F
+#  define NV30_FP_OP_OPCODE_FRC  0x10
+#  define NV30_FP_OP_OPCODE_FLR  0x11
+#  define NV30_FP_OP_OPCODE_KIL  0x12
+#  define NV30_FP_OP_OPCODE_PK4B   0x13
+#  define NV30_FP_OP_OPCODE_UP4B   0x14
+#  define NV30_FP_OP_OPCODE_DDX  0x15 /* can only write XY */
+#  define NV30_FP_OP_OPCODE_DDY  0x16 /* can only write XY */
+#  define NV30_FP_OP_OPCODE_TEX  0x17
+#  define NV30_FP_OP_OPCODE_TXP  0x18
+#  define NV30_FP_OP_OPCODE_TXD  0x19
+#  define NV30_FP_OP_OPCODE_RCP  0x1A
+#  define NV30_FP_OP_OPCODE_RSQ  0x1B
+#  define NV30_FP_OP_OPCODE_EX2  0x1C
+#  define NV30_FP_OP_OPCODE_LG2  0x1D
+#  define NV30_FP_OP_OPCODE_LIT  0x1E
+#  define NV30_FP_OP_OPCODE_LRP  0x1F
+#  define NV30_FP_OP_OPCODE_STR  0x20 
+#  define NV30_FP_OP_OPCODE_SFL  0x21
+#  define NV30_FP_OP_OPCODE_COS  0x22
+#  define NV30_FP_OP_OPCODE_SIN  0x23
+#  define NV30_FP_OP_OPCODE_PK2H   0x24
+#  define NV30_FP_OP_OPCODE_UP2H   0x25
+#  define NV30_FP_OP_OPCODE_POW  0x26
+#  define NV30_FP_OP_OPCODE_PK4UB  0x27
+#  define NV30_FP_OP_OPCODE_UP4UB  0x28
+#  define NV30_FP_OP_OPCODE_PK2US  0x29
+#  define NV30_FP_OP_OPCODE_UP2US  0x2A
+#  define NV30_FP_OP_OPCODE_DP2A   0x2E
+#  define NV30_FP_OP_OPCODE_TXB  0x31
+#  define NV30_FP_OP_OPCODE_RFL  0x36
+#  define NV30_FP_OP_OPCODE_DIV  0x3A
+#define NV30_FP_OP_OUT_SAT          (1U << 31) /* unsigned: 1 << 31 overflows a signed int */
+
+/* high order bits of SRC0 */
+#define NV30_FP_OP_OUT_ABS          (1 << 29)
+#define NV30_FP_OP_COND_SWZ_W_SHIFT        27
+#define NV30_FP_OP_COND_SWZ_W_MASK        (3 << 27)
+#define NV30_FP_OP_COND_SWZ_Z_SHIFT        25
+#define NV30_FP_OP_COND_SWZ_Z_MASK        (3 << 25)
+#define NV30_FP_OP_COND_SWZ_Y_SHIFT        23
+#define NV30_FP_OP_COND_SWZ_Y_MASK        (3 << 23)
+#define NV30_FP_OP_COND_SWZ_X_SHIFT        21
+#define NV30_FP_OP_COND_SWZ_X_MASK        (3 << 21)
+#define NV30_FP_OP_COND_SWZ_ALL_SHIFT        21
+#define NV30_FP_OP_COND_SWZ_ALL_MASK        (0xFF << 21)
+#define NV30_FP_OP_COND_SHIFT          18
+#define NV30_FP_OP_COND_MASK          (0x07 << 18)
+#  define NV30_FP_OP_COND_FL  0
+#  define NV30_FP_OP_COND_LT  1
+#  define NV30_FP_OP_COND_EQ  2
+#  define NV30_FP_OP_COND_LE  3
+#  define NV30_FP_OP_COND_GT  4
+#  define NV30_FP_OP_COND_NE  5
+#  define NV30_FP_OP_COND_GE  6
+#  define NV30_FP_OP_COND_TR  7
+
+/* high order bits of SRC1 */
+#define NV30_FP_OP_DST_SCALE_SHIFT        28
+#define NV30_FP_OP_DST_SCALE_MASK        (3 << 28)
+#define NV30_FP_OP_DST_SCALE_1X                                                0
+#define NV30_FP_OP_DST_SCALE_2X                                                1
+#define NV30_FP_OP_DST_SCALE_4X                                                2
+#define NV30_FP_OP_DST_SCALE_8X                                                3
+#define NV30_FP_OP_DST_SCALE_INV_2X                                            5
+#define NV30_FP_OP_DST_SCALE_INV_4X                                            6
+#define NV30_FP_OP_DST_SCALE_INV_8X                                            7
+
+
+/* high order bits of SRC2 */
+#define NV30_FP_OP_INDEX_INPUT          (1 << 30)
+
+//== Register selection ==
+#define NV30_FP_REG_TYPE_SHIFT          0
+#define NV30_FP_REG_TYPE_MASK          (3 << 0)
+#  define NV30_FP_REG_TYPE_TEMP  0
+#  define NV30_FP_REG_TYPE_INPUT  1
+#  define NV30_FP_REG_TYPE_CONST  2
+#define NV30_FP_REG_SRC_SHIFT          2 /* uncertain */
+#define NV30_FP_REG_SRC_MASK          (31 << 2)
+#define NV30_FP_REG_SRC_HALF          (1 << 8)
+#define NV30_FP_REG_SWZ_ALL_SHIFT        9
+#define NV30_FP_REG_SWZ_ALL_MASK        (255 << 9)
+#define NV30_FP_REG_SWZ_X_SHIFT          9
+#define NV30_FP_REG_SWZ_X_MASK          (3 << 9)
+#define NV30_FP_REG_SWZ_Y_SHIFT          11
+#define NV30_FP_REG_SWZ_Y_MASK          (3 << 11)
+#define NV30_FP_REG_SWZ_Z_SHIFT          13
+#define NV30_FP_REG_SWZ_Z_MASK          (3 << 13)
+#define NV30_FP_REG_SWZ_W_SHIFT          15
+#define NV30_FP_REG_SWZ_W_MASK          (3 << 15)
+#  define NV30_FP_SWIZZLE_X  0
+#  define NV30_FP_SWIZZLE_Y  1
+#  define NV30_FP_SWIZZLE_Z  2
+#  define NV30_FP_SWIZZLE_W  3
+#define NV30_FP_REG_NEGATE          (1 << 17)
+
+#define NV30SR_NONE	0
+#define NV30SR_OUTPUT	1
+#define NV30SR_INPUT	2
+#define NV30SR_TEMP	3
+#define NV30SR_CONST	4
+
+struct nv30_sreg {
+	int type;	/* presumably one of the NV30SR_* values — confirm */
+	int index;
+
+	int dst_scale;	/* nv30_sr() sets DEF_SCALE; changed by nv30_sr_scale() */
+
+	int negate;	/* toggled by nv30_sr_neg() */
+	int abs;	/* set by nv30_sr_abs() */
+	int swz[4];	/* per-component selector; nv30_sr() sets { 0, 1, 2, 3 } */
+
+	int cc_update;
+	int cc_update_reg;
+	int cc_test;	/* nv30_sr() sets DEF_CTEST */
+	int cc_test_reg;
+	int cc_swz[4];	/* nv30_sr() sets { 0, 1, 2, 3 } */
+};
+
+/* Build a register reference with default modifiers: identity swizzles, no
+ * negate/abs, no condition-code update, DEF_SCALE and DEF_CTEST. */
+static INLINE struct nv30_sreg
+nv30_sr(int type, int index)
+{
+	struct nv30_sreg reg = { 0 };	/* fields not set below stay zero */
+	int c;
+
+	reg.type = type;
+	reg.index = index;
+	reg.dst_scale = DEF_SCALE;
+	reg.cc_test = DEF_CTEST;
+	for (c = 0; c < 4; c++) {
+		reg.swz[c] = c;
+		reg.cc_swz[c] = c;
+	}
+	return reg;
+}
+
+/* Apply swizzle (x, y, z, w) on top of the swizzle src already carries. */
+static INLINE struct nv30_sreg
+nv30_sr_swz(struct nv30_sreg src, int x, int y, int z, int w)
+{
+	const int a = src.swz[x], b = src.swz[y], c = src.swz[z], d = src.swz[w];
+	src.swz[SWZ_X] = a;
+	src.swz[SWZ_Y] = b;
+	src.swz[SWZ_Z] = c;
+	src.swz[SWZ_W] = d;
+	return src;
+}
+
+static INLINE struct nv30_sreg
+nv30_sr_neg(struct nv30_sreg src)
+{
+	src.negate = !src.negate; /* toggle: negating an already negated register clears the flag */
+	return src; /* src was passed by value, so the caller's copy is untouched */
+}
+
+static INLINE struct nv30_sreg
+nv30_sr_abs(struct nv30_sreg src)
+{
+	src.abs = 1; /* sets the abs flag on the by-value copy */
+	return src; /* the caller's original register is untouched */
+}
+
+static INLINE struct nv30_sreg
+nv30_sr_scale(struct nv30_sreg src, int scale)
+{
+	src.dst_scale = scale; /* NOTE(review): presumably one of NV30_FP_OP_DST_SCALE_* — confirm */
+	return src; /* modified by-value copy; the caller's original is untouched */
+}
+
+#endif
diff --git a/src/mesa/pipe/nv30/nv30_state.c b/src/mesa/pipe/nv30/nv30_state.c
new file mode 100644
index 0000000000..c29a644809
--- /dev/null
+++ b/src/mesa/pipe/nv30/nv30_state.c
@@ -0,0 +1,740 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_util.h"
+
+#include "nv30_context.h"
+#include "nv30_state.h"
+
+static void *
+nv30_blend_state_create(struct pipe_context *pipe,
+			const struct pipe_blend_state *cso)
+{
+	struct nv30_blend_state *cb = malloc(sizeof(*cb));
+
+	if (!cb)	/* out of memory: return NULL instead of writing through it */
+		return NULL;
+	cb->b_enable = cso->blend_enable ? 1 : 0;
+	cb->b_srcfunc = ((nvgl_blend_func(cso->alpha_src_factor)<<16) |	/* alpha factor in the high half */
+			 (nvgl_blend_func(cso->rgb_src_factor)));	/* rgb factor in the low half */
+	cb->b_dstfunc = ((nvgl_blend_func(cso->alpha_dst_factor)<<16) |
+			 (nvgl_blend_func(cso->rgb_dst_factor)));
+	cb->b_eqn = ((nvgl_blend_eqn(cso->alpha_func) << 16) |
+		     (nvgl_blend_eqn(cso->rgb_func)));
+
+	cb->l_enable = cso->logicop_enable ? 1 : 0;
+	cb->l_op = nvgl_logicop_func(cso->logicop_func);
+
+	cb->c_mask = (((cso->colormask & PIPE_MASK_A) ? (0x01<<24) : 0) |	/* one enable bit per byte: A, R, G, B */
+		      ((cso->colormask & PIPE_MASK_R) ? (0x01<<16) : 0) |
+		      ((cso->colormask & PIPE_MASK_G) ? (0x01<< 8) : 0) |
+		      ((cso->colormask & PIPE_MASK_B) ? (0x01<< 0) : 0));
+
+	cb->d_enable = cso->dither ? 1 : 0;
+
+	return (void *)cb;	/* released by nv30_blend_state_delete() */
+}
+
+static void
+nv30_blend_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_blend_state *cb = hwcso;	/* object built by nv30_blend_state_create() */
+
+	BEGIN_RING(rankine, NV34TCL_DITHER_ENABLE, 1);	/* each BEGIN_RING count matches the OUT_RINGs that follow */
+	OUT_RING  (cb->d_enable);
+
+	BEGIN_RING(rankine, NV34TCL_BLEND_FUNC_ENABLE, 3);	/* enable, src func, dst func in one burst */
+	OUT_RING  (cb->b_enable);
+	OUT_RING  (cb->b_srcfunc);
+	OUT_RING  (cb->b_dstfunc);
+	BEGIN_RING(rankine, NV34TCL_BLEND_FUNC_EQUATION, 1);
+	OUT_RING  (cb->b_eqn);
+
+	BEGIN_RING(rankine, NV34TCL_COLOR_MASK, 1);
+	OUT_RING  (cb->c_mask);
+
+	BEGIN_RING(rankine, NV34TCL_COLOR_LOGIC_OP_ENABLE, 2);	/* enable followed by the op */
+	OUT_RING  (cb->l_enable);
+	OUT_RING  (cb->l_op);
+}
+
+static void
+nv30_blend_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	free(hwcso);	/* releases the allocation made in nv30_blend_state_create() */
+}
+
+
+static INLINE unsigned
+wrap_mode(unsigned wrap) {	/* maps a PIPE_TEX_WRAP_* value to the unshifted hardware wrap field */
+	unsigned ret;
+
+	switch (wrap) {
+	case PIPE_TEX_WRAP_REPEAT:
+		ret = NV34TCL_TX_WRAP_S_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_REPEAT:
+		ret = NV34TCL_TX_WRAP_S_MIRRORED_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+		ret = NV34TCL_TX_WRAP_S_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+		ret = NV34TCL_TX_WRAP_S_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_CLAMP:
+		ret = NV34TCL_TX_WRAP_S_CLAMP;
+		break;
+/*	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+		ret = NV34TCL_TX_WRAP_S_MIRROR_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+		ret = NV34TCL_TX_WRAP_S_MIRROR_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP:
+		ret = NV34TCL_TX_WRAP_S_MIRROR_CLAMP;
+		break;*/
+	default:	/* includes the mirror-clamp modes disabled above */
+		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
+		ret = NV34TCL_TX_WRAP_S_REPEAT;	/* fall back to REPEAT after reporting */
+		break;
+	}
+
+	return ret >> NV34TCL_TX_WRAP_S_SHIFT;	/* strip the S shift; the caller re-shifts for S, T or R */
+}
+
+static void *
+nv30_sampler_state_create(struct pipe_context *pipe,
+			  const struct pipe_sampler_state *cso)
+{
+	struct nv30_sampler_state *ps = malloc(sizeof(*ps));
+	uint32_t filter = 0;
+
+	if (!ps)	/* out of memory: return NULL instead of writing through it */
+		return NULL;
+	ps->fmt = 0;
+	if (!cso->normalized_coords)
+		ps->fmt |= NV34TCL_TX_FORMAT_RECT;
+
+	ps->wrap = ((wrap_mode(cso->wrap_s) << NV34TCL_TX_WRAP_S_SHIFT) |	/* wrap_mode() returns the unshifted field */
+		    (wrap_mode(cso->wrap_t) << NV34TCL_TX_WRAP_T_SHIFT) |
+		    (wrap_mode(cso->wrap_r) << NV34TCL_TX_WRAP_R_SHIFT));
+
+	ps->en = 0;	/* stays 0 while the anisotropy levels below are commented out */
+	if (cso->max_anisotropy >= 2.0) {
+		/* no idea, binary driver sets it, works without it.. meh.. */
+		ps->wrap |= (1 << 5);
+
+/*		if (cso->max_anisotropy >= 16.0) {
+			ps->en |= NV34TCL_TX_ENABLE_ANISO_16X;
+		} else
+		if (cso->max_anisotropy >= 12.0) {
+			ps->en |= NV34TCL_TX_ENABLE_ANISO_12X;
+		} else
+		if (cso->max_anisotropy >= 10.0) {
+			ps->en |= NV34TCL_TX_ENABLE_ANISO_10X;
+		} else
+		if (cso->max_anisotropy >= 8.0) {
+			ps->en |= NV34TCL_TX_ENABLE_ANISO_8X;
+		} else
+		if (cso->max_anisotropy >= 6.0) {
+			ps->en |= NV34TCL_TX_ENABLE_ANISO_6X;
+		} else
+		if (cso->max_anisotropy >= 4.0) {
+			ps->en |= NV34TCL_TX_ENABLE_ANISO_4X;
+		} else {
+			ps->en |= NV34TCL_TX_ENABLE_ANISO_2X;
+		}*/
+	}
+
+	switch (cso->mag_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		filter |= NV34TCL_TX_FILTER_MAGNIFY_LINEAR;
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:	/* unknown magnify filters are treated as NEAREST */
+		filter |= NV34TCL_TX_FILTER_MAGNIFY_NEAREST;
+		break;
+	}
+
+	switch (cso->min_img_filter) {	/* minify filter combines the image filter with the mip filter */
+	case PIPE_TEX_FILTER_LINEAR:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST;
+			break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default:
+			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR;
+			break;
+		}
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST;
+			break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default:
+			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST;
+			break;
+		}
+		break;
+	}
+
+	ps->filt = filter;
+
+/*	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+		switch (cso->compare_func) {
+		case PIPE_FUNC_NEVER:
+			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_NEVER;
+			break;
+		case PIPE_FUNC_GREATER:
+			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_GREATER;
+			break;
+		case PIPE_FUNC_EQUAL:
+			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_EQUAL;
+			break;
+		case PIPE_FUNC_GEQUAL:
+			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_GEQUAL;
+			break;
+		case PIPE_FUNC_LESS:
+			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_LESS;
+			break;
+		case PIPE_FUNC_NOTEQUAL:
+			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_NOTEQUAL;
+			break;
+		case PIPE_FUNC_LEQUAL:
+			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_LEQUAL;
+			break;
+		case PIPE_FUNC_ALWAYS:
+			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_ALWAYS;
+			break;
+		default:
+			break;
+		}
+	}*/
+
+	ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) |	/* packed as A, R, G, B bytes */
+		    (float_to_ubyte(cso->border_color[0]) << 16) |
+		    (float_to_ubyte(cso->border_color[1]) <<  8) |
+		    (float_to_ubyte(cso->border_color[2]) <<  0));
+
+	return (void *)ps;	/* released by nv30_sampler_state_delete() */
+}
+
+static void
+nv30_sampler_state_bind(struct pipe_context *pipe, unsigned unit,
+			void *hwcso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_sampler_state *ps = hwcso;
+
+	nv30->tex_sampler[unit] = ps;	/* only recorded here; nothing is emitted to the ring */
+	nv30->dirty_samplers |= (1 << unit);	/* per-unit dirty bit, checked by nv30_emit_hw_state() */
+}
+
+static void
+nv30_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	free(hwcso);	/* releases the allocation made in nv30_sampler_state_create() */
+}
+
+static void
+nv30_set_sampler_texture(struct pipe_context *pipe, unsigned unit,
+			 struct pipe_texture *miptree)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	nv30->tex_miptree[unit] = (struct nv30_miptree *)miptree;	/* struct nv30_miptree starts with a struct pipe_texture member */
+	nv30->dirty_samplers |= (1 << unit);	/* per-unit dirty bit, checked by nv30_emit_hw_state() */
+}
+
+static void *
+nv30_rasterizer_state_create(struct pipe_context *pipe,
+			     const struct pipe_rasterizer_state *cso)
+{
+	struct nv30_rasterizer_state *rs;
+	union { float f; uint32_t u; } point_size;
+	int i;
+
+	/*XXX: ignored: light_twoside, offset_cw/ccw -nohw, scissor,
+	 * 	point_smooth -nohw, multisample, offset_units / offset_scale
+	 */
+	rs = malloc(sizeof(*rs));
+	if (!rs)
+		return NULL;
+
+	rs->shade_model = cso->flatshade ? 0x1d00 : 0x1d01;
+
+	/* via unsigned: float -> unsigned char is undefined once the value exceeds 255 */
+	rs->line_width = (unsigned)(cso->line_width * 8.0) & 0xff;
+	rs->line_smooth_en = cso->line_smooth ? 1 : 0;
+	rs->line_stipple_en = cso->line_stipple_enable ? 1 : 0;
+	rs->line_stipple = (cso->line_stipple_pattern << 16) |
+			    cso->line_stipple_factor;
+
+	point_size.f = cso->point_size;	/* raw float bits through a union, not an aliasing cast */
+	rs->point_size = point_size.u;
+
+	rs->poly_smooth_en = cso->poly_smooth ? 1 : 0;
+	rs->poly_stipple_en = cso->poly_stipple_enable ? 1 : 0;
+
+	if (cso->front_winding == PIPE_WINDING_CCW) {	/* fill modes are given per winding; map them to front/back */
+		rs->front_face = NV34TCL_FRONT_FACE_CCW;
+		rs->poly_mode_front = nvgl_polygon_mode(cso->fill_ccw);
+		rs->poly_mode_back  = nvgl_polygon_mode(cso->fill_cw);
+	} else {
+		rs->front_face = NV34TCL_FRONT_FACE_CW;
+		rs->poly_mode_front = nvgl_polygon_mode(cso->fill_cw);
+		rs->poly_mode_back  = nvgl_polygon_mode(cso->fill_ccw);
+	}
+
+	switch (cso->cull_mode) {	/* cull_mode names a winding; translate it to front/back */
+	case PIPE_WINDING_CCW:
+		rs->cull_face_en = 1;
+		if (cso->front_winding == PIPE_WINDING_CCW)
+			rs->cull_face    = NV34TCL_CULL_FACE_FRONT;
+		else
+			rs->cull_face    = NV34TCL_CULL_FACE_BACK;
+		break;
+	case PIPE_WINDING_CW:
+		rs->cull_face_en = 1;
+		if (cso->front_winding == PIPE_WINDING_CW)
+			rs->cull_face    = NV34TCL_CULL_FACE_FRONT;
+		else
+			rs->cull_face    = NV34TCL_CULL_FACE_BACK;
+		break;
+	case PIPE_WINDING_BOTH:
+		rs->cull_face_en = 1;
+		rs->cull_face    = NV34TCL_CULL_FACE_FRONT_AND_BACK;
+		break;
+	case PIPE_WINDING_NONE:
+	default:
+		rs->cull_face_en = 0;
+		rs->cull_face    = 0;
+		break;
+	}
+
+	if (cso->point_sprite) {
+		rs->point_sprite = (1 << 0);	/* bit 0 enables sprites; bits 8..15 select coordinates */
+		for (i = 0; i < 8; i++) {
+			if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE)
+				rs->point_sprite |= (1 << (8 + i));
+		}
+	} else {
+		rs->point_sprite = 0;
+	}
+
+	return (void *)rs;	/* released by nv30_rasterizer_state_delete() */
+}
+
+static void
+nv30_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_rasterizer_state *rs = hwcso;	/* object built by nv30_rasterizer_state_create() */
+
+	BEGIN_RING(rankine, NV34TCL_SHADE_MODEL, 1);
+	OUT_RING  (rs->shade_model);
+
+	BEGIN_RING(rankine, NV34TCL_LINE_WIDTH, 2);	/* width, then smooth enable */
+	OUT_RING  (rs->line_width);
+	OUT_RING  (rs->line_smooth_en);
+	BEGIN_RING(rankine, NV34TCL_LINE_STIPPLE_ENABLE, 2);	/* enable, then pattern/factor word */
+	OUT_RING  (rs->line_stipple_en);
+	OUT_RING  (rs->line_stipple);
+
+	BEGIN_RING(rankine, NV34TCL_POINT_SIZE, 1);
+	OUT_RING  (rs->point_size);	/* raw bits of the float point size */
+
+	BEGIN_RING(rankine, NV34TCL_POLYGON_MODE_FRONT, 6);	/* six consecutive words in this fixed order */
+	OUT_RING  (rs->poly_mode_front);
+	OUT_RING  (rs->poly_mode_back);
+	OUT_RING  (rs->cull_face);
+	OUT_RING  (rs->front_face);
+	OUT_RING  (rs->poly_smooth_en);
+	OUT_RING  (rs->cull_face_en);
+
+	BEGIN_RING(rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1);
+	OUT_RING  (rs->poly_stipple_en);
+
+	BEGIN_RING(rankine, NV34TCL_POINT_SPRITE, 1);
+	OUT_RING  (rs->point_sprite);
+}
+
+static void
+nv30_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	free(hwcso);	/* releases the allocation made in nv30_rasterizer_state_create() */
+}
+
+static void
+nv30_translate_stencil(const struct pipe_depth_stencil_alpha_state *cso,
+		       unsigned idx, struct nv30_stencil_push *hw)
+{	/* fills *hw from cso->stencil[idx]; the caller passes idx 0 for front and 1 for back */
+	hw->enable = cso->stencil[idx].enabled ? 1 : 0;
+	hw->wmask = cso->stencil[idx].write_mask;
+	hw->func = nvgl_comparison_op(cso->stencil[idx].func);
+	hw->ref	= cso->stencil[idx].ref_value;
+	hw->vmask = cso->stencil[idx].value_mask;
+	hw->fail = nvgl_stencil_op(cso->stencil[idx].fail_op);
+	hw->zfail = nvgl_stencil_op(cso->stencil[idx].zfail_op);
+	hw->zpass = nvgl_stencil_op(cso->stencil[idx].zpass_op);
+}
+
+static void *
+nv30_depth_stencil_alpha_state_create(struct pipe_context *pipe,
+			const struct pipe_depth_stencil_alpha_state *cso)
+{
+	struct nv30_depth_stencil_alpha_state *hw = malloc(sizeof(*hw));
+
+	if (!hw)	/* out of memory: return NULL instead of writing through it */
+		return NULL;
+	hw->depth.func		= nvgl_comparison_op(cso->depth.func);
+	hw->depth.write_enable	= cso->depth.writemask ? 1 : 0;
+	hw->depth.test_enable	= cso->depth.enabled ? 1 : 0;
+
+	nv30_translate_stencil(cso, 0, &hw->stencil.front);	/* stencil[0] is the front face */
+	nv30_translate_stencil(cso, 1, &hw->stencil.back);	/* stencil[1] is the back face */
+
+	hw->alpha.enabled = cso->alpha.enabled ? 1 : 0;
+	hw->alpha.func = nvgl_comparison_op(cso->alpha.func);
+	hw->alpha.ref  = float_to_ubyte(cso->alpha.ref);
+
+	return (void *)hw;	/* released by nv30_depth_stencil_alpha_state_delete() */
+}
+
+static void
+nv30_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_depth_stencil_alpha_state *hw = hwcso;
+
+	BEGIN_RING(rankine, NV34TCL_DEPTH_FUNC, 3);
+	OUT_RINGp ((uint32_t *)&hw->depth, 3);	/* the three depth words are pushed in struct order */
+	BEGIN_RING(rankine, NV34TCL_STENCIL_FRONT_ENABLE, 16);	/* 16 words: front block, then back block */
+	OUT_RINGp ((uint32_t *)&hw->stencil.front, 8);	/* struct nv30_stencil_push is eight uint32_t words */
+	OUT_RINGp ((uint32_t *)&hw->stencil.back, 8);
+	BEGIN_RING(rankine, NV34TCL_ALPHA_FUNC_ENABLE, 3);
+	OUT_RINGp ((uint32_t *)&hw->alpha.enabled, 3);	/* enabled, func, ref are consecutive in the struct */
+}
+
+static void
+nv30_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	free(hwcso);	/* releases the allocation made in nv30_depth_stencil_alpha_state_create() */
+}
+
+static void *
+nv30_vp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv30_vertex_program *vp = calloc(1, sizeof(*vp));
+
+	if (vp)		/* on allocation failure NULL is returned untouched */
+		vp->pipe = cso;	/* pointer is kept, not copied; all other fields stay zero */
+
+	return (void *)vp;
+}
+
+static void
+nv30_vp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_vertex_program *vp = hwcso;
+
+	nv30->vertprog.current = vp;	/* only recorded here; nothing is emitted to the ring */
+	nv30->dirty |= NV30_NEW_VERTPROG;	/* nv30_emit_hw_state() calls nv30_vertprog_bind() when this is set */
+}
+
+static void
+nv30_vp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_vertex_program *vp = hwcso;
+
+	nv30_vertprog_destroy(nv30, vp);	/* defined outside this file; presumably frees what the program owns -- TODO confirm */
+	free(vp);	/* then the calloc'd object from nv30_vp_state_create() */
+}
+
+static void *
+nv30_fp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv30_fragment_program *fp = calloc(1, sizeof(*fp));
+
+	if (fp)		/* on allocation failure NULL is returned untouched */
+		fp->pipe = cso;	/* pointer is kept, not copied; all other fields stay zero */
+
+	return (void *)fp;
+}
+
+static void
+nv30_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_fragment_program *fp = hwcso;
+
+	nv30->fragprog.current = fp;	/* only recorded here; nothing is emitted to the ring */
+	nv30->dirty |= NV30_NEW_FRAGPROG;	/* nv30_emit_hw_state() calls nv30_fragprog_bind() when this is set */
+}
+
+static void
+nv30_fp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_fragment_program *fp = hwcso;
+
+	nv30_fragprog_destroy(nv30, fp);	/* defined outside this file; presumably frees what the program owns -- TODO confirm */
+	free(fp);	/* then the calloc'd object from nv30_fp_state_create() */
+}
+
+static void
+nv30_set_blend_color(struct pipe_context *pipe,
+		     const struct pipe_blend_color *bcol)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	BEGIN_RING(rankine, NV34TCL_BLEND_FUNC_COLOR, 1);	/* emitted immediately; nothing is cached */
+	OUT_RING  ((float_to_ubyte(bcol->color[3]) << 24) |	/* packed as bytes: color[3], [0], [1], [2] */
+		   (float_to_ubyte(bcol->color[0]) << 16) |
+		   (float_to_ubyte(bcol->color[1]) <<  8) |
+		   (float_to_ubyte(bcol->color[2]) <<  0));
+}
+
+static void
+nv30_set_clip_state(struct pipe_context *pipe,
+		    const struct pipe_clip_state *clip)
+{	/* no-op: the clip state is accepted but nothing is stored or emitted */
+}
+
+static void
+nv30_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
+			 const struct pipe_constant_buffer *buf )
+{	/* `index` is not used: one constant buffer is tracked per shader stage */
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	if (shader == PIPE_SHADER_VERTEX) {
+		nv30->vertprog.constant_buf = buf->buffer;
+		nv30->dirty |= NV30_NEW_VERTPROG;	/* flag the vertex program for re-validation */
+	} else
+	if (shader == PIPE_SHADER_FRAGMENT) {
+		nv30->fragprog.constant_buf = buf->buffer;
+		nv30->dirty |= NV30_NEW_FRAGPROG;	/* flag the fragment program for re-validation */
+	}	/* any other shader value is silently ignored */
+}
+
+static void
+nv30_set_framebuffer_state(struct pipe_context *pipe,
+			   const struct pipe_framebuffer_state *fb)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct pipe_surface *rt[4], *zeta = NULL;
+	uint32_t rt_enable, rt_format, w = 0, h = 0;	/* zero size is emitted when nothing is bound */
+	int i, colour_format = 0, zeta_format = 0;
+
+	rt_enable = 0;
+	for (i = 0; i < 4; i++) {
+		if (!fb->cbufs[i])
+			continue;
+
+		if (colour_format) {	/* later buffers are only checked against the first one */
+			assert(w == fb->cbufs[i]->width);
+			assert(h == fb->cbufs[i]->height);
+			assert(colour_format == fb->cbufs[i]->format);
+		} else {	/* NOTE(review): only the first bound buffer is enabled -- confirm MRT intent */
+			w = fb->cbufs[i]->width;
+			h = fb->cbufs[i]->height;
+			colour_format = fb->cbufs[i]->format;
+			rt_enable |= (NV34TCL_RT_ENABLE_COLOR0 << i);
+			rt[i] = fb->cbufs[i];
+		}
+	}
+
+	if (rt_enable & (NV34TCL_RT_ENABLE_COLOR1 | NV34TCL_RT_ENABLE_COLOR2 |
+			 NV34TCL_RT_ENABLE_COLOR3))
+		rt_enable |= NV34TCL_RT_ENABLE_MRT;
+
+	if (fb->zsbuf) {
+		if (colour_format) {
+			assert(w == fb->zsbuf->width);
+			assert(h == fb->zsbuf->height);
+		} else {	/* depth-only: take the size from the zeta surface */
+			w = fb->zsbuf->width;
+			h = fb->zsbuf->height;
+		}
+
+		zeta_format = fb->zsbuf->format;
+		zeta = fb->zsbuf;
+	}
+
+	rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
+
+	switch (colour_format) {
+	case PIPE_FORMAT_A8R8G8B8_UNORM:
+	case 0:	/* no colour buffer bound: default to A8R8G8B8 */
+		rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8;
+		break;
+	case PIPE_FORMAT_R5G6B5_UNORM:
+		rt_format |= NV34TCL_RT_FORMAT_COLOR_R5G6B5;
+		break;
+	default:
+		assert(0);
+	}
+
+	switch (zeta_format) {
+	case PIPE_FORMAT_Z16_UNORM:
+		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z16;
+		break;
+	case PIPE_FORMAT_Z24S8_UNORM:
+	case 0:	/* no zeta buffer bound: default to Z24S8 */
+		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z24S8;
+		break;
+	default:
+		assert(0);
+	}
+
+	if (rt_enable & NV34TCL_RT_ENABLE_COLOR0) {
+		BEGIN_RING(rankine, NV34TCL_COLOR0_PITCH, 1);
+		OUT_RING  (rt[0]->pitch * rt[0]->cpp);
+		nv30->rt[0] = rt[0]->buffer;	/* relocs are emitted later by nv30_emit_hw_state() */
+	}
+
+	if (rt_enable & NV34TCL_RT_ENABLE_COLOR1) {
+		BEGIN_RING(rankine, NV34TCL_COLOR1_PITCH, 1);	/* one data word; a count of 2 swallowed the next header */
+		OUT_RING  (rt[1]->pitch * rt[1]->cpp);
+		nv30->rt[1] = rt[1]->buffer;
+	}
+
+	if (zeta_format) {
+		BEGIN_RING(rankine, NV34TCL_ZETA_PITCH, 1);
+		OUT_RING  (zeta->pitch * zeta->cpp);
+		nv30->zeta = zeta->buffer;	/* NOTE(review): not cleared when no zsbuf is bound -- confirm */
+	}
+
+	nv30->rt_enable = rt_enable;
+	BEGIN_RING(rankine, NV34TCL_RT_ENABLE, 1);
+	OUT_RING  (rt_enable);
+	BEGIN_RING(rankine, NV34TCL_RT_HORIZ, 3);	/* size in the high half, origin 0 in the low half */
+	OUT_RING  ((w << 16) | 0);
+	OUT_RING  ((h << 16) | 0);
+	OUT_RING  (rt_format);
+	BEGIN_RING(rankine, NV34TCL_VIEWPORT_HORIZ, 2);
+	OUT_RING  ((w << 16) | 0);
+	OUT_RING  ((h << 16) | 0);
+	BEGIN_RING(rankine, NV34TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	OUT_RING  (((w - 1) << 16) | 0);
+	OUT_RING  (((h - 1) << 16) | 0);
+}
+
+static void
+nv30_set_polygon_stipple(struct pipe_context *pipe,
+			 const struct pipe_poly_stipple *stipple)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	BEGIN_RING(rankine, NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32);	/* emitted immediately; nothing is cached */
+	OUT_RINGp ((uint32_t *)stipple->stipple, 32);	/* 32 words copied straight from the state object */
+}
+
+static void
+nv30_set_scissor_state(struct pipe_context *pipe,
+		       const struct pipe_scissor_state *s)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	BEGIN_RING(rankine, NV34TCL_SCISSOR_HORIZ, 2);	/* emitted immediately; nothing is cached */
+	OUT_RING  (((s->maxx - s->minx) << 16) | s->minx);	/* extent in the high half, origin in the low half */
+	OUT_RING  (((s->maxy - s->miny) << 16) | s->miny);
+}
+
+static void
+nv30_set_viewport_state(struct pipe_context *pipe,
+			const struct pipe_viewport_state *vpt)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	BEGIN_RING(rankine, NV34TCL_VIEWPORT_TRANSLATE_X, 8);	/* eight floats: four translate, then four scale */
+	OUT_RINGf (vpt->translate[0]);
+	OUT_RINGf (vpt->translate[1]);
+	OUT_RINGf (vpt->translate[2]);
+	OUT_RINGf (vpt->translate[3]);
+	OUT_RINGf (vpt->scale[0]);
+	OUT_RINGf (vpt->scale[1]);
+	OUT_RINGf (vpt->scale[2]);
+	OUT_RINGf (vpt->scale[3]);
+}
+
+static void
+nv30_set_vertex_buffer(struct pipe_context *pipe, unsigned index,
+		       const struct pipe_vertex_buffer *vb)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	nv30->vtxbuf[index] = *vb;	/* struct copy; the caller's object is not referenced afterwards */
+
+	nv30->dirty |= NV30_NEW_ARRAYS;	/* marks the vertex array state dirty; nothing is emitted here */
+}
+
+static void
+nv30_set_vertex_element(struct pipe_context *pipe, unsigned index,
+			const struct pipe_vertex_element *ve)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	nv30->vtxelt[index] = *ve;	/* struct copy; the caller's object is not referenced afterwards */
+
+	nv30->dirty |= NV30_NEW_ARRAYS;	/* marks the vertex array state dirty; nothing is emitted here */
+}
+
+void
+nv30_init_state_functions(struct nv30_context *nv30)
+{	/* installs this file's static state handlers into the nv30->pipe function table */
+	nv30->pipe.create_blend_state = nv30_blend_state_create;
+	nv30->pipe.bind_blend_state = nv30_blend_state_bind;
+	nv30->pipe.delete_blend_state = nv30_blend_state_delete;
+
+	nv30->pipe.create_sampler_state = nv30_sampler_state_create;
+	nv30->pipe.bind_sampler_state = nv30_sampler_state_bind;
+	nv30->pipe.delete_sampler_state = nv30_sampler_state_delete;
+	nv30->pipe.set_sampler_texture = nv30_set_sampler_texture;
+
+	nv30->pipe.create_rasterizer_state = nv30_rasterizer_state_create;
+	nv30->pipe.bind_rasterizer_state = nv30_rasterizer_state_bind;
+	nv30->pipe.delete_rasterizer_state = nv30_rasterizer_state_delete;
+
+	nv30->pipe.create_depth_stencil_alpha_state =
+		nv30_depth_stencil_alpha_state_create;
+	nv30->pipe.bind_depth_stencil_alpha_state =
+		nv30_depth_stencil_alpha_state_bind;
+	nv30->pipe.delete_depth_stencil_alpha_state =
+		nv30_depth_stencil_alpha_state_delete;
+
+	nv30->pipe.create_vs_state = nv30_vp_state_create;	/* vertex shader hooks map to the vp_* handlers */
+	nv30->pipe.bind_vs_state = nv30_vp_state_bind;
+	nv30->pipe.delete_vs_state = nv30_vp_state_delete;
+
+	nv30->pipe.create_fs_state = nv30_fp_state_create;	/* fragment shader hooks map to the fp_* handlers */
+	nv30->pipe.bind_fs_state = nv30_fp_state_bind;
+	nv30->pipe.delete_fs_state = nv30_fp_state_delete;
+
+	nv30->pipe.set_blend_color = nv30_set_blend_color;
+	nv30->pipe.set_clip_state = nv30_set_clip_state;
+	nv30->pipe.set_constant_buffer = nv30_set_constant_buffer;
+	nv30->pipe.set_framebuffer_state = nv30_set_framebuffer_state;
+	nv30->pipe.set_polygon_stipple = nv30_set_polygon_stipple;
+	nv30->pipe.set_scissor_state = nv30_set_scissor_state;
+	nv30->pipe.set_viewport_state = nv30_set_viewport_state;
+
+	nv30->pipe.set_vertex_buffer = nv30_set_vertex_buffer;
+	nv30->pipe.set_vertex_element = nv30_set_vertex_element;
+}
+
diff --git a/src/mesa/pipe/nv30/nv30_state.h b/src/mesa/pipe/nv30/nv30_state.h
new file mode 100644
index 0000000000..233600f69a
--- /dev/null
+++ b/src/mesa/pipe/nv30/nv30_state.h
@@ -0,0 +1,147 @@
+#ifndef __NV30_STATE_H__
+#define __NV30_STATE_H__
+
+#include "pipe/p_state.h"
+
+struct nv30_blend_state {	/* pre-translated words pushed by nv30_blend_state_bind() */
+	uint32_t b_enable;	/* blend enable, 0 or 1 */
+	uint32_t b_srcfunc;	/* alpha source factor << 16 | rgb source factor */
+	uint32_t b_dstfunc;	/* alpha dest factor << 16 | rgb dest factor */
+	uint32_t b_eqn;	/* alpha equation << 16 | rgb equation */
+
+	uint32_t l_enable;	/* logic op enable, 0 or 1 */
+	uint32_t l_op;	/* translated logic op */
+
+	uint32_t c_mask;	/* colour write mask, one bit per byte: A<<24, R<<16, G<<8, B<<0 */
+
+	uint32_t d_enable;	/* dither enable, 0 or 1 */
+};
+
+struct nv30_sampler_state {	/* filled by nv30_sampler_state_create() */
+	uint32_t fmt;	/* NV34TCL_TX_FORMAT_RECT when coordinates are not normalized, else 0 */
+	uint32_t wrap;	/* S/T/R wrap fields, plus bit 5 when max_anisotropy >= 2.0 */
+	uint32_t en;	/* currently always 0 (anisotropy levels are commented out) */
+	uint32_t filt;	/* magnify and minify filter bits */
+	uint32_t bcol;	/* border colour packed as bytes: A, R, G, B */
+};
+
+struct nv30_rasterizer_state {	/* filled by nv30_rasterizer_state_create(), pushed by nv30_rasterizer_state_bind() */
+	uint32_t shade_model;	/* 0x1d00 for flat shading, 0x1d01 otherwise */
+
+	uint32_t line_width;	/* line width * 8, low 8 bits */
+	uint32_t line_smooth_en;
+	uint32_t line_stipple_en;
+	uint32_t line_stipple;	/* pattern << 16 | factor */
+
+	uint32_t point_size;	/* raw bits of the float point size */
+
+	uint32_t poly_smooth_en;
+	uint32_t poly_stipple_en;
+	
+	uint32_t poly_mode_front;
+	uint32_t poly_mode_back;
+
+	uint32_t front_face;	/* NV34TCL_FRONT_FACE_CCW or NV34TCL_FRONT_FACE_CW */
+	uint32_t cull_face;	/* NV34TCL_CULL_FACE_* value, 0 when culling is off */
+	uint32_t cull_face_en;
+
+	uint32_t point_sprite;	/* bit 0 enables sprites; bits 8..15 flag sprite coordinates 0..7 */
+};
+
+struct nv30_vertex_program_exec {	/* NOTE(review): producers/consumers are outside this chunk; comments below are read from names only */
+	uint32_t data[4];	/* presumably one four-word instruction -- TODO confirm */
+	boolean has_branch_offset;
+	int const_index;
+};
+
+struct nv30_vertex_program_data {	/* one four-float value used by a vertex program */
+	int index; /* immediates == -1 */
+	float value[4];	/* NOTE(review): presumably only meaningful for immediates -- confirm against the translator */
+};
+
+struct nv30_vertex_program {	/* calloc'd by nv30_vp_state_create(); only `pipe` is set there */
+	const struct pipe_shader_state *pipe;	/* shader state the program was created from (not copied) */
+
+	boolean translated;	/* NOTE(review): remaining fields are maintained outside this chunk */
+	struct nv30_vertex_program_exec *insns;
+	unsigned nr_insns;
+	struct nv30_vertex_program_data *consts;
+	unsigned nr_consts;
+
+	struct nouveau_resource *exec;
+	unsigned exec_start;
+	struct nouveau_resource *data;
+	unsigned data_start;
+	unsigned data_start_min;
+
+	uint32_t ir;
+	uint32_t or;
+};
+
+struct nv30_fragment_program_data {	/* NOTE(review): users are outside this chunk; meaning read from names only */
+	unsigned offset;
+	unsigned index;
+};
+
+struct nv30_fragment_program {	/* calloc'd by nv30_fp_state_create(); only `pipe` is set there */
+	const struct pipe_shader_state *pipe;	/* shader state the program was created from (not copied) */
+
+	boolean translated;	/* NOTE(review): remaining fields are maintained outside this chunk */
+	boolean on_hw;
+	unsigned samplers;
+
+	uint32_t *insn;
+	int       insn_len;
+
+	struct nv30_fragment_program_data *consts;
+	unsigned nr_consts;
+
+	struct pipe_buffer *buffer;	/* relocated as the active program by nv30_emit_hw_state() */
+
+	uint32_t fp_control;
+	uint32_t fp_reg_control;
+};
+
+struct nv30_stencil_push {	/* eight words pushed verbatim with OUT_RINGp; field order is the emission order */
+	uint32_t enable;
+	uint32_t wmask;	/* stencil write mask */
+	uint32_t func;	/* translated comparison op */
+	uint32_t ref;	/* reference value */
+	uint32_t vmask;	/* stencil value mask */
+	uint32_t fail;	/* translated op for stencil fail */
+	uint32_t zfail;	/* translated op for depth fail */
+	uint32_t zpass;	/* translated op for depth pass */
+};
+
+struct nv30_depth_stencil_alpha_state {	/* each sub-struct is pushed verbatim by nv30_depth_stencil_alpha_state_bind() */
+	struct {	/* three words pushed at NV34TCL_DEPTH_FUNC */
+		uint32_t func;
+		uint32_t write_enable;
+		uint32_t test_enable;
+	} depth;
+
+	struct {	/* bind pushes `front` first, then `back`, regardless of member order */
+		struct nv30_stencil_push back;
+		struct nv30_stencil_push front;
+	} stencil;
+
+	struct {	/* three words pushed at NV34TCL_ALPHA_FUNC_ENABLE */
+		uint32_t enabled;
+		uint32_t func;
+		uint32_t ref;	/* alpha reference converted with float_to_ubyte() */
+	} alpha;
+};
+
+struct nv30_miptree {
+	struct pipe_texture base;	/* first member, so a pipe_texture pointer can be cast to nv30_miptree */
+
+	struct pipe_buffer *buffer;	/* storage referenced by surfaces from nv30_get_tex_surface() */
+	uint total_size;
+
+	struct {
+		uint pitch;	/* appears to be in bytes: nv30_get_tex_surface() divides it by cpp */
+		uint *image_offset;	/* indexed by cube face, 3D zslice, or 0 for other targets */
+	} level[PIPE_MAX_TEXTURE_LEVELS];
+};
+
+#endif
diff --git a/src/mesa/pipe/nv30/nv30_state_emit.c b/src/mesa/pipe/nv30/nv30_state_emit.c
new file mode 100644
index 0000000000..70b98836f0
--- /dev/null
+++ b/src/mesa/pipe/nv30/nv30_state_emit.c
@@ -0,0 +1,83 @@
+#include "nv30_context.h"
+#include "nv30_state.h"
+
+void
+nv30_emit_hw_state(struct nv30_context *nv30)
+{	/* validates dirty program/sampler state, then emits relocations for every referenced buffer */
+	int i;
+
+	if (nv30->dirty & NV30_NEW_FRAGPROG) {
+		nv30_fragprog_bind(nv30, nv30->fragprog.current);
+		/*XXX: clear NV30_NEW_FRAGPROG if no new program uploaded */
+	}
+
+	if (nv30->dirty_samplers || (nv30->dirty & NV30_NEW_FRAGPROG)) {	/* textures are re-bound when the program changes too */
+		nv30_fragtex_bind(nv30);
+/*
+		BEGIN_RING(rankine, NV34TCL_TX_CACHE_CTL, 1);
+		OUT_RING  (2);
+		BEGIN_RING(rankine, NV34TCL_TX_CACHE_CTL, 1);
+		OUT_RING  (1);*/
+		nv30->dirty &= ~NV30_NEW_FRAGPROG;	/* the flag is cleared here, not in the branch above */
+	}
+
+	if (nv30->dirty & NV30_NEW_VERTPROG) {
+		nv30_vertprog_bind(nv30, nv30->vertprog.current);
+		nv30->dirty &= ~NV30_NEW_VERTPROG;
+	}
+
+	nv30->dirty_samplers = 0;
+
+	/* Emit relocs for every referenced buffer.
+	 * This is to ensure the bufmgr has an accurate idea of how
+	 * the buffer is used.  This isn't very efficient, but we don't
+	 * seem to take a significant performance hit.  Will be improved
+	 * at some point.  Vertex arrays are emitted by nv30_vbo.c
+	 */
+
+	/* Render targets */
+	if (nv30->rt_enable & NV34TCL_RT_ENABLE_COLOR0) {	/* rt_enable and rt[] are set by nv30_set_framebuffer_state() */
+		BEGIN_RING(rankine, NV34TCL_DMA_COLOR0, 1);
+		OUT_RELOCo(nv30->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(rankine, NV34TCL_COLOR0_OFFSET, 1);
+		OUT_RELOCl(nv30->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	}
+
+	if (nv30->rt_enable & NV34TCL_RT_ENABLE_COLOR1) {
+		BEGIN_RING(rankine, NV34TCL_DMA_COLOR1, 1);
+		OUT_RELOCo(nv30->rt[1], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(rankine, NV34TCL_COLOR1_OFFSET, 1);
+		OUT_RELOCl(nv30->rt[1], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	}
+
+	if (nv30->zeta) {	/* set by nv30_set_framebuffer_state() when a depth/stencil surface is bound */
+		BEGIN_RING(rankine, NV34TCL_DMA_ZETA, 1);
+		OUT_RELOCo(nv30->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(rankine, NV34TCL_ZETA_OFFSET, 1);
+		OUT_RELOCl(nv30->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		/* XXX allocate LMA */
+/*		BEGIN_RING(rankine, NV34TCL_LMA_DEPTH_OFFSET, 1);
+		OUT_RING(0);*/
+	}
+
+	/* Texture images */
+	for (i = 0; i < 16; i++) {
+		if (!(nv30->fp_samplers & (1 << i)))	/* skip units whose fp_samplers bit is clear */
+			continue;
+		BEGIN_RING(rankine, NV34TCL_TX_OFFSET(i), 2);	/* offset word, then format word */
+		OUT_RELOCl(nv30->tex[i].buffer, 0, NOUVEAU_BO_VRAM |
+			   NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+		OUT_RELOCd(nv30->tex[i].buffer, nv30->tex[i].format,
+			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
+			   NOUVEAU_BO_OR, NV34TCL_TX_FORMAT_DMA0,
+			   NV34TCL_TX_FORMAT_DMA1);
+	}
+
+	/* Fragment program */
+	BEGIN_RING(rankine, NV34TCL_FP_ACTIVE_PROGRAM, 1);	/* NOTE(review): fragprog.active is dereferenced unconditionally -- confirm it is always set */
+	OUT_RELOC (nv30->fragprog.active->buffer, 0, NOUVEAU_BO_VRAM |
+	           NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
+		   NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0,
+		   NV34TCL_FP_ACTIVE_PROGRAM_DMA1);
+}
+
diff --git a/src/mesa/pipe/nv30/nv30_surface.c b/src/mesa/pipe/nv30/nv30_surface.c
new file mode 100644
index 0000000000..31745e3d6e
--- /dev/null
+++ b/src/mesa/pipe/nv30/nv30_surface.c
@@ -0,0 +1,136 @@
+
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "nv30_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_util.h"
+#include "pipe/p_winsys.h"
+#include "pipe/p_inlines.h"
+#include "pipe/util/p_tile.h"
+
+static boolean
+nv30_surface_format_supported(struct pipe_context *pipe,
+			      enum pipe_format format, uint type)
+{	/* TRUE when `format` is in the list for the given usage `type`, FALSE otherwise */
+	switch (type) {
+	case PIPE_SURFACE:	/* same four formats that nv30_set_framebuffer_state() accepts */
+		switch (format) {
+		case PIPE_FORMAT_A8R8G8B8_UNORM:
+		case PIPE_FORMAT_R5G6B5_UNORM: 
+		case PIPE_FORMAT_Z24S8_UNORM:
+		case PIPE_FORMAT_Z16_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+		break;
+	case PIPE_TEXTURE:
+		switch (format) {
+		case PIPE_FORMAT_A8R8G8B8_UNORM:
+		case PIPE_FORMAT_A1R5G5B5_UNORM:
+		case PIPE_FORMAT_A4R4G4B4_UNORM:
+		case PIPE_FORMAT_R5G6B5_UNORM: 
+		case PIPE_FORMAT_U_L8:
+		case PIPE_FORMAT_U_A8:
+		case PIPE_FORMAT_U_I8:
+		case PIPE_FORMAT_U_A8_L8:
+		case PIPE_FORMAT_Z16_UNORM:
+		case PIPE_FORMAT_Z24S8_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+		break;
+	default:	/* unknown usage type: asserts in debug builds, FALSE otherwise */
+		assert(0);
+	};
+
+	return FALSE;
+}
+
+static struct pipe_surface *
+nv30_get_tex_surface(struct pipe_context *pipe, struct pipe_texture *pt,
+                     unsigned face, unsigned level, unsigned zslice)
+{	/* returns a new surface for one image of the texture, or NULL if surface_alloc fails */
+	struct pipe_winsys *ws = pipe->winsys;
+	struct nv30_miptree *nv30mt = (struct nv30_miptree *)pt;
+	struct pipe_surface *ps;
+
+	ps = ws->surface_alloc(ws);
+	if (!ps)
+		return NULL;
+	pipe_buffer_reference(ws, &ps->buffer, nv30mt->buffer);	/* the surface refers to the miptree's buffer */
+	ps->format = pt->format;
+	ps->cpp = pt->cpp;
+	ps->width = pt->width[level];
+	ps->height = pt->height[level];
+	ps->pitch = nv30mt->level[level].pitch / ps->cpp;	/* level pitch divided by bytes per pixel */
+
+	if (pt->target == PIPE_TEXTURE_CUBE) {	/* cube maps select the image by face */
+		ps->offset = nv30mt->level[level].image_offset[face];
+	} else
+	if (pt->target == PIPE_TEXTURE_3D) {	/* 3D textures select the image by zslice */
+		ps->offset = nv30mt->level[level].image_offset[zslice];
+	} else {
+		ps->offset = nv30mt->level[level].image_offset[0];
+	}
+
+	return ps;
+}
+
+static void
+nv30_surface_copy(struct pipe_context *pipe, struct pipe_surface *dest,
+		  unsigned destx, unsigned desty, struct pipe_surface *src,
+		  unsigned srcx, unsigned srcy, unsigned width, unsigned height)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nouveau_winsys *nvws = nv30->nvws;
+
+	nvws->surface_copy(nvws, dest, destx, desty, src, srcx, srcy,	/* pure forward to the winsys; arguments unchanged */
+			   width, height);
+}
+
+static void
+nv30_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest,
+		  unsigned destx, unsigned desty, unsigned width,
+		  unsigned height, unsigned value)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nouveau_winsys *nvws = nv30->nvws;
+
+	nvws->surface_fill(nvws, dest, destx, desty, width, height, value);	/* pure forward to the winsys; arguments unchanged */
+}
+
+void
+nv30_init_surface_functions(struct nv30_context *nv30)
+{	/* installs this file's static surface handlers into the nv30->pipe function table */
+	nv30->pipe.is_format_supported = nv30_surface_format_supported;
+	nv30->pipe.get_tex_surface = nv30_get_tex_surface;
+	nv30->pipe.surface_copy = nv30_surface_copy;
+	nv30->pipe.surface_fill = nv30_surface_fill;
+}
diff --git a/src/mesa/pipe/nv30/nv30_vbo.c b/src/mesa/pipe/nv30/nv30_vbo.c
new file mode 100644
index 0000000000..e6c50d3820
--- /dev/null
+++ b/src/mesa/pipe/nv30/nv30_vbo.c
@@ -0,0 +1,406 @@
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "pipe/p_util.h"
+
+#include "nv30_context.h"
+#include "nv30_state.h"
+
+#include "pipe/nouveau/nouveau_channel.h"
+#include "pipe/nouveau/nouveau_pushbuf.h"
+
+/* Count how many of the x/y/z/w channels of a pipe format have a
+ * non-zero size, i.e. the number of components present (0..4).
+ */
+static INLINE int
+nv30_vbo_ncomp(uint format)
+{
+	return (pf_size_x(format) ? 1 : 0) +
+	       (pf_size_y(format) ? 1 : 0) +
+	       (pf_size_z(format) ? 1 : 0) +
+	       (pf_size_w(format) ? 1 : 0);
+}
+
+/* Map the component type of a pipe vertex format onto the hardware
+ * NV34TCL_VERTEX_ARRAY_FORMAT_TYPE_* value.
+ *
+ * Only FLOAT and UNORM are handled.  Any other type trips the assert in
+ * debug builds; in builds with asserts disabled the function used to fall
+ * off its end without returning a value, so it now returns the FLOAT
+ * encoding as a defined fallback.
+ */
+static INLINE int
+nv30_vbo_type(uint format)
+{
+	switch (pf_type(format)) {
+	case PIPE_FORMAT_TYPE_FLOAT:
+		return NV34TCL_VERTEX_ARRAY_FORMAT_TYPE_FLOAT;
+	case PIPE_FORMAT_TYPE_UNORM:
+		return NV34TCL_VERTEX_ARRAY_FORMAT_TYPE_UBYTE;
+	default:
+		assert(0);
+		break;
+	}
+
+	return NV34TCL_VERTEX_ARRAY_FORMAT_TYPE_FLOAT;
+}
+
+/* Emit a constant (pitch == 0) vertex attribute as an immediate 4-float
+ * value instead of sourcing it from a vertex buffer.
+ *
+ * Only float formats with 1 to 4 components are supported; missing
+ * components are filled in as (0, 0, 0, 1).
+ *
+ * Returns TRUE when the attribute was emitted, FALSE when the format is
+ * unsupported or the buffer could not be mapped.  On FALSE nothing has
+ * been written to the command stream, so the caller can fall back to a
+ * regular vertex array.
+ */
+static boolean
+nv30_vbo_static_attrib(struct nv30_context *nv30, int attrib,
+		       struct pipe_vertex_element *ve,
+		       struct pipe_vertex_buffer *vb)
+{
+	struct pipe_winsys *ws = nv30->pipe.winsys;
+	float out[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
+	int type, ncomp, i;
+	const float *v;
+	void *map;
+
+	type = nv30_vbo_type(ve->src_format);
+	ncomp = nv30_vbo_ncomp(ve->src_format);
+
+	/* Reject unsupported layouts before mapping the buffer or opening a
+	 * packet: a packet header without its payload corrupts the stream.
+	 */
+	if (type != NV34TCL_VERTEX_ARRAY_FORMAT_TYPE_FLOAT ||
+	    ncomp < 1 || ncomp > 4)
+		return FALSE;
+
+	map = ws->buffer_map(ws, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ);
+	if (!map)
+		return FALSE;
+
+	/* Byte offsets: do the arithmetic on char *, not on void * */
+	v = (const float *)((const char *)map +
+			    vb->buffer_offset + ve->src_offset);
+	for (i = 0; i < ncomp; i++)
+		out[i] = v[i];
+
+	BEGIN_RING(rankine, NV34TCL_VERTEX_ATTR_4F_X(attrib), 4);
+	OUT_RINGf(out[0]);
+	OUT_RINGf(out[1]);
+	OUT_RINGf(out[2]);
+	OUT_RINGf(out[3]);
+
+	ws->buffer_unmap(ws, vb->buffer);
+
+	return TRUE;
+}
+
+/* Rebuild the vertex array format table for the active vertex program.
+ *
+ * For every input slot up to the highest one the program reads (vp->ir),
+ * either emit the attribute as an immediate (zero-pitch buffers handled
+ * by nv30_vbo_static_attrib) or record the buffer/offset in nv30->vb[]
+ * and mark the slot in nv30->vb_enable.  The resulting format words are
+ * then written to NV34TCL_VERTEX_ARRAY_FORMAT(0..num_hw-1).
+ */
+static void
+nv30_vbo_arrays_update(struct nv30_context *nv30)
+{
+	struct nv30_vertex_program *vp = nv30->vertprog.active;
+	uint32_t inputs, vtxfmt[16];
+	/* num_hw must start defined: when the program reads no inputs the
+	 * search loop below never assigns it, and it is incremented right
+	 * after.  Starting at 0 yields a single (unused, FLOAT) slot.
+	 */
+	int hw, num_hw = 0;
+
+	nv30->vb_enable = 0;
+
+	/* Find the index of the highest input the program uses */
+	inputs = vp->ir;
+	for (hw = 0; hw < 16 && inputs; hw++) {
+		if (inputs & (1 << hw)) {
+			num_hw = hw;
+			inputs &= ~(1 << hw);
+		}
+	}
+	num_hw++;
+
+	inputs = vp->ir;
+	for (hw = 0; hw < num_hw; hw++) {
+		struct pipe_vertex_element *ve;
+		struct pipe_vertex_buffer *vb;
+
+		/* Unused slot: just give it a format word */
+		if (!(inputs & (1 << hw))) {
+			vtxfmt[hw] = NV34TCL_VERTEX_ARRAY_FORMAT_TYPE_FLOAT;
+			continue;
+		}
+
+		ve = &nv30->vtxelt[hw];
+		vb = &nv30->vtxbuf[ve->vertex_buffer_index];
+
+		/* Constant attribute: try the immediate path first and fall
+		 * through to a regular array if that is not possible.
+		 */
+		if (vb->pitch == 0) {
+			vtxfmt[hw] = NV34TCL_VERTEX_ARRAY_FORMAT_TYPE_FLOAT;
+			if (nv30_vbo_static_attrib(nv30, hw, ve, vb) == TRUE)
+				continue;
+		}
+
+		nv30->vb_enable |= (1 << hw);
+		nv30->vb[hw].delta = vb->buffer_offset + ve->src_offset;
+		nv30->vb[hw].buffer = vb->buffer;
+
+		vtxfmt[hw] = ((vb->pitch << NV34TCL_VERTEX_ARRAY_FORMAT_STRIDE_SHIFT) |
+			      (nv30_vbo_ncomp(ve->src_format) <<
+			       NV34TCL_VERTEX_ARRAY_FORMAT_SIZE_SHIFT) |
+			      nv30_vbo_type(ve->src_format));
+	}
+
+	BEGIN_RING(rankine, NV34TCL_VERTEX_ARRAY_FORMAT(0), num_hw);
+	OUT_RINGp (vtxfmt, num_hw);
+}
+
+/* Bring the hardware up to date before a draw call: emit pending hw state,
+ * refresh the vertex array formats if NV30_NEW_ARRAYS is dirty, emit the
+ * address of every enabled vertex buffer, optionally set up the index
+ * buffer `ib` with format `ib_format`, and finish with the vertex cache
+ * flush.  Always returns TRUE.
+ */
+static boolean
+nv30_vbo_validate_state(struct nv30_context *nv30,
+			struct pipe_buffer *ib, unsigned ib_format)
+{
+	unsigned pending;
+
+	nv30_emit_hw_state(nv30);
+
+	if (nv30->dirty & NV30_NEW_ARRAYS) {
+		nv30_vbo_arrays_update(nv30);
+		nv30->dirty &= ~NV30_NEW_ARRAYS;
+	}
+
+	/* Visit the enabled slots from the lowest set bit upwards */
+	for (pending = nv30->vb_enable; pending; ) {
+		unsigned slot = ffs(pending) - 1;
+
+		pending &= ~(1 << slot);
+
+		BEGIN_RING(rankine, NV34TCL_VERTEX_BUFFER_ADDRESS(slot), 1);
+		OUT_RELOC (nv30->vb[slot].buffer, nv30->vb[slot].delta,
+			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_LOW |
+			   NOUVEAU_BO_OR | NOUVEAU_BO_RD, 0,
+			   NV34TCL_VERTEX_BUFFER_ADDRESS_DMA1);
+	}
+
+	if (ib != NULL) {
+		BEGIN_RING(rankine, NV40TCL_IDXBUF_ADDRESS, 2);
+		OUT_RELOCl(ib, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
+			   NOUVEAU_BO_RD);
+		OUT_RELOCd(ib, ib_format, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
+			   NOUVEAU_BO_RD | NOUVEAU_BO_OR,
+			   0, NV40TCL_IDXBUF_FORMAT_DMA1);
+	}
+
+	/* vtx cache flush */
+	BEGIN_RING(rankine, 0x1710, 1);
+	OUT_RING  (0);
+
+	return TRUE;
+}
+
+/* Draw `count` consecutive vertices starting at `start` with primitive
+ * type `mode`.
+ *
+ * Vertices are submitted as batch words of the form
+ * ((n - 1) << 24) | first_vertex: first one batch with the remainder
+ * (count & 0xff), then batches of 0x100 vertices, at most 2047 batch
+ * words per packet.
+ *
+ * Returns FALSE if state validation fails, TRUE otherwise.
+ */
+boolean
+nv30_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
+		 unsigned count)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	unsigned nr;
+
+	/* Must not live inside assert(): with NDEBUG the call, and with it
+	 * all state emission, would be compiled out.
+	 */
+	if (!nv30_vbo_validate_state(nv30, NULL, 0))
+		return FALSE;
+
+	BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+	OUT_RING  (nvgl_primitive(mode));
+
+	nr = (count & 0xff);
+	if (nr) {
+		BEGIN_RING(rankine, NV34TCL_VB_VERTEX_BATCH, 1);
+		OUT_RING  (((nr - 1) << 24) | start);
+		start += nr;
+	}
+
+	nr = count >> 8;
+	while (nr) {
+		unsigned push = nr > 2047 ? 2047 : nr;
+
+		nr -= push;
+
+		BEGIN_RING_NI(rankine, NV34TCL_VB_VERTEX_BATCH, push);
+		while (push--) {
+			OUT_RING(((0x100 - 1) << 24) | start);
+			start += 0x100;
+		}
+	}
+
+	BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+	OUT_RING  (0);
+
+	pipe->flush(pipe, 0);
+	return TRUE;
+}
+
+/* Push `count` 8-bit indices, starting at element `start` of the mapped
+ * index buffer `ib`, inline into the command stream.
+ *
+ * If the count is odd the first index is sent on its own as a U32 element;
+ * the remaining indices are packed two per dword (low half first) and sent
+ * as U16 elements, at most 2046 indices per packet.
+ */
+static INLINE void
+nv30_draw_elements_u08(struct nv30_context *nv30, void *ib,
+		       unsigned start, unsigned count)
+{
+	uint8_t *elts = (uint8_t *)ib + start;
+	int push, i;
+
+	/* NV34TCL methods, matching the u16/u32 variants in this file */
+	if (count & 1) {
+		BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1);
+		OUT_RING  (elts[0]);
+		elts++; count--;
+	}
+
+	while (count) {
+		push = MIN2(count, 2046);
+
+		/* Two indices per dword: the packet carries push / 2 dwords.
+		 * count is even here, so push is even as well.
+		 */
+		BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
+		for (i = 0; i < push; i+=2)
+			OUT_RING((elts[i+1] << 16) | elts[i]);
+
+		count -= push;
+		elts  += push;
+	}
+}
+
+/* Push `count` 16-bit indices, starting at element `start` of the mapped
+ * index buffer `ib`, inline into the command stream.
+ *
+ * If the count is odd the first index is sent on its own as a U32 element;
+ * the remaining indices are packed two per dword (low half first) and sent
+ * as U16 elements, at most 2046 indices per packet.
+ */
+static INLINE void
+nv30_draw_elements_u16(struct nv30_context *nv30, void *ib,
+		       unsigned start, unsigned count)
+{
+	uint16_t *elts = (uint16_t *)ib + start;
+	int push, i;
+
+	if (count & 1) {
+		BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1);
+		OUT_RING  (elts[0]);
+		elts++; count--;
+	}
+
+	while (count) {
+		push = MIN2(count, 2046);
+
+		/* Two indices per dword: the packet carries push / 2 dwords.
+		 * count is even here, so push is even as well.
+		 */
+		BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
+		for (i = 0; i < push; i+=2)
+			OUT_RING((elts[i+1] << 16) | elts[i]);
+
+		count -= push;
+		elts  += push;
+	}
+}
+
+/* Push `count` 32-bit indices, starting at element `start` of the mapped
+ * index buffer `ib`, inline into the command stream as U32 elements, in
+ * packets of at most 2047 indices.
+ */
+static INLINE void
+nv30_draw_elements_u32(struct nv30_context *nv30, void *ib,
+		       unsigned start, unsigned count)
+{
+	uint32_t *cur = (uint32_t *)ib + start;
+	int chunk;
+
+	for (; count; count -= chunk, cur += chunk) {
+		chunk = MIN2(count, 2047);
+
+		BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U32, chunk);
+		OUT_RINGp    (cur, chunk);
+	}
+}
+
+/* Indexed draw that reads the index buffer on the CPU and pushes the
+ * indices inline into the command stream.
+ *
+ * ib_size is the index size in bytes (1, 2 or 4).  Returns FALSE if state
+ * validation fails or the index buffer cannot be mapped; in that case
+ * no primitive has been started.
+ */
+static boolean
+nv30_draw_elements_inline(struct pipe_context *pipe,
+			  struct pipe_buffer *ib, unsigned ib_size,
+			  unsigned mode, unsigned start, unsigned count)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct pipe_winsys *ws = pipe->winsys;
+	void *map;
+
+	/* Must not live inside assert(): with NDEBUG the call, and with it
+	 * all state emission, would be compiled out.
+	 */
+	if (!nv30_vbo_validate_state(nv30, NULL, 0))
+		return FALSE;
+
+	/* Check the mapping itself; the original tested `ib`, which had
+	 * already been handed to buffer_map at that point.
+	 */
+	map = ws->buffer_map(ws, ib, PIPE_BUFFER_USAGE_CPU_READ);
+	if (!map) {
+		assert(0);
+		return FALSE;
+	}
+
+	BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+	OUT_RING  (nvgl_primitive(mode));
+
+	switch (ib_size) {
+	case 1:
+		nv30_draw_elements_u08(nv30, map, start, count);
+		break;
+	case 2:
+		nv30_draw_elements_u16(nv30, map, start, count);
+		break;
+	case 4:
+		nv30_draw_elements_u32(nv30, map, start, count);
+		break;
+	default:
+		assert(0);
+		break;
+	}
+
+	BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+	OUT_RING  (0);
+
+	ws->buffer_unmap(ws, ib);
+
+	return TRUE;
+}
+
+/* Indexed draw that lets the hardware fetch indices from the index buffer
+ * `ib`.
+ *
+ * ib_size is the index size in bytes; only 2 and 4 are supported here.
+ * Indices are submitted as batch words ((n - 1) << 24) | first_index:
+ * first the remainder (count & 0xff), then batches of 0x100, at most 2047
+ * batch words per packet.
+ *
+ * Returns FALSE for an unsupported index size or if state validation
+ * fails, TRUE otherwise.
+ */
+static boolean
+nv30_draw_elements_vbo(struct pipe_context *pipe,
+		       struct pipe_buffer *ib, unsigned ib_size,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	unsigned nr, type;
+
+	switch (ib_size) {
+	case 2:
+		type = NV40TCL_IDXBUF_FORMAT_TYPE_U16;
+		break;
+	case 4:
+		type = NV40TCL_IDXBUF_FORMAT_TYPE_U32;
+		break;
+	default:
+		/* Bail out rather than continue with `type` unset when
+		 * asserts are compiled out.
+		 */
+		assert(0);
+		return FALSE;
+	}
+
+	/* Must not live inside assert(): with NDEBUG the call, and with it
+	 * all state emission, would be compiled out.
+	 */
+	if (!nv30_vbo_validate_state(nv30, ib, type))
+		return FALSE;
+
+	BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+	OUT_RING  (nvgl_primitive(mode));
+
+	nr = (count & 0xff);
+	if (nr) {
+		BEGIN_RING(rankine, NV40TCL_VB_INDEX_BATCH, 1);
+		OUT_RING  (((nr - 1) << 24) | start);
+		start += nr;
+	}
+
+	nr = count >> 8;
+	while (nr) {
+		unsigned push = nr > 2047 ? 2047 : nr;
+
+		nr -= push;
+
+		BEGIN_RING_NI(rankine, NV40TCL_VB_INDEX_BATCH, push);
+		while (push--) {
+			OUT_RING(((0x100 - 1) << 24) | start);
+			start += 0x100;
+		}
+	}
+
+	BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+	OUT_RING  (0);
+
+	return TRUE;
+}
+
+/* pipe_context indexed draw entry point.
+ *
+ * 8-bit indices are pushed inline through the command stream; every other
+ * index size goes through the hardware index buffer path.  The context is
+ * flushed after a successful draw.
+ *
+ * Returns TRUE on success, FALSE if the selected draw path reported a
+ * failure (its result used to be discarded).
+ */
+boolean
+nv30_draw_elements(struct pipe_context *pipe,
+		   struct pipe_buffer *indexBuffer, unsigned indexSize,
+		   unsigned mode, unsigned start, unsigned count)
+{
+	boolean ok;
+
+	if (indexSize != 1) {
+		ok = nv30_draw_elements_vbo(pipe, indexBuffer, indexSize,
+					    mode, start, count);
+	} else {
+		ok = nv30_draw_elements_inline(pipe, indexBuffer, indexSize,
+					       mode, start, count);
+	}
+
+	if (!ok)
+		return FALSE;
+
+	pipe->flush(pipe, 0);
+	return TRUE;
+}
+
+
diff --git a/src/mesa/pipe/nv30/nv30_vertprog.c b/src/mesa/pipe/nv30/nv30_vertprog.c
new file mode 100644
index 0000000000..b712049fa7
--- /dev/null
+++ b/src/mesa/pipe/nv30/nv30_vertprog.c
@@ -0,0 +1,778 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "pipe/tgsi/util/tgsi_parse.h"
+
+#include "nv30_context.h"
+#include "nv30_state.h"
+
+/* TODO (at least...):
+ *  1. Indexed consts  + ARL
+ *  2. Arb. swz/negation
+ *  3. NV_vp11, NV_vp2, NV_vp3 features
+ *       - extra arith opcodes
+ *       - branching
+ *       - texture sampling
+ *       - indexed attribs
+ *       - indexed results
+ *  4. bugs
+ */
+
+#define SWZ_X 0
+#define SWZ_Y 1
+#define SWZ_Z 2
+#define SWZ_W 3
+#define MASK_X 8
+#define MASK_Y 4
+#define MASK_Z 2
+#define MASK_W 1
+#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W)
+#define DEF_SCALE 0
+#define DEF_CTEST 0
+#include "nv30_shader.h"
+
+#define swz(s,x,y,z,w) nv30_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w)
+#define neg(s) nv30_sr_neg((s))
+#define abs(s) nv30_sr_abs((s))
+
+/* Per-translation state used while converting one TGSI vertex shader into
+ * nv30 vertex program instructions.
+ */
+struct nv30_vpc {
+	/* Vertex program being filled in */
+	struct nv30_vertex_program *vp;
+
+	/* Instruction currently being encoded (set by nv30_vp_arith) */
+	struct nv30_vertex_program_exec *vpi;
+
+	/* TGSI output register index -> hw destination index */
+	unsigned output_map[PIPE_MAX_SHADER_OUTPUTS];
+
+	/* Highest TGSI temporary index seen so far (-1 if none) */
+	int high_temp;
+	/* Scratch temps handed out by temp() for the current instruction;
+	 * they are numbered above high_temp.
+	 */
+	int temp_temp_count;
+
+	/* One source register per TGSI immediate, in declaration order */
+	struct nv30_sreg *imm;
+	unsigned nr_imm;
+};
+
+/* Hand out the next scratch temporary for the current instruction.
+ * Scratch temps are numbered directly above the highest temp index the
+ * shader itself has used so far.
+ */
+static struct nv30_sreg
+temp(struct nv30_vpc *vpc)
+{
+	const int scratch = vpc->temp_temp_count;
+
+	vpc->temp_temp_count = scratch + 1;
+	return nv30_sr(NV30SR_TEMP, vpc->high_temp + 1 + scratch);
+}
+
+/* Return a CONST source register for a program constant.
+ *
+ * `pipe` is the index of the constant in the pipe constant buffer, or a
+ * negative value for an immediate.  A non-negative index that is already
+ * in the program's constant table is reused; otherwise a new entry is
+ * appended and initialised with (x, y, z, w).
+ *
+ * NOTE(review): the result of realloc() is stored without a check.
+ */
+static struct nv30_sreg
+constant(struct nv30_vpc *vpc, int pipe, float x, float y, float z, float w)
+{
+	struct nv30_vertex_program *vp = vpc->vp;
+	struct nv30_vertex_program_data *vpd;
+	int slot;
+
+	/* Only real constants (not immediates) can be shared */
+	if (pipe >= 0) {
+		slot = 0;
+		while (slot < vp->nr_consts) {
+			if (vp->consts[slot].index == pipe)
+				return nv30_sr(NV30SR_CONST, slot);
+			slot++;
+		}
+	}
+
+	slot = vp->nr_consts;
+	vp->nr_consts = slot + 1;
+	vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts);
+
+	vpd = &vp->consts[slot];
+	vpd->value[0] = x;
+	vpd->value[1] = y;
+	vpd->value[2] = z;
+	vpd->value[3] = w;
+	vpd->index = pipe;
+
+	return nv30_sr(NV30SR_CONST, slot);
+}
+
+#define arith(cc,s,o,d,m,s0,s1,s2) \
+	nv30_vp_arith((cc), (s), NV30_VP_INST_##o, (d), (m), (s0), (s1), (s2))
+
+/* Encode source operand `src` into the instruction words `hw` at operand
+ * position `pos` (0, 1 or 2).
+ *
+ * Side effects besides writing hw[]:
+ *  - an INPUT source sets its bit in vp->ir and stores its index in hw[1];
+ *  - a CONST source records its index in vpc->vpi->const_index (only one
+ *    distinct constant per instruction is accepted, enforced by assert).
+ */
+static void
+emit_src(struct nv30_vpc *vpc, uint32_t *hw, int pos, struct nv30_sreg src)
+{
+	struct nv30_vertex_program *vp = vpc->vp;
+	uint32_t sr = 0;
+
+	switch (src.type) {
+	case NV30SR_TEMP:
+		sr |= (NV30_VP_SRC_REG_TYPE_TEMP << NV30_VP_SRC_REG_TYPE_SHIFT);
+		sr |= (src.index << NV30_VP_SRC_TEMP_SRC_SHIFT);
+		break;
+	case NV30SR_INPUT:
+		sr |= (NV30_VP_SRC_REG_TYPE_INPUT <<
+		       NV30_VP_SRC_REG_TYPE_SHIFT);
+		vp->ir |= (1 << src.index);
+		hw[1] |= (src.index << NV30_VP_INST_INPUT_SRC_SHIFT);
+		break;
+	case NV30SR_CONST:
+		sr |= (NV30_VP_SRC_REG_TYPE_CONST <<
+		       NV30_VP_SRC_REG_TYPE_SHIFT);
+		assert(vpc->vpi->const_index == -1 ||
+		       vpc->vpi->const_index == src.index);
+		vpc->vpi->const_index = src.index;
+		break;
+	case NV30SR_NONE:
+		/* An unused operand is encoded as an INPUT-type source */
+		sr |= (NV30_VP_SRC_REG_TYPE_INPUT <<
+		       NV30_VP_SRC_REG_TYPE_SHIFT);
+		break;
+	default:
+		assert(0);
+	}
+
+	if (src.negate)
+		sr |= NV30_VP_SRC_NEGATE;
+
+	/* The abs flag is not part of `sr`: one bit per operand in hw[0] */
+	if (src.abs)
+		hw[0] |= (1 << (21 + pos));
+
+	sr |= ((src.swz[0] << NV30_VP_SRC_SWZ_X_SHIFT) |
+	       (src.swz[1] << NV30_VP_SRC_SWZ_Y_SHIFT) |
+	       (src.swz[2] << NV30_VP_SRC_SWZ_Z_SHIFT) |
+	       (src.swz[3] << NV30_VP_SRC_SWZ_W_SHIFT));
+
+/*
+ * |VVV|
+ * d�.�b
+ *  \u/
+ *
+ */
+
+	/* Place `sr` into the instruction; operands 0 and 2 are split
+	 * across two consecutive words.
+	 */
+	switch (pos) {
+	case 0:
+		hw[1] |= ((sr & NV30_VP_SRC0_HIGH_MASK) >>
+			  NV30_VP_SRC0_HIGH_SHIFT) << NV30_VP_INST_SRC0H_SHIFT;
+		hw[2] |= (sr & NV30_VP_SRC0_LOW_MASK) <<
+			  NV30_VP_INST_SRC0L_SHIFT;
+		break;
+	case 1:
+		hw[2] |= sr << NV30_VP_INST_SRC1_SHIFT;
+		break;
+	case 2:
+		hw[2] |= ((sr & NV30_VP_SRC2_HIGH_MASK) >>
+			  NV30_VP_SRC2_HIGH_SHIFT) << NV30_VP_INST_SRC2H_SHIFT;
+		hw[3] |= (sr & NV30_VP_SRC2_LOW_MASK) <<
+			  NV30_VP_INST_SRC2L_SHIFT;
+		break;
+	default:
+		assert(0);
+	}
+}
+
+/* Encode destination register `dst` into the instruction words `hw`.
+ *
+ * A TEMP destination stores its index in hw[0].  An OUTPUT destination
+ * stores its index in hw[3], sets the temp-dest mask plus bit 20 in hw[0],
+ * and flags the written result in vp->or for the known colour, fog, point
+ * size and texcoord outputs.  `slot` is not used.
+ */
+static void
+emit_dst(struct nv30_vpc *vpc, uint32_t *hw, int slot, struct nv30_sreg dst)
+{
+	struct nv30_vertex_program *vp = vpc->vp;
+	uint32_t written = 0;
+
+	if (dst.type == NV30SR_TEMP) {
+		hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT);
+		return;
+	}
+
+	if (dst.type != NV30SR_OUTPUT) {
+		assert(0);
+		return;
+	}
+
+	/* Which bit of the output-enable mask does this result map to? */
+	switch (dst.index) {
+	case NV30_VP_INST_DEST_COL0 : written = (1 << 0); break;
+	case NV30_VP_INST_DEST_COL1 : written = (1 << 1); break;
+	case NV30_VP_INST_DEST_BFC0 : written = (1 << 2); break;
+	case NV30_VP_INST_DEST_BFC1 : written = (1 << 3); break;
+	case NV30_VP_INST_DEST_FOGC : written = (1 << 4); break;
+	case NV30_VP_INST_DEST_PSZ  : written = (1 << 5); break;
+	case NV30_VP_INST_DEST_TC(0): written = (1 << 14); break;
+	case NV30_VP_INST_DEST_TC(1): written = (1 << 15); break;
+	case NV30_VP_INST_DEST_TC(2): written = (1 << 16); break;
+	case NV30_VP_INST_DEST_TC(3): written = (1 << 17); break;
+	case NV30_VP_INST_DEST_TC(4): written = (1 << 18); break;
+	case NV30_VP_INST_DEST_TC(5): written = (1 << 19); break;
+	case NV30_VP_INST_DEST_TC(6): written = (1 << 20); break;
+	case NV30_VP_INST_DEST_TC(7): written = (1 << 21); break;
+	default:
+		break;
+	}
+	vp->or |= written;
+
+	hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT);
+	hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
+}
+
+/* Append one instruction to the vertex program and encode it.
+ *
+ * Grows vp->insns by one entry, zeroes it, makes it the current
+ * instruction (vpc->vpi), then writes the condition fields, the opcode,
+ * the destination and the three sources.
+ *
+ * NOTE(review): `mask` is currently not encoded (the writemask line below
+ * is commented out) and `slot` is only passed on to emit_dst, which does
+ * not use it; `op` always goes into the VEC opcode field.
+ * NOTE(review): the result of realloc() is used without a check.
+ */
+static void
+nv30_vp_arith(struct nv30_vpc *vpc, int slot, int op,
+	      struct nv30_sreg dst, int mask,
+	      struct nv30_sreg s0, struct nv30_sreg s1,
+	      struct nv30_sreg s2)
+{
+	struct nv30_vertex_program *vp = vpc->vp;
+	uint32_t *hw;
+
+	vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi));
+	vpc->vpi = &vp->insns[vp->nr_insns - 1];
+	memset(vpc->vpi, 0, sizeof(*vpc->vpi));
+	/* -1: this instruction does not reference a constant (yet) */
+	vpc->vpi->const_index = -1;
+
+	hw = vpc->vpi->data;
+
+	/* Condition NV30_VP_INST_COND_TR with an identity (x,y,z,w) swizzle */
+	hw[0] |= (NV30_VP_INST_COND_TR << NV30_VP_INST_COND_SHIFT);
+	hw[0] |= ((0 << NV30_VP_INST_COND_SWZ_X_SHIFT) |
+		  (1 << NV30_VP_INST_COND_SWZ_Y_SHIFT) |
+		  (2 << NV30_VP_INST_COND_SWZ_Z_SHIFT) |
+		  (3 << NV30_VP_INST_COND_SWZ_W_SHIFT));
+
+	hw[1] |= (op << NV30_VP_INST_VEC_OPCODE_SHIFT);
+//	hw[3] |= NV30_VP_INST_SCA_DEST_TEMP_MASK;
+//	hw[3] |= (mask << NV30_VP_INST_VEC_WRITEMASK_SHIFT);
+
+	emit_dst(vpc, hw, slot, dst);
+	emit_src(vpc, hw, 0, s0);
+	emit_src(vpc, hw, 1, s1);
+	emit_src(vpc, hw, 2, s2);
+}
+
+/* Convert a TGSI source register into an nv30 source register, carrying
+ * over the absolute/negate modifiers and the swizzle.
+ *
+ * INPUT and TEMPORARY map directly by index (a TEMPORARY also updates
+ * vpc->high_temp), CONSTANT goes through the program constant table and
+ * IMMEDIATE is looked up in vpc->imm.  Any other register file is reported
+ * and yields a NONE register, so the result is never left uninitialised.
+ */
+static INLINE struct nv30_sreg
+tgsi_src(struct nv30_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
+	struct nv30_sreg src;
+
+	switch (fsrc->SrcRegister.File) {
+	case TGSI_FILE_INPUT:
+		src = nv30_sr(NV30SR_INPUT, fsrc->SrcRegister.Index);
+		break;
+	case TGSI_FILE_CONSTANT:
+		src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0);
+		break;
+	case TGSI_FILE_IMMEDIATE:
+		src = vpc->imm[fsrc->SrcRegister.Index];
+		break;
+	case TGSI_FILE_TEMPORARY:
+		if (vpc->high_temp < fsrc->SrcRegister.Index)
+			vpc->high_temp = fsrc->SrcRegister.Index;
+		src = nv30_sr(NV30SR_TEMP, fsrc->SrcRegister.Index);
+		break;
+	default:
+		NOUVEAU_ERR("bad src file\n");
+		/* Give the caller a defined register to work with */
+		src = nv30_sr(NV30SR_NONE, 0);
+		break;
+	}
+
+	src.abs = fsrc->SrcRegisterExtMod.Absolute;
+	src.negate = fsrc->SrcRegister.Negate;
+	src.swz[0] = fsrc->SrcRegister.SwizzleX;
+	src.swz[1] = fsrc->SrcRegister.SwizzleY;
+	src.swz[2] = fsrc->SrcRegister.SwizzleZ;
+	src.swz[3] = fsrc->SrcRegister.SwizzleW;
+	return src;
+}
+
+/* Convert a TGSI destination register into an nv30 register.
+ *
+ * OUTPUT registers are translated through vpc->output_map; TEMPORARY
+ * registers map directly by index and update vpc->high_temp.  Any other
+ * register file is reported and yields a NONE register, so the result is
+ * never left uninitialised.
+ */
+static INLINE struct nv30_sreg
+tgsi_dst(struct nv30_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
+	struct nv30_sreg dst;
+
+	switch (fdst->DstRegister.File) {
+	case TGSI_FILE_OUTPUT:
+		dst = nv30_sr(NV30SR_OUTPUT,
+			      vpc->output_map[fdst->DstRegister.Index]);
+
+		break;
+	case TGSI_FILE_TEMPORARY:
+		dst = nv30_sr(NV30SR_TEMP, fdst->DstRegister.Index);
+		if (vpc->high_temp < dst.index)
+			vpc->high_temp = dst.index;
+		break;
+	default:
+		NOUVEAU_ERR("bad dst file\n");
+		/* Give the caller a defined register to work with */
+		dst = nv30_sr(NV30SR_NONE, 0);
+		break;
+	}
+
+	return dst;
+}
+
+/* Translate a TGSI_WRITEMASK_* bitmask into the MASK_* bits used by this
+ * file.
+ */
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	return ((tgsi & TGSI_WRITEMASK_X) ? MASK_X : 0) |
+	       ((tgsi & TGSI_WRITEMASK_Y) ? MASK_Y : 0) |
+	       ((tgsi & TGSI_WRITEMASK_Z) ? MASK_Z : 0) |
+	       ((tgsi & TGSI_WRITEMASK_W) ? MASK_W : 0);
+}
+
+/* Translate one TGSI instruction into one or more hw instructions.
+ *
+ * Sources are gathered in two passes: temporaries first (which updates
+ * vpc->high_temp before any scratch temp is handed out), then inputs,
+ * constants and immediates.  Only one distinct input index and one
+ * distinct constant index are used directly per instruction; any further
+ * one is first copied into a scratch temp with a MOV.
+ *
+ * Returns TRUE on success (including TGSI_OPCODE_END, which emits
+ * nothing), FALSE for an unsupported source file or opcode.
+ */
+static boolean
+nv30_vertprog_parse_instruction(struct nv30_vpc *vpc,
+				const struct tgsi_full_instruction *finst)
+{
+	struct nv30_sreg src[3], dst, tmp;
+	struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0);
+	int mask;
+	/* Input / constant index already used directly, -1 if none yet */
+	int ai = -1, ci = -1;
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
+		return TRUE;
+
+	/* Scratch temps are per instruction */
+	vpc->temp_temp_count = 0;
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+			src[i] = tgsi_src(vpc, fsrc);
+		}
+	}
+
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+			if (ai == -1 || ai == fsrc->SrcRegister.Index) {
+				ai = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				/* A different input: go through a temp */
+				src[i] = temp(vpc);
+				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		/*XXX: index comparison is broken now that consts come from
+		 *     two different register files.
+		 */
+		case TGSI_FILE_CONSTANT:
+		case TGSI_FILE_IMMEDIATE:
+			if (ci == -1 || ci == fsrc->SrcRegister.Index) {
+				ci = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				/* A different constant: go through a temp */
+				src[i] = temp(vpc);
+				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+			/* handled above */
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(vpc, &finst->FullDstRegisters[0]);
+	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
+
+	/* Opcodes emitted with slot 0 take their operands in source
+	 * positions 0/1 (ADD and MAD use position 2 for the addend); those
+	 * emitted with slot 1 take their operand in position 2.
+	 */
+	switch (finst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]);
+		break;
+	case TGSI_OPCODE_ARL:
+		arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_EXP:
+		arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LIT:
+		arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LOG:
+		arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_POW:
+		/* pow(a, b) = ex2(b * lg2(a)), on the x components */
+		tmp = temp(vpc);
+		arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none,
+		      swz(src[0], X, X, X, X));
+		arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X),
+		      swz(src[1], X, X, X, X), none);
+		arith(vpc, 1, OP_EX2, dst, mask, none, none,
+		      swz(tmp, X, X, X, X));
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_RET:
+		break;
+	case TGSI_OPCODE_RSQ:
+		arith(vpc, 1, OP_RSQ, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB:
+		/* a - b as a + (-b) */
+		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1]));
+		break;
+	case TGSI_OPCODE_XPD:
+		/* Cross product: a MUL into a scratch temp, then a MAD that
+		 * subtracts it from the product of the other swizzles.
+		 */
+		tmp = temp(vpc);
+		arith(vpc, 0, OP_MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+/* Record which hw result register a declared TGSI output maps to.
+ *
+ * The semantic name/index of the declaration selects the destination; the
+ * mapping is stored in vpc->output_map at the first register of the
+ * declared range.  Returns FALSE (after logging) for a semantic or
+ * semantic index that has no hw destination.
+ */
+static boolean
+nv30_vertprog_parse_decl_output(struct nv30_vpc *vpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	const unsigned sem_index = fdec->Semantic.SemanticIndex;
+	int dest;
+
+	switch (fdec->Semantic.SemanticName) {
+	case TGSI_SEMANTIC_POSITION:
+		dest = NV30_VP_INST_DEST_POS;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		switch (sem_index) {
+		case 0:
+			dest = NV30_VP_INST_DEST_COL0;
+			break;
+		case 1:
+			dest = NV30_VP_INST_DEST_COL1;
+			break;
+		default:
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_BCOLOR:
+		switch (sem_index) {
+		case 0:
+			dest = NV30_VP_INST_DEST_BFC0;
+			break;
+		case 1:
+			dest = NV30_VP_INST_DEST_BFC1;
+			break;
+		default:
+			NOUVEAU_ERR("bad bcolour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_FOG:
+		dest = NV30_VP_INST_DEST_FOGC;
+		break;
+	case TGSI_SEMANTIC_PSIZE:
+		dest = NV30_VP_INST_DEST_PSZ;
+		break;
+	case TGSI_SEMANTIC_GENERIC:
+		/* Generic outputs 0..7 go to the texcoord results */
+		if (sem_index > 7) {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		dest = NV30_VP_INST_DEST_TC(sem_index);
+		break;
+	default:
+		NOUVEAU_ERR("bad output semantic\n");
+		return FALSE;
+	}
+
+	vpc->output_map[fdec->u.DeclarationRange.First] = dest;
+	return TRUE;
+}
+
+/* Pre-pass over the shader tokens: count the immediates and allocate
+ * vpc->imm with one zeroed entry per immediate.  Always returns TRUE.
+ */
+static boolean
+nv30_vertprog_prepare(struct nv30_vpc *vpc)
+{
+	struct tgsi_parse_context p;
+	int imm_count = 0;
+
+	tgsi_parse_init(&p, vpc->vp->pipe->tokens);
+	while (!tgsi_parse_end_of_tokens(&p)) {
+		tgsi_parse_token(&p);
+
+		if (p.FullToken.Token.Type == TGSI_TOKEN_TYPE_IMMEDIATE)
+			imm_count++;
+	}
+	tgsi_parse_free(&p);
+
+	if (imm_count != 0) {
+		vpc->imm = calloc(imm_count, sizeof(struct nv30_sreg));
+		assert(vpc->imm);
+	}
+
+	return TRUE;
+}
+
+/* Translate the TGSI tokens of `vp` into hw vertex program instructions.
+ *
+ * Output declarations fill the output map, immediates become program
+ * constants, and every instruction is handed to
+ * nv30_vertprog_parse_instruction.  On success the last instruction is
+ * tagged with NV30_VP_INST_LAST and vp->translated is set; on any failure
+ * (including a shader that produced no instruction at all) vp->translated
+ * is left untouched.
+ *
+ * The temporary compile state, including the immediate table allocated by
+ * nv30_vertprog_prepare, is released on every path.
+ */
+void
+nv30_vertprog_translate(struct nv30_context *nv30,
+			struct nv30_vertex_program *vp)
+{
+	struct tgsi_parse_context parse;
+	struct nv30_vpc *vpc = NULL;
+
+	vpc = calloc(1, sizeof(struct nv30_vpc));
+	if (!vpc)
+		return;
+	vpc->vp = vp;
+	vpc->high_temp = -1;
+
+	if (!nv30_vertprog_prepare(vpc)) {
+		free(vpc->imm);
+		free(vpc);
+		return;
+	}
+
+	tgsi_parse_init(&parse, vp->pipe->tokens);
+
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		switch (parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+			fdec = &parse.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_OUTPUT:
+				if (!nv30_vertprog_parse_decl_output(vpc, fdec))
+					goto out_err;
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			const struct tgsi_full_immediate *imm;
+
+			imm = &parse.FullToken.FullImmediate;
+			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+//			assert(imm->Immediate.Size == 4);
+			/* Immediates are stored as constants with index -1 */
+			vpc->imm[vpc->nr_imm++] =
+				constant(vpc, -1,
+					 imm->u.ImmediateFloat32[0].Float,
+					 imm->u.ImmediateFloat32[1].Float,
+					 imm->u.ImmediateFloat32[2].Float,
+					 imm->u.ImmediateFloat32[3].Float);
+		}
+			break;
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+			finst = &parse.FullToken.FullInstruction;
+			if (!nv30_vertprog_parse_instruction(vpc, finst))
+				goto out_err;
+		}
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* Nothing was emitted: there is no last instruction to tag, and
+	 * indexing insns[-1] would be out of bounds.
+	 */
+	if (vp->nr_insns == 0)
+		goto out_err;
+
+	vp->insns[vp->nr_insns - 1].data[3] |= NV30_VP_INST_LAST;
+	vp->translated = TRUE;
+out_err:
+	tgsi_parse_free(&parse);
+	free(vpc->imm);
+	free(vpc);
+}
+
+/* Make `vp` the active vertex program on the hardware.
+ *
+ * Translates the program on first use, allocates instruction and constant
+ * slots (evicting other programs from the heap if needed), patches
+ * constant references when the constant block has moved, uploads changed
+ * constants and, if required, the program code, then points the hardware
+ * at the program and sets the input/output enable masks.
+ */
+void
+nv30_vertprog_bind(struct nv30_context *nv30, struct nv30_vertex_program *vp)
+{ 
+	struct nouveau_winsys *nvws = nv30->nvws;
+	struct pipe_winsys *ws = nv30->pipe.winsys;
+	boolean upload_code = FALSE, upload_data = FALSE;
+	int i;
+
+	/* Translate TGSI shader into hw bytecode */
+	if (!vp->translated) {
+		nv30_vertprog_translate(nv30, vp);
+		if (!vp->translated)
+			assert(0);
+	}
+
+	/* Allocate hw vtxprog exec slots */
+	if (!vp->exec) {
+		struct nouveau_resource *heap = nv30->vertprog.exec_heap;
+		uint vplen = vp->nr_insns;
+
+		if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) {
+			/* Evict programs until the head block is big enough */
+			while (heap->next && heap->size < vplen) {
+				struct nv30_vertex_program *evict;
+				
+				evict = heap->next->priv;
+				nvws->res_free(&evict->exec);
+			}
+
+			if (nvws->res_alloc(heap, vplen, vp, &vp->exec))
+				assert(0);
+		}
+
+		upload_code = TRUE;
+	}
+
+	/* Allocate hw vtxprog const slots */
+	if (vp->nr_consts && !vp->data) {
+		struct nouveau_resource *heap = nv30->vertprog.data_heap;
+
+		if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) {
+			/* Evict programs until the head block is big enough */
+			while (heap->next && heap->size < vp->nr_consts) {
+				struct nv30_vertex_program *evict;
+				
+				evict = heap->next->priv;
+				nvws->res_free(&evict->data);
+			}
+
+			if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data))
+				assert(0);
+		}
+
+		/*XXX: handle this some day */
+		assert(vp->data->start >= vp->data_start_min);
+
+		upload_data = TRUE;
+		/* Constant block moved: the code embeds constant slot numbers */
+		if (vp->data_start != vp->data->start)
+			upload_code = TRUE;
+	}
+
+	/* If exec or data segments moved we need to patch the program to
+	 * fixup offsets and register IDs.
+	 */
+	if (vp->exec_start != vp->exec->start) {
+		for (i = 0; i < vp->nr_insns; i++) {
+			struct nv30_vertex_program_exec *vpi = &vp->insns[i];
+
+			/* Branch relocation is not implemented */
+			if (vpi->has_branch_offset) {
+				assert(0);
+			}
+		}
+
+		vp->exec_start = vp->exec->start;
+	}
+
+	if (vp->nr_consts && vp->data_start != vp->data->start) {
+		for (i = 0; i < vp->nr_insns; i++) {
+			struct nv30_vertex_program_exec *vpi = &vp->insns[i];
+
+			/* Rewrite the constant slot as table index + block start */
+			if (vpi->const_index >= 0) {
+				vpi->data[1] &= ~NV30_VP_INST_CONST_SRC_MASK;
+				vpi->data[1] |=
+					(vpi->const_index + vp->data->start) <<
+					NV30_VP_INST_CONST_SRC_SHIFT;
+
+			}
+		}
+
+		vp->data_start = vp->data->start;
+	}
+
+	/* Update + Upload constant values */
+	if (vp->nr_consts) {
+		float *map = NULL;
+
+		if (nv30->vertprog.constant_buf) {
+			map = ws->buffer_map(ws, nv30->vertprog.constant_buf,
+					     PIPE_BUFFER_USAGE_CPU_READ);
+		}
+
+		for (i = 0; i < vp->nr_consts; i++) {
+			struct nv30_vertex_program_data *vpd = &vp->consts[i];
+
+			/* index >= 0: value comes from the constant buffer;
+			 * negative: immediate, whose value is already stored.
+			 * NOTE(review): `map` is still NULL here when no
+			 * constant buffer is bound (or mapping failed) -
+			 * confirm callers guarantee one.
+			 */
+			if (vpd->index >= 0) {
+				/* Skip constants whose value has not changed */
+				if (!upload_data &&
+				    !memcmp(vpd->value, &map[vpd->index * 4],
+					    4 * sizeof(float)))
+					continue;
+				memcpy(vpd->value, &map[vpd->index * 4],
+				       4 * sizeof(float));
+			}
+
+			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_CONST_ID, 5);
+			OUT_RING  (i + vp->data->start);
+			OUT_RINGp ((uint32_t *)vpd->value, 4);
+		}
+
+		if (map) {
+			ws->buffer_unmap(ws, nv30->vertprog.constant_buf);
+		}
+	}
+
+	/* Upload vtxprog */
+	if (upload_code) {
+#if 0
+		for (i = 0; i < vp->nr_insns; i++) {
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[0]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[1]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[2]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]);
+		}
+#endif
+		BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1);
+		OUT_RING  (vp->exec->start);
+		for (i = 0; i < vp->nr_insns; i++) {
+			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_INST(0), 4);
+			OUT_RINGp (vp->insns[i].data, 4);
+		}
+	}
+
+	BEGIN_RING(rankine, NV34TCL_VP_START_FROM_ID, 1);
+	OUT_RING  (vp->exec->start);
+	/* Input and output enable masks gathered during translation */
+	BEGIN_RING(rankine, NV34TCL_VP_ATTRIB_EN, 2);
+	OUT_RING  (vp->ir);
+	OUT_RING  (vp->or);
+
+	nv30->vertprog.active = vp;
+}
+
+/* Release the instruction and constant tables built by translation.
+ * Each table is freed only when its element count is non-zero.
+ */
+void
+nv30_vertprog_destroy(struct nv30_context *nv30, struct nv30_vertex_program *vp)
+{
+	if (vp->nr_insns != 0) {
+		free(vp->insns);
+	}
+
+	if (vp->nr_consts != 0) {
+		free(vp->consts);
+	}
+}
+
-- 
cgit v1.2.3


From 903521a6c031757d63b48129d08ba043d183dbdc Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Fri, 15 Feb 2008 02:41:34 +0100
Subject: nouveau: oops and make nouveau winsys build by default

---
 configs/linux-dri                            | 2 +-
 src/mesa/drivers/dri/nouveau_winsys/Makefile | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/configs/linux-dri b/configs/linux-dri
index 936fce9982..494b0aab8e 100644
--- a/configs/linux-dri
+++ b/configs/linux-dri
@@ -66,4 +66,4 @@ WINDOW_SYSTEM=dri
 
 # gamma are missing because they have not been converted to use the new
 # interface.
-DRI_DIRS = intel_winsys 
+DRI_DIRS = intel_winsys nouveau_winsys
diff --git a/src/mesa/drivers/dri/nouveau_winsys/Makefile b/src/mesa/drivers/dri/nouveau_winsys/Makefile
index 59ba561eb9..98ec5a79f5 100644
--- a/src/mesa/drivers/dri/nouveau_winsys/Makefile
+++ b/src/mesa/drivers/dri/nouveau_winsys/Makefile
@@ -8,7 +8,7 @@ MINIGLX_SOURCES =
 
 PIPE_DRIVERS = \
 	$(TOP)/src/mesa/pipe/softpipe/libsoftpipe.a \
-	$(TOP)/src/mesa/pipe/nv40/libnv30.a \
+	$(TOP)/src/mesa/pipe/nv30/libnv30.a \
 	$(TOP)/src/mesa/pipe/nv40/libnv40.a \
 	$(TOP)/src/mesa/pipe/nv50/libnv50.a
 
-- 
cgit v1.2.3


From a7872d4c41a692a81cc54cb5eaaab04308604bdf Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Mon, 28 Jan 2008 15:48:51 +0900
Subject: Clone vf module.

---
 src/mesa/pipe/draw/draw_vf.c         | 374 +++++++++++++
 src/mesa/pipe/draw/draw_vf.h         | 249 +++++++++
 src/mesa/pipe/draw/draw_vf_generic.c | 983 +++++++++++++++++++++++++++++++++++
 src/mesa/pipe/draw/draw_vf_sse.c     | 664 +++++++++++++++++++++++
 src/mesa/sources                     |   3 +
 5 files changed, 2273 insertions(+)
 create mode 100644 src/mesa/pipe/draw/draw_vf.c
 create mode 100644 src/mesa/pipe/draw/draw_vf.h
 create mode 100644 src/mesa/pipe/draw/draw_vf_generic.c
 create mode 100644 src/mesa/pipe/draw/draw_vf_sse.c

diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
new file mode 100644
index 0000000000..f758460b5f
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -0,0 +1,374 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "context.h"
+#include "colormac.h"
+
+#include "pipe/p_compiler.h"
+
+#include "draw_vf.h"
+
+#define DBG 0
+
+
+
+static boolean match_fastpath( struct draw_vertex_fetch *vf,
+				 const struct draw_vf_fastpath *fp)
+{
+   const unsigned nr = vf->attr_count;
+   unsigned i;
+
+   /* Attribute count, formats, input sizes and offsets must all agree. */
+   if (nr != fp->attr_count)
+      return FALSE;
+   for (i = 0; i < nr; i++)
+      if (vf->attr[i].format != fp->attr[i].format ||
+          vf->attr[i].inputsize != fp->attr[i].size ||
+          vf->attr[i].vertoffset != fp->attr[i].offset)
+         return FALSE;
+
+   /* Strides are only compared when the fastpath asks for it. */
+   if (!fp->match_strides)
+      return TRUE;
+   if (vf->vertex_stride != fp->vertex_stride)
+      return FALSE;
+   for (i = 0; i < nr; i++)
+      if (vf->attr[i].inputstride != fp->attr[i].stride)
+         return FALSE;
+   return TRUE;
+}
+
+static boolean search_fastpath_emit( struct draw_vertex_fetch *vf )
+{
+   const struct draw_vf_fastpath *cand;
+
+   /* Walk the registered fastpaths; the first match wins. */
+   for (cand = vf->fastpath; cand != NULL; cand = cand->next) {
+      if (!match_fastpath(vf, cand))
+         continue;
+      vf->emit = cand->func;
+      return TRUE;
+   }
+   return FALSE;
+}
+
+void draw_vf_register_fastpath( struct draw_vertex_fetch *vf,
+			     boolean match_strides )
+{
+   struct draw_vf_fastpath *fastpath = CALLOC_STRUCT(draw_vf_fastpath);
+   unsigned i;
+   if (!fastpath)
+      return;              /* out of memory: register nothing */
+   fastpath->attr = (struct draw_vf_attr_type *)
+      _mesa_malloc(vf->attr_count * sizeof(fastpath->attr[0]));
+   if (!fastpath->attr && vf->attr_count) {
+      FREE(fastpath);      /* do not link a half-built entry */
+      return;
+   }
+   /* Snapshot the layout and emit func currently held by vf. */
+   fastpath->vertex_stride = vf->vertex_stride;
+   fastpath->attr_count = vf->attr_count;
+   fastpath->match_strides = match_strides;
+   fastpath->func = vf->emit;
+   for (i = 0; i < vf->attr_count; i++) {
+      fastpath->attr[i].format = vf->attr[i].format;
+      fastpath->attr[i].stride = vf->attr[i].inputstride;
+      fastpath->attr[i].size = vf->attr[i].inputsize;
+      fastpath->attr[i].offset = vf->attr[i].vertoffset;
+   }
+   fastpath->next = vf->fastpath;   /* push onto the head of the list */
+   vf->fastpath = fastpath;
+}
+/***********************************************************************
+ * Build codegen functions or return generic ones:
+ */
+static void choose_emit_func( struct draw_vertex_fetch *vf, 
+			      unsigned count, 
+			      uint8_t *dest)
+{
+   vf->emit = NULL;
+
+   /* Prefer a fastpath already registered for this layout (hardwired,
+    * codegen or known-bad).  A match may leave vf->emit null, meaning
+    * codegen is known to fail for this state and is not retried.
+    * Codegen is only attempted when no registered fastpath matches.
+    */
+   if (!search_fastpath_emit(vf) && vf->codegen_emit != NULL) {
+      vf->codegen_emit( vf );
+   }
+
+   /* Still nothing: ask for one of the hardwired emit functions. */
+   if (vf->emit == NULL) {
+      draw_vf_generate_hardwired_emit(vf);
+   }
+
+   /* Last resort: the generic version. */
+   if (vf->emit == NULL) {
+      vf->emit = draw_vf_generic_emit;
+   }
+
+   /* Emit the vertices this call was asked for, now through the
+    * function that was just selected.
+    */
+   vf->emit( vf, count, dest );
+}
+
+
+
+
+
+/***********************************************************************
+ * Public entrypoints, mostly dispatch to the above:
+ */
+
+
+
+unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf, 
+				 const struct draw_vf_attr_map *map,
+				 unsigned nr, 
+				 unsigned vertex_stride )
+{
+   unsigned cursor = 0;   /* running byte offset within the vertex */
+   unsigned used = 0;     /* vf->attr[] slots filled so far */
+   unsigned i;
+
+   assert(nr < DRAW_VF_ATTRIB_MAX);
+   memset(vf->lookup, 0, sizeof(vf->lookup));
+
+   for (i = 0; i < nr; i++) {
+      const unsigned format = map[i].format;
+      const struct draw_vf_format_info *info;
+      struct draw_vf_attr *slot;
+
+      /* Padding only advances the offset; it takes no attr[] slot. */
+      if (format == EMIT_PAD) {
+         if (DBG)
+            _mesa_printf("%d: pad %d, offset %d\n", i,
+                         map[i].offset, cursor);
+         cursor += map[i].offset;
+         continue;
+      }
+
+      info = &draw_vf_format_info[format];
+      slot = &vf->attr[used++];
+      assert(vf->lookup[map[i].attrib] == 0);
+      vf->lookup[map[i].attrib] = slot;
+
+      slot->attrib = map[i].attrib;
+      slot->format = format;
+      slot->insert = info->insert;
+      slot->extract = info->extract;
+      slot->vertattrsize = info->attrsize;
+      slot->vertoffset = cursor;
+      if (DBG)
+         _mesa_printf("%d: %s, offset %d\n", i, info->name,
+                      slot->vertoffset);
+      cursor += info->attrsize;
+   }
+
+   vf->attr_count = used;
+   vf->vertex_stride = vertex_stride ? vertex_stride : cursor;
+   vf->emit = choose_emit_func;
+
+   assert(vf->vertex_stride >= cursor);
+   return vf->vertex_stride;
+}
+
+
+
+void draw_vf_set_vp_matrix( struct draw_vertex_fetch *vf,
+		       const float *viewport )
+{
+   float *scale = vf->vp;       /* vp[0..3] */
+   float *trans = vf->vp + 4;   /* vp[4..7] */
+
+   assert(vf->allow_viewport_emits);
+
+   scale[0] = viewport[MAT_SX];
+   scale[1] = viewport[MAT_SY];
+   scale[2] = viewport[MAT_SZ];
+   scale[3] = 1.0;
+   trans[0] = viewport[MAT_TX];
+   trans[1] = viewport[MAT_TY];
+   trans[2] = viewport[MAT_TZ];
+   trans[3] = 0.0;
+}
+
+void draw_vf_set_vp_scale_translate( struct draw_vertex_fetch *vf,
+				const float *scale,
+				const float *translate )
+{
+   unsigned k;
+
+   assert(vf->allow_viewport_emits);
+
+   /* vp[0..3] receives the scale ... */
+   for (k = 0; k < 4; k++)
+      vf->vp[k] = scale[k];
+
+   /* ... and vp[4..7] the translation. */
+   for (k = 0; k < 4; k++)
+      vf->vp[4 + k] = translate[k];
+}
+
+
+/* Set attribute pointers, adjusted for start position:
+ */
+void draw_vf_set_sources( struct draw_vertex_fetch *vf,
+		     GLvector4f * const sources[],
+		     unsigned start )
+{
+   unsigned i;
+
+   for (i = 0; i < vf->attr_count; i++) {
+      struct draw_vf_attr *attr = &vf->attr[i];
+      const GLvector4f *src = sources[attr->attrib];
+
+      /* A change in input layout invalidates the chosen emit func. */
+      if (attr->inputstride != src->stride || attr->inputsize != src->size)
+         vf->emit = choose_emit_func;
+
+      attr->inputstride = src->stride;
+      attr->inputsize = src->size;
+      attr->do_insert = attr->insert[src->size - 1];
+      attr->inputptr = ((uint8_t *)src->data) + start * src->stride;
+   }
+}
+
+
+
+/* Emit count VB vertices to dest through the currently selected emit func. */
+void draw_vf_emit_vertices( struct draw_vertex_fetch *vf,
+		       unsigned count,
+		       void *dest )
+{
+   uint8_t *out = (uint8_t *) dest;
+   vf->emit( vf, count, out );
+}
+
+
+/* Extract a named attribute from a hardware vertex.  Will have to
+ * reverse any viewport transformation, swizzling or other conversions
+ * which may have been applied.
+ *
+ * This is mainly required for on-the-fly vertex translations to
+ * swrast format.
+ */
+void draw_vf_get_attr( struct draw_vertex_fetch *vf,
+		  const void *vertex,
+		  GLenum attr, 
+		  const float *dflt,
+		  float *dest )
+{
+   const struct draw_vf_attr *cur = vf->attr;
+   const struct draw_vf_attr *const end = cur + vf->attr_count;
+
+   /* Linear search for the slot holding the requested attribute. */
+   for ( ; cur != end; cur++) {
+      if (cur->attrib != attr)
+         continue;
+      cur->extract( cur, dest, (uint8_t *)vertex + cur->vertoffset );
+      return;
+   }
+   /* Not part of this vertex layout: hand back the caller's default
+    * (four floats).
+    */
+   _mesa_memcpy( dest, dflt, 4*sizeof(float));
+}
+
+
+
+
+struct draw_vertex_fetch *draw_vf_create( boolean allow_viewport_emits )
+{
+   struct draw_vertex_fetch *vf = CALLOC_STRUCT(draw_vertex_fetch);
+   float chan_max;
+   unsigned i;
+
+   /* Allocation can fail; report that to the caller instead of crashing. */
+   if (!vf)
+      return NULL;
+
+   /* Give every attribute slot a back-pointer to its owner. */
+   for (i = 0; i < DRAW_VF_ATTRIB_MAX; i++)
+      vf->attr[i].vf = vf;
+
+   vf->allow_viewport_emits = allow_viewport_emits;
+
+   /* chan_scale: 255, 65535 or 1 in all four slots, chosen by CHAN_TYPE. */
+   switch(CHAN_TYPE) {
+   case GL_UNSIGNED_BYTE:
+      chan_max = 255.0;
+      break;
+   case GL_UNSIGNED_SHORT:
+      chan_max = 65535.0;
+      break;
+   default:
+      chan_max = 1.0;
+      break;
+   }
+   for (i = 0; i < 4; i++)
+      vf->chan_scale[i] = chan_max;
+
+   vf->identity[0] = 0.0;
+   vf->identity[1] = 0.0;
+   vf->identity[2] = 0.0;
+   vf->identity[3] = 1.0;
+
+   /* Codegen only if built with USE_SSE_ASM and MESA_NO_CODEGEN is unset. */
+   vf->codegen_emit = NULL;
+#ifdef USE_SSE_ASM
+   if (!_mesa_getenv("MESA_NO_CODEGEN"))
+      vf->codegen_emit = draw_vf_generate_sse_emit;
+#endif
+
+   return vf;
+}
+
+
+void draw_vf_destroy( struct draw_vertex_fetch *vf )
+{
+   struct draw_vf_fastpath *fp, *tmp;
+
+   /* Like free(), accept a null pointer and do nothing. */
+   if (!vf)
+      return;
+   for (fp = vf->fastpath ; fp ; fp = tmp) {
+      tmp = fp->next;   /* fetch the link before fp is released */
+      FREE(fp->attr);
+      /* KW: At the moment, fp->func is constrained to be allocated by
+       * _mesa_exec_alloc(), as the hardwired fastpaths in
+       * t_vertex_generic.c are handled specially.  It would be nice
+       * to unify them, but this probably won't change until this
+       * module gets another overhaul.
+       */
+      _mesa_exec_free((void *) fp->func);
+      FREE(fp);
+   }
+   FREE(vf);
+}
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
new file mode 100644
index 0000000000..279570aad5
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -0,0 +1,249 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+#ifndef DRAW_VF_H
+#define DRAW_VF_H
+
+
+#include "pipe/p_compiler.h"
+#include "math/m_vector.h"
+
+/* Attribute indices; DRAW_VF_ATTRIB_MAX sizes the attr[] and lookup[] arrays. */
+enum {
+   DRAW_VF_ATTRIB_POS = 0,
+   DRAW_VF_ATTRIB_WEIGHT = 1,
+   DRAW_VF_ATTRIB_NORMAL = 2,
+   DRAW_VF_ATTRIB_COLOR0 = 3,
+   DRAW_VF_ATTRIB_COLOR1 = 4,
+   DRAW_VF_ATTRIB_FOG = 5,
+   DRAW_VF_ATTRIB_COLOR_INDEX = 6,
+   DRAW_VF_ATTRIB_EDGEFLAG = 7,
+   DRAW_VF_ATTRIB_TEX0 = 8,
+   DRAW_VF_ATTRIB_TEX1 = 9,
+   DRAW_VF_ATTRIB_TEX2 = 10,
+   DRAW_VF_ATTRIB_TEX3 = 11,
+   DRAW_VF_ATTRIB_TEX4 = 12,
+   DRAW_VF_ATTRIB_TEX5 = 13,
+   DRAW_VF_ATTRIB_TEX6 = 14,
+   DRAW_VF_ATTRIB_TEX7 = 15,
+   DRAW_VF_ATTRIB_VAR0 = 16,
+   DRAW_VF_ATTRIB_VAR1 = 17,
+   DRAW_VF_ATTRIB_VAR2 = 18,
+   DRAW_VF_ATTRIB_VAR3 = 19,
+   DRAW_VF_ATTRIB_VAR4 = 20,
+   DRAW_VF_ATTRIB_VAR5 = 21,
+   DRAW_VF_ATTRIB_VAR6 = 22,
+   DRAW_VF_ATTRIB_VAR7 = 23,
+   DRAW_VF_ATTRIB_POINTSIZE = 24,
+   DRAW_VF_ATTRIB_BFC0 = 25,
+   DRAW_VF_ATTRIB_BFC1 = 26,
+   DRAW_VF_ATTRIB_CLIP_POS = 27,
+   DRAW_VF_ATTRIB_VERTEX_HEADER = 28,
+   DRAW_VF_ATTRIB_MAX = 29
+};
+
+enum draw_vf_attr_format {   /* values index draw_vf_format_info[] */
+   EMIT_1F,
+   EMIT_2F,
+   EMIT_3F,
+   EMIT_4F,
+   EMIT_2F_VIEWPORT,		/**< do viewport transform and emit */
+   EMIT_3F_VIEWPORT,		/**< do viewport transform and emit */
+   EMIT_4F_VIEWPORT,		/**< do viewport transform and emit */
+   EMIT_3F_XYW,			/**< for projective texture */
+   EMIT_1UB_1F,			/**< for fog coordinate */
+   EMIT_3UB_3F_RGB,		/**< for specular color */
+   EMIT_3UB_3F_BGR,		/**< for specular color */
+   EMIT_4UB_4F_RGBA,		/**< for color */
+   EMIT_4UB_4F_BGRA,		/**< for color */
+   EMIT_4UB_4F_ARGB,		/**< for color */
+   EMIT_4UB_4F_ABGR,		/**< for color */
+   EMIT_4CHAN_4F_RGBA,		/**< for swrast color */
+   EMIT_PAD,			/**< leave a hole of 'offset' bytes */
+   EMIT_MAX			/**< number of formats; length of draw_vf_format_info[] */
+};
+
+struct draw_vf_attr_map {   /* one entry of the layout passed to draw_vf_set_vertex_attributes */
+   unsigned attrib;                   /**< attribute index (into lookup[]); not read for EMIT_PAD */
+   enum draw_vf_attr_format format;   /**< output format, or EMIT_PAD */
+   unsigned offset;                   /**< bytes to skip; only read for EMIT_PAD */
+};
+
+struct draw_vertex_fetch;
+
+
+void 
+draw_vf_set_vp_matrix( struct draw_vertex_fetch *vf,
+                       const float *viewport );
+
+void 
+draw_vf_set_vp_scale_translate( struct draw_vertex_fetch *vf,
+				const float *scale,
+				const float *translate );
+
+unsigned 
+draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
+                               const struct draw_vf_attr_map *map,
+                               unsigned nr, 
+                               unsigned vertex_stride );
+
+void 
+draw_vf_set_sources( struct draw_vertex_fetch *vf,
+		     GLvector4f * const attrib[],
+		     unsigned start ); 
+
+void 
+draw_vf_emit_vertices( struct draw_vertex_fetch *vf,
+		       unsigned count,
+		       void *dest );
+
+void 
+draw_vf_get_attr( struct draw_vertex_fetch *vf,
+		  const void *vertex,
+		  GLenum attr, 
+		  const float *dflt,
+		  float *dest );
+
+struct draw_vertex_fetch *
+draw_vf_create( boolean allow_viewport_emits );
+
+void 
+draw_vf_destroy( struct draw_vertex_fetch *vf );
+
+
+
+/***********************************************************************
+ * Internal functions and structs:
+ */
+
+struct draw_vf_attr;
+/* Reads the attribute stored at 'v' in a vertex and writes floats to 'out'. */
+typedef void (*draw_vf_extract_func)( const struct draw_vf_attr *a, 
+				      float *out, 
+				      const uint8_t *v );
+/* Converts the floats at 'in' and stores the result at 'v' in a vertex. */
+typedef void (*draw_vf_insert_func)( const struct draw_vf_attr *a, 
+				     uint8_t *v, 
+				     const float *in );
+/* Emits 'count' vertices to 'dest'. */
+typedef void (*draw_vf_emit_func)( struct draw_vertex_fetch *vf,
+      				   unsigned count, 
+      				   uint8_t *dest );
+
+
+
+/**
+ * Describes how to convert/move a vertex attribute from a vertex
+ * array to a vertex structure.
+ */
+struct draw_vf_attr
+{
+   struct draw_vertex_fetch *vf;   /**< owning fetch object, set in draw_vf_create */
+
+   unsigned format;          /**< enum draw_vf_attr_format value */
+   unsigned inputsize;       /**< 'size' of the source GLvector4f */
+   unsigned inputstride;     /**< 'stride' of the source GLvector4f */
+   unsigned vertoffset;      /**< position of the attrib in the vertex struct */
+
+   unsigned attrib;          /**< which vertex attrib (0=position, etc) */
+   unsigned vertattrsize;    /**< size of the attribute in bytes */
+
+   uint8_t *inputptr;        /**< source data, already advanced to the start vertex */
+   const draw_vf_insert_func *insert;   /**< the format's insert funcs, indexed by inputsize - 1 */
+   draw_vf_insert_func do_insert;       /**< insert[inputsize - 1], picked in draw_vf_set_sources */
+   draw_vf_extract_func extract;        /**< the format's extract func */
+};
+
+struct draw_vertex_fetch
+{
+   struct draw_vf_attr attr[DRAW_VF_ATTRIB_MAX];   /**< first attr_count entries are in use */
+   unsigned attr_count;
+   unsigned vertex_stride;   /**< caller-supplied stride, or the summed attribute sizes */
+
+   struct draw_vf_attr *lookup[DRAW_VF_ATTRIB_MAX];   /**< attrib index -> entry of attr[], or null */
+   
+   draw_vf_emit_func emit;   /**< current emit func; choose_emit_func until one is selected */
+
+   /* Parameters and constants for codegen:
+    */
+   boolean allow_viewport_emits;   /**< asserted by the draw_vf_set_vp_* functions */
+   float vp[8];		/**< viewport scale in [0..3], translation in [4..7] */
+   float chan_scale[4];   /**< 255, 65535 or 1 depending on CHAN_TYPE */
+   float identity[4];   /**< {0, 0, 0, 1} */
+
+   struct draw_vf_fastpath *fastpath;   /**< head of the list of registered fastpaths */
+   
+   void (*codegen_emit)( struct draw_vertex_fetch *vf );   /**< optional code generator, may be null */
+};
+
+
+struct draw_vf_attr_type {   /* per-attribute layout key stored in a fastpath */
+   unsigned format;   /**< copy of draw_vf_attr::format */
+   unsigned size;     /**< copy of draw_vf_attr::inputsize */
+   unsigned stride;   /**< copy of draw_vf_attr::inputstride */
+   unsigned offset;   /**< copy of draw_vf_attr::vertoffset */
+};
+
+struct draw_vf_fastpath {   /* one remembered layout -> emit func association */
+   unsigned vertex_stride;
+   unsigned attr_count;      /**< number of entries in attr[] */
+   boolean match_strides;    /**< when set, strides must match too for a hit */
+
+   struct draw_vf_attr_type *attr;   /**< heap array, released in draw_vf_destroy */
+
+   draw_vf_emit_func func;           /**< emit func to install on a match */
+   struct draw_vf_fastpath *next;    /**< next entry in the list, or null */
+};
+
+
+void 
+draw_vf_register_fastpath( struct draw_vertex_fetch *vtx,
+                           boolean match_strides );
+
+void 
+draw_vf_generic_emit( struct draw_vertex_fetch *vf,
+                      unsigned count,
+                      uint8_t *v );
+
+void 
+draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf );
+
+void 
+draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf );
+
+
+struct draw_vf_format_info {   /* per-format table entry, indexed by enum draw_vf_attr_format */
+   const char *name;                /**< printed by the DBG output */
+   draw_vf_extract_func extract;    /**< copied into draw_vf_attr::extract */
+   draw_vf_insert_func insert[4];   /**< one insert func per input size 1..4 */
+   const unsigned attrsize;         /**< bytes the attribute occupies in the vertex */
+};
+
+extern const struct draw_vf_format_info draw_vf_format_info[EMIT_MAX];   /* declared here, defined once in a .c file */
+
+
+#endif
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
new file mode 100644
index 0000000000..19e6c587e5
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -0,0 +1,983 @@
+
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "context.h"
+#include "colormac.h"
+#include "simple_list.h"
+
+#include "pipe/p_compiler.h"
+
+#include "draw_vf.h"
+
+
+/*
+ * These functions take the NDC coordinates pointed to by 'in', apply the
+ * NDC->Viewport mapping and store the results at 'v'.
+ */
+
+static INLINE void insert_4f_viewport_4( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   /* vp[0..3]: viewport scale, vp[4..7]: viewport translation */
+   const float *vp = a->vf->vp;
+   float *dst = (float *) v;
+
+   dst[0] = vp[0] * in[0] + vp[4];
+   dst[1] = vp[1] * in[1] + vp[5];
+   dst[2] = vp[2] * in[2] + vp[6];
+   dst[3] = in[3];
+}
+
+static INLINE void insert_4f_viewport_3( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   /* vp[0..3]: viewport scale, vp[4..7]: viewport translation */
+   const float *vp = a->vf->vp;
+   float *dst = (float *) v;
+
+   dst[0] = vp[0] * in[0] + vp[4];
+   dst[1] = vp[1] * in[1] + vp[5];
+   dst[2] = vp[2] * in[2] + vp[6];
+   dst[3] = 1;
+}
+
+static INLINE void insert_4f_viewport_2( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   /* vp[0..3]: viewport scale, vp[4..7]: viewport translation */
+   const float *vp = a->vf->vp;
+   float *dst = (float *) v;
+
+   dst[0] = vp[0] * in[0] + vp[4];
+   dst[1] = vp[1] * in[1] + vp[5];
+   dst[2] = vp[6];
+   dst[3] = 1;
+}
+
+static INLINE void insert_4f_viewport_1( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   /* vp[0..3]: viewport scale, vp[4..7]: viewport translation */
+   const float *vp = a->vf->vp;
+   float *dst = (float *) v;
+
+   dst[0] = vp[0] * in[0] + vp[4];
+   dst[1] = vp[5];
+   dst[2] = vp[6];
+   dst[3] = 1;
+}
+
+static INLINE void insert_3f_viewport_3( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   /* vp[0..3]: viewport scale, vp[4..7]: viewport translation */
+   const float *vp = a->vf->vp;
+   float *dst = (float *) v;
+
+   dst[0] = vp[0] * in[0] + vp[4];
+   dst[1] = vp[1] * in[1] + vp[5];
+   dst[2] = vp[2] * in[2] + vp[6];
+}
+
+static INLINE void insert_3f_viewport_2( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   /* Only x and y are supplied; z is implicitly 0, so it maps to trans[2]. */
+   float *out = (float *)v;
+   const float *scale = a->vf->vp;
+   const float *trans = a->vf->vp + 4;
+   out[0] = scale[0] * in[0] + trans[0];
+   out[1] = scale[1] * in[1] + trans[1];
+   out[2] =                    trans[2];
+}
+
+static INLINE void insert_3f_viewport_1( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   /* vp[0..3]: viewport scale, vp[4..7]: viewport translation */
+   const float *vp = a->vf->vp;
+   float *dst = (float *) v;
+
+   dst[0] = vp[0] * in[0] + vp[4];
+   dst[1] = vp[5];
+   dst[2] = vp[6];
+}
+
+static INLINE void insert_2f_viewport_2( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   /* vp[0..3]: viewport scale, vp[4..7]: viewport translation */
+   const float *vp = a->vf->vp;
+   float *dst = (float *) v;
+
+   dst[0] = vp[0] * in[0] + vp[4];
+   dst[1] = vp[1] * in[1] + vp[5];
+}
+
+static INLINE void insert_2f_viewport_1( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   /* vp[0..3]: viewport scale, vp[4..7]: viewport translation */
+   const float *vp = a->vf->vp;
+   float *dst = (float *) v;
+
+   dst[0] = vp[0] * in[0] + vp[4];
+   dst[1] = vp[5];
+}
+
+
+/*
+ * These functions do the same as above, except for the viewport mapping.
+ */
+
+static INLINE void insert_4f_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;   /* unused */
+
+   dst[0] = in[0];
+   dst[1] = in[1];
+   dst[2] = in[2];
+   dst[3] = in[3];
+}
+
+static INLINE void insert_4f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;   /* unused */
+
+   dst[0] = in[0];
+   dst[1] = in[1];
+   dst[2] = in[2];
+   dst[3] = 1;   /* missing w defaults to 1 */
+}
+
+static INLINE void insert_4f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;   /* unused */
+
+   dst[0] = in[0];
+   dst[1] = in[1];
+   dst[2] = 0;   /* missing z defaults to 0 */
+   dst[3] = 1;   /* missing w defaults to 1 */
+}
+
+static INLINE void insert_4f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;   /* unused */
+
+   dst[0] = in[0];
+   dst[1] = 0;   /* missing y, z default to 0 */
+   dst[2] = 0;
+   dst[3] = 1;   /* missing w defaults to 1 */
+}
+
+static INLINE void insert_3f_xyw_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;   /* unused */
+
+   dst[0] = in[0];
+   dst[1] = in[1];
+   dst[2] = in[3];   /* third output float is w; z is dropped */
+}
+
+static INLINE void insert_3f_xyw_err( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a; (void) v; (void) in;   /* nothing is read or written */
+   _mesa_exit(1);                   /* terminates; presumably used for inputs without a w -- TODO confirm in the format table */
+}
+
+static INLINE void insert_3f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;   /* unused */
+
+   dst[0] = in[0];
+   dst[1] = in[1];
+   dst[2] = in[2];
+}
+
+static INLINE void insert_3f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;   /* unused */
+
+   dst[0] = in[0];
+   dst[1] = in[1];
+   dst[2] = 0;   /* missing z defaults to 0 */
+}
+
+static INLINE void insert_3f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;   /* unused */
+
+   dst[0] = in[0];
+   dst[1] = 0;   /* missing y, z default to 0 */
+   dst[2] = 0;
+}
+
+
+static INLINE void insert_2f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;   /* unused */
+
+   dst[0] = in[0];
+   dst[1] = in[1];
+}
+
+static INLINE void insert_2f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;   /* unused */
+
+   dst[0] = in[0];
+   dst[1] = 0;   /* missing y defaults to 0 */
+}
+
+static INLINE void insert_1f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;   /* unused */
+
+   dst[0] = in[0];
+}
+
+static INLINE void insert_null( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a; (void) v; (void) in;   /* deliberate no-op: nothing is read or written */
+}
+
+static INLINE void insert_4chan_4f_rgba_4( const struct draw_vf_attr *a, uint8_t *v, 
+					   const float *in )
+{
+   GLchan *chan = (GLchan *) v;   /* output is an array of GLchan */
+   (void) a;
+   UNCLAMPED_FLOAT_TO_CHAN(chan[0], in[0]);
+   UNCLAMPED_FLOAT_TO_CHAN(chan[1], in[1]);
+   UNCLAMPED_FLOAT_TO_CHAN(chan[2], in[2]);
+   UNCLAMPED_FLOAT_TO_CHAN(chan[3], in[3]);
+}
+
+static INLINE void insert_4chan_4f_rgba_3( const struct draw_vf_attr *a, uint8_t *v, 
+					   const float *in )
+{
+   GLchan *chan = (GLchan *) v;   /* output is an array of GLchan */
+   (void) a;
+   UNCLAMPED_FLOAT_TO_CHAN(chan[0], in[0]);
+   UNCLAMPED_FLOAT_TO_CHAN(chan[1], in[1]);
+   UNCLAMPED_FLOAT_TO_CHAN(chan[2], in[2]);
+   chan[3] = CHAN_MAX;   /* missing fourth component */
+}
+
+static INLINE void insert_4chan_4f_rgba_2( const struct draw_vf_attr *a, uint8_t *v, 
+					   const float *in )
+{
+   GLchan *chan = (GLchan *) v;   /* output is an array of GLchan */
+   (void) a;
+   UNCLAMPED_FLOAT_TO_CHAN(chan[0], in[0]);
+   UNCLAMPED_FLOAT_TO_CHAN(chan[1], in[1]);
+   chan[2] = 0;          /* missing third component */
+   chan[3] = CHAN_MAX;   /* missing fourth component */
+}
+
+static INLINE void insert_4chan_4f_rgba_1( const struct draw_vf_attr *a, uint8_t *v, 
+					   const float *in )
+{
+   GLchan *chan = (GLchan *) v;   /* output is an array of GLchan */
+   (void) a;
+   UNCLAMPED_FLOAT_TO_CHAN(chan[0], in[0]);
+   chan[1] = 0;          /* missing second and third components */
+   chan[2] = 0;
+   chan[3] = CHAN_MAX;   /* missing fourth component */
+}
+
+static INLINE void insert_4ub_4f_rgba_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;   /* unused; 4 floats become 4 bytes in the same order */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
+}
+
+static INLINE void insert_4ub_4f_rgba_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;   /* unused; 3 floats in, bytes written in the same order */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+   v[3] = 0xff;   /* missing fourth component */
+}
+
+static INLINE void insert_4ub_4f_rgba_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;   /* unused; 2 floats in, bytes written in the same order */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[2] = 0;      /* missing third component */
+   v[3] = 0xff;   /* missing fourth component */
+}
+
+static INLINE void insert_4ub_4f_rgba_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;   /* unused; 1 float in */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   v[1] = 0;      /* missing second component */
+   v[2] = 0;      /* missing third component */
+   v[3] = 0xff;   /* missing fourth component */
+}
+
+static INLINE void insert_4ub_4f_bgra_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;   /* unused; in[0..3] land in bytes 2, 1, 0, 3 */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
+}
+
+static INLINE void insert_4ub_4f_bgra_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;   /* unused; in[0..2] land in bytes 2, 1, 0 */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+   v[3] = 0xff;   /* missing fourth component */
+}
+
+static INLINE void insert_4ub_4f_bgra_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;   /* unused; in[0..1] land in bytes 2, 1 */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[0] = 0;      /* missing third component */
+   v[3] = 0xff;   /* missing fourth component */
+}
+
+static INLINE void insert_4ub_4f_bgra_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;   /* unused; in[0] lands in byte 2 */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   v[1] = 0;      /* missing second component */
+   v[0] = 0;      /* missing third component */
+   v[3] = 0xff;   /* missing fourth component */
+}
+
+static INLINE void insert_4ub_4f_argb_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;   /* unused; in[0..3] land in bytes 1, 2, 3, 0 */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
+}
+
+static INLINE void insert_4ub_4f_argb_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;   /* unused; in[0..2] land in bytes 1, 2, 3 */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
+   v[0] = 0xff;   /* missing fourth component */
+}
+
+static INLINE void insert_4ub_4f_argb_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;   /* unused; in[0..1] land in bytes 1, 2 */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   v[3] = 0x00;   /* missing third component */
+   v[0] = 0xff;   /* missing fourth component */
+}
+
+static INLINE void insert_4ub_4f_argb_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;   /* unused; in[0] lands in byte 1 */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   v[2] = 0x00;   /* missing second component */
+   v[3] = 0x00;   /* missing third component */
+   v[0] = 0xff;   /* missing fourth component */
+}
+
+static INLINE void insert_4ub_4f_abgr_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;   /* unused; in[0..3] land in bytes 3, 2, 1, 0 */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
+}
+
+static INLINE void insert_4ub_4f_abgr_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;   /* unused; in[0..2] land in bytes 3, 2, 1 */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
+   v[0] = 0xff;   /* missing fourth component */
+}
+
+static INLINE void insert_4ub_4f_abgr_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;   /* unused; in[0..1] land in bytes 3, 2 */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   v[1] = 0x00;   /* missing third component */
+   v[0] = 0xff;   /* missing fourth component */
+}
+
+static INLINE void insert_4ub_4f_abgr_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;   /* unused; in[0] lands in byte 3 */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   v[2] = 0x00;   /* missing second component */
+   v[1] = 0x00;   /* missing third component */
+   v[0] = 0xff;   /* missing fourth component */
+}
+
+static INLINE void insert_3ub_3f_rgb_3( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;   /* unused; 3 floats become 3 bytes in the same order */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+}
+
+static INLINE void insert_3ub_3f_rgb_2( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;   /* unused; 2 floats in, bytes written in the same order */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[2] = 0;   /* missing third component */
+}
+
+static INLINE void insert_3ub_3f_rgb_1( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;   /* unused; 1 float in */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   v[1] = 0;   /* missing second component */
+   v[2] = 0;   /* missing third component */
+}
+
+static INLINE void insert_3ub_3f_bgr_3( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;   /* unused; in[0..2] land in bytes 2, 1, 0 */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+}
+
+static INLINE void insert_3ub_3f_bgr_2( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;   /* unused; in[0..1] land in bytes 2, 1 */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[0] = 0;   /* missing third component */
+}
+
+static INLINE void insert_3ub_3f_bgr_1( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;   /* unused; in[0] lands in byte 2 */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   v[1] = 0;   /* missing second component */
+   v[0] = 0;   /* missing third component */
+}
+
+
+static INLINE void insert_1ub_1f_1( const struct draw_vf_attr *a, uint8_t *v, 
+				    const float *in )
+{
+   (void) a;   /* unused; a single float becomes a single byte */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+}
+
+
+/***********************************************************************
+ * Functions to perform the reverse operations to the above, for
+ * swrast translation and clip-interpolation.
+ * 
+ * Currently always extracts a full 4 floats.
+ */
+
+static void extract_4f_viewport( const struct draw_vf_attr *a, float *out, 
+				 const uint8_t *v )
+{
+   const float *vp = a->vf->vp;   /* [0..3] scale, [4..7] translate */
+   const float *src = (const float *) v;
+
+   /* Undo the viewport mapping.  Included for completeness; the
+    * position coordinate is usually handled differently during
+    * clipping.
+    */
+   out[0] = (src[0] - vp[4]) / vp[0];
+   out[1] = (src[1] - vp[5]) / vp[1];
+   out[2] = (src[2] - vp[6]) / vp[2];
+   out[3] = src[3];
+}
+
+/* Undo the viewport scale/translate on three stored floats; out[3] is set to 1. */
+static void extract_3f_viewport( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   const float *vp = a->vf->vp;          /* scale at vp[0..], translate at vp[4..] */
+   const float *src = (const float *)v;
+   unsigned i;
+
+   for (i = 0; i < 3; i++)
+      out[i] = (src[i] - vp[4 + i]) / vp[i];
+
+   out[3] = 1.0f;
+}
+
+
+/* Undo the viewport scale/translate on two stored floats; out[2]=0, out[3]=1. */
+static void extract_2f_viewport( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   const float *vp = a->vf->vp;          /* scale at vp[0..], translate at vp[4..] */
+   const float *src = (const float *)v;
+   unsigned i;
+
+   for (i = 0; i < 2; i++)
+      out[i] = (src[i] - vp[4 + i]) / vp[i];
+   out[2] = 0.0f;
+   out[3] = 1.0f;
+}
+
+
+static void extract_4f( const struct draw_vf_attr *a, float *out, const uint8_t *v  )
+{
+   /* The vertex already holds four floats: copy them out verbatim. */
+   const float *src = (const float *)v;
+   unsigned i;
+
+   for (i = 0; i < 4; i++)
+      out[i] = src[i];
+   (void) a;
+}
+
+static void extract_3f_xyw( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   /* Stored as (x, y, w): the third float goes to out[3], out[2] reads back as 0. */
+   const float *src = (const float *)v;
+   (void) a;
+   out[0] = src[0];
+   out[1] = src[1];
+   out[2] = 0.0f;
+   out[3] = src[2];
+}
+
+
+static void extract_3f( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   /* Three stored floats; the missing fourth component reads back as 1. */
+   const float *src = (const float *)v;
+   unsigned i;
+   (void) a;
+   for (i = 0; i < 3; i++)
+      out[i] = src[i];
+   out[3] = 1.0f;
+}
+
+
+static void extract_2f( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   /* Two stored floats; the third and fourth read back as 0 and 1. */
+   const float *src = (const float *)v;
+   (void) a;
+   out[0] = src[0];
+   out[1] = src[1];
+   out[2] = 0.0f;
+   out[3] = 1.0f;
+}
+
+static void extract_1f( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   /* One stored float; the remaining components read back as (0, 0, 1). */
+   const float *src = (const float *)v;
+   (void) a;
+   out[0] = src[0];
+   out[1] = 0.0f;
+   out[2] = 0.0f;
+   out[3] = 1.0f;
+}
+
+/* Convert four stored GLchan values back to floats, same component order. */
+static void extract_4chan_4f_rgba( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   const GLchan *chan = (const GLchan *)v;
+   unsigned i;
+
+   for (i = 0; i < 4; i++)
+      out[i] = CHAN_TO_FLOAT(chan[i]);
+
+   (void) a;
+}
+
+/* Expand four stored bytes to floats, same component order. */
+static void extract_4ub_4f_rgba( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   unsigned i;
+
+   for (i = 0; i < 4; i++)
+      out[i] = UBYTE_TO_FLOAT(v[i]);
+   (void) a;
+}
+
+/* Expand four stored bytes to floats, swapping the first and third component. */
+static void extract_4ub_4f_bgra( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   static const unsigned char dst[4] = { 2, 1, 0, 3 };   /* byte i -> out[dst[i]] */
+   unsigned i;
+   (void) a;
+   for (i = 0; i < 4; i++)
+      out[dst[i]] = UBYTE_TO_FLOAT(v[i]);
+}
+
+/* Expand four stored bytes to floats; byte 0 feeds out[3], bytes 1..3 feed out[0..2]. */
+static void extract_4ub_4f_argb( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   static const unsigned char dst[4] = { 3, 0, 1, 2 };   /* byte i -> out[dst[i]] */
+   unsigned i;
+   (void) a;
+   for (i = 0; i < 4; i++)
+      out[dst[i]] = UBYTE_TO_FLOAT(v[i]);
+}
+
+/* Expand four stored bytes to floats in fully reversed component order. */
+static void extract_4ub_4f_abgr( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   static const unsigned char dst[4] = { 3, 2, 1, 0 };   /* byte i -> out[dst[i]] */
+   unsigned i;
+   (void) a;
+   for (i = 0; i < 4; i++)
+      out[dst[i]] = UBYTE_TO_FLOAT(v[i]);
+}
+
+/* Expand three stored bytes to floats; the fourth component reads back as 1. */
+static void extract_3ub_3f_rgb( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   unsigned i;
+   (void) a;
+   for (i = 0; i < 3; i++)
+      out[i] = UBYTE_TO_FLOAT(v[i]);
+   out[3] = 1.0f;
+}
+
+/* Expand three stored bytes to floats in reversed order; out[3] reads back as 1. */
+static void extract_3ub_3f_bgr( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   unsigned i;
+   (void) a;
+   for (i = 0; i < 3; i++)
+      out[2 - i] = UBYTE_TO_FLOAT(v[i]);   /* byte 0 -> out[2], ..., byte 2 -> out[0] */
+   out[3] = 1.0f;
+}
+
+static void extract_1ub_1f( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   /* One stored byte; the remaining components read back as (0, 0, 1). */
+   out[0] = UBYTE_TO_FLOAT(v[0]);
+   out[1] = out[2] = 0.0f;
+   out[3] = 1.0f;
+   (void) a;
+}
+
+
+const struct draw_vf_format_info draw_vf_format_info[EMIT_MAX] = 
+{  /* one entry per emit format; presumably indexed by the EMIT_* value (TODO confirm) */
+   { "1f",			/* format name */
+     extract_1f,		/* reads the stored attribute back as four floats */
+     { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },  /* writers; presumably indexed by input size - 1 */
+     sizeof(float) },		/* presumably the attribute's size in the vertex, in bytes */
+
+   { "2f",
+     extract_2f,
+     { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
+     2 * sizeof(float) },
+
+   { "3f",
+     extract_3f,
+     { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
+     3 * sizeof(float) },
+
+   { "4f",
+     extract_4f,
+     { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
+     4 * sizeof(float) },
+
+   { "2f_viewport",
+     extract_2f_viewport,
+     { insert_2f_viewport_1, insert_2f_viewport_2, insert_2f_viewport_2,
+       insert_2f_viewport_2 },
+     2 * sizeof(float) },
+
+   { "3f_viewport",
+     extract_3f_viewport,
+     { insert_3f_viewport_1, insert_3f_viewport_2, insert_3f_viewport_3,
+       insert_3f_viewport_3 },
+     3 * sizeof(float) },
+
+   { "4f_viewport",
+     extract_4f_viewport,
+     { insert_4f_viewport_1, insert_4f_viewport_2, insert_4f_viewport_3,
+       insert_4f_viewport_4 }, 
+     4 * sizeof(float) },
+
+   { "3f_xyw",
+     extract_3f_xyw,
+     { insert_3f_xyw_err, insert_3f_xyw_err, insert_3f_xyw_err, 
+       insert_3f_xyw_4 },	/* only the last slot has a non-_err writer */
+     3 * sizeof(float) },
+
+   { "1ub_1f",
+     extract_1ub_1f,
+     { insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1 },
+     sizeof(uint8_t) },
+
+   { "3ub_3f_rgb",
+     extract_3ub_3f_rgb,
+     { insert_3ub_3f_rgb_1, insert_3ub_3f_rgb_2, insert_3ub_3f_rgb_3,
+       insert_3ub_3f_rgb_3 },
+     3 * sizeof(uint8_t) },
+
+   { "3ub_3f_bgr",
+     extract_3ub_3f_bgr,
+     { insert_3ub_3f_bgr_1, insert_3ub_3f_bgr_2, insert_3ub_3f_bgr_3,
+       insert_3ub_3f_bgr_3 },
+     3 * sizeof(uint8_t) },
+
+   { "4ub_4f_rgba",
+     extract_4ub_4f_rgba,
+     { insert_4ub_4f_rgba_1, insert_4ub_4f_rgba_2, insert_4ub_4f_rgba_3, 
+       insert_4ub_4f_rgba_4 },
+     4 * sizeof(uint8_t) },
+
+   { "4ub_4f_bgra",
+     extract_4ub_4f_bgra,
+     { insert_4ub_4f_bgra_1, insert_4ub_4f_bgra_2, insert_4ub_4f_bgra_3,
+       insert_4ub_4f_bgra_4 },
+     4 * sizeof(uint8_t) },
+
+   { "4ub_4f_argb",
+     extract_4ub_4f_argb,
+     { insert_4ub_4f_argb_1, insert_4ub_4f_argb_2, insert_4ub_4f_argb_3,
+       insert_4ub_4f_argb_4 },
+     4 * sizeof(uint8_t) },
+
+   { "4ub_4f_abgr",
+     extract_4ub_4f_abgr,
+     { insert_4ub_4f_abgr_1, insert_4ub_4f_abgr_2, insert_4ub_4f_abgr_3,
+       insert_4ub_4f_abgr_4 },
+     4 * sizeof(uint8_t) },
+
+   { "4chan_4f_rgba",
+     extract_4chan_4f_rgba,
+     { insert_4chan_4f_rgba_1, insert_4chan_4f_rgba_2, insert_4chan_4f_rgba_3,
+       insert_4chan_4f_rgba_4 },
+     4 * sizeof(GLchan) },
+
+   { "pad",			/* no extract/insert functions and zero size */
+     NULL,
+     { NULL, NULL, NULL, NULL },
+     0 }
+
+};
+
+
+
+    
+/***********************************************************************
+ * Hardwired fastpaths for emitting whole vertices or groups of
+ * vertices.  EMIT5 defines NAME(vf, count, v): for each vertex it runs the
+ * first NR of F0..F4 on attributes 0..NR-1, then advances their inputptr. */
+#define EMIT5(NR, F0, F1, F2, F3, F4, NAME)				\
+static void NAME( struct draw_vertex_fetch *vf,				\
+		  unsigned count,						\
+		  uint8_t *v )						\
+{									\
+   struct draw_vf_attr *a = vf->attr;				\
+   unsigned i;								\
+									\
+   for (i = 0 ; i < count ; i++, v += vf->vertex_stride) {		\
+      if (NR > 0) {							\
+	 F0( &a[0], v + a[0].vertoffset, (float *)a[0].inputptr );	\
+	 a[0].inputptr += a[0].inputstride;				\
+      }									\
+      									\
+      if (NR > 1) {							\
+	 F1( &a[1], v + a[1].vertoffset, (float *)a[1].inputptr );	\
+	 a[1].inputptr += a[1].inputstride;				\
+      }									\
+      									\
+      if (NR > 2) {							\
+	 F2( &a[2], v + a[2].vertoffset, (float *)a[2].inputptr );	\
+	 a[2].inputptr += a[2].inputstride;				\
+      }									\
+      									\
+      if (NR > 3) {							\
+	 F3( &a[3], v + a[3].vertoffset, (float *)a[3].inputptr );	\
+	 a[3].inputptr += a[3].inputstride;				\
+      }									\
+									\
+      if (NR > 4) {							\
+	 F4( &a[4], v + a[4].vertoffset, (float *)a[4].inputptr );	\
+	 a[4].inputptr += a[4].inputstride;				\
+      }									\
+   }									\
+}
+
+   
+#define EMIT2(F0, F1, NAME) EMIT5(2, F0, F1, insert_null, \
+				  insert_null, insert_null, NAME)
+/* EMIT2..EMIT4 pad the unused EMIT5 slots with insert_null; the NR argument keeps them from running. */
+#define EMIT3(F0, F1, F2, NAME) EMIT5(3, F0, F1, F2, insert_null, \
+				      insert_null, NAME)
+/* Four-attribute variant: only the fifth slot is padded. */
+#define EMIT4(F0, F1, F2, F3, NAME) EMIT5(4, F0, F1, F2, F3, \
+				          insert_null, NAME)
+   
+
+EMIT2(insert_3f_viewport_3, insert_4ub_4f_rgba_4, emit_viewport3_rgba4)
+EMIT2(insert_3f_viewport_3, insert_4ub_4f_bgra_4, emit_viewport3_bgra4)
+EMIT2(insert_3f_3, insert_4ub_4f_rgba_4, emit_xyz3_rgba4)
+/* Three attributes: 4 floats (viewport-transformed or plain), 4 packed bytes, 2 floats. */
+EMIT3(insert_4f_viewport_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_viewport4_rgba4_st2)
+EMIT3(insert_4f_viewport_4, insert_4ub_4f_bgra_4, insert_2f_2,  emit_viewport4_bgra4_st2)
+EMIT3(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_xyzw4_rgba4_st2)
+/* Four attributes: the same three followed by a second 2-float attribute. */
+EMIT4(insert_4f_viewport_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_viewport4_rgba4_st2_st2)
+EMIT4(insert_4f_viewport_4, insert_4ub_4f_bgra_4, insert_2f_2, insert_2f_2,  emit_viewport4_bgra4_st2_st2)
+EMIT4(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_xyzw4_rgba4_st2_st2)
+
+
+/* Use the codegen paths to select one of a number of hardwired
+ * fastpaths.
+ */
+void draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf )
+{
+   const struct draw_vf_attr *attr = vf->attr;
+   draw_vf_emit_func chosen = NULL;   /* stays NULL when nothing matches */
+
+   /* Does it fit a hardwired fastpath?  The attribute count picks a
+    * family; inside it every insert function has to match exactly.
+    */
+   if (vf->attr_count == 2) {
+      /* 2 attributes: 3 floats, then 4 packed bytes */
+      if (attr[0].do_insert == insert_3f_viewport_3 &&
+          attr[1].do_insert == insert_4ub_4f_bgra_4)
+         chosen = emit_viewport3_bgra4;
+      else if (attr[0].do_insert == insert_3f_viewport_3 &&
+               attr[1].do_insert == insert_4ub_4f_rgba_4)
+         chosen = emit_viewport3_rgba4;
+      else if (attr[0].do_insert == insert_3f_3 &&
+               attr[1].do_insert == insert_4ub_4f_rgba_4)
+         chosen = emit_xyz3_rgba4;
+   }
+   else if (vf->attr_count == 3 &&
+            attr[2].do_insert == insert_2f_2) {
+      /* 3 attributes: 4 floats, 4 packed bytes, 2 floats */
+      if (attr[1].do_insert == insert_4ub_4f_rgba_4 &&
+          attr[0].do_insert == insert_4f_viewport_4)
+         chosen = emit_viewport4_rgba4_st2;
+      else if (attr[1].do_insert == insert_4ub_4f_rgba_4 &&
+               attr[0].do_insert == insert_4f_4)
+         chosen = emit_xyzw4_rgba4_st2;
+      else if (attr[1].do_insert == insert_4ub_4f_bgra_4 &&
+               attr[0].do_insert == insert_4f_viewport_4)
+         chosen = emit_viewport4_bgra4_st2;
+   }
+   else if (vf->attr_count == 4 &&
+            attr[2].do_insert == insert_2f_2 &&
+            attr[3].do_insert == insert_2f_2) {
+      /* 4 attributes: 4 floats, 4 packed bytes, 2 floats, 2 floats */
+      if (attr[1].do_insert == insert_4ub_4f_rgba_4 &&
+          attr[0].do_insert == insert_4f_viewport_4)
+         chosen = emit_viewport4_rgba4_st2_st2;
+      else if (attr[1].do_insert == insert_4ub_4f_rgba_4 &&
+               attr[0].do_insert == insert_4f_4)
+         chosen = emit_xyzw4_rgba4_st2_st2;
+      else if (attr[1].do_insert == insert_4ub_4f_bgra_4 &&
+               attr[0].do_insert == insert_4f_viewport_4)
+         chosen = emit_viewport4_bgra4_st2_st2;
+   }
+
+   /* Publish the result (possibly NULL). */
+   vf->emit = chosen;
+}
+
+/***********************************************************************
+ * Generic (non-codegen) functions for whole vertices or groups of
+ * vertices
+ */
+
+/* Emit 'count' vertices starting at 'v' by calling every attribute's
+ * do_insert in turn; each input pointer advances by its own stride. */
+void draw_vf_generic_emit( struct draw_vertex_fetch *vf, unsigned count, uint8_t *v )
+{
+   struct draw_vf_attr *const first = vf->attr;
+   struct draw_vf_attr *const last = first + vf->attr_count;
+   const unsigned vertex_bytes = vf->vertex_stride;
+   struct draw_vf_attr *cur;
+
+   for ( ; count != 0 ; count--, v += vertex_bytes) {
+      for (cur = first; cur != last; cur++) {
+         float *src = (float *)cur->inputptr;
+         cur->inputptr += cur->inputstride;   /* bump the input before inserting */
+         cur->do_insert( cur, v + cur->vertoffset, src );
+      }
+   }
+}
+
+
diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
new file mode 100644
index 0000000000..2cf3a45ff9
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -0,0 +1,664 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "colormac.h"
+#include "simple_list.h"
+#include "enums.h"
+
+#include "pipe/p_compiler.h"
+
+#include "draw_vf.h"
+
+#if defined(USE_SSE_ASM)
+
+#include "x86/rtasm/x86sse.h"
+#include "x86/common_x86_asm.h"
+
+
+#define X    0
+#define Y    1
+#define Z    2
+#define W    3
+
+
+struct x86_program {
+   struct x86_function func;		/* the function being generated */
+
+   struct draw_vertex_fetch *vf;		/* vertex layout the code is generated for */
+   boolean inputs_safe;			/* TRUE: 3-float loads may over-read the source by one dword */
+   boolean outputs_safe;		/* TRUE: 3-float stores may write one extra dword */
+   boolean have_sse2;			/* picks the SSE2 or the MMX float->byte packing sequence */
+   
+   struct x86_reg identity;		/* register loaded from vf->identity */
+   struct x86_reg chan0;		/* register loaded from vf->chan_scale */
+};
+
+
+static struct x86_reg get_identity( struct x86_program *p )
+{
+   return p->identity;	/* the register build_vertex_emit loads from vf->identity */
+}
+
+/* Four floats wanted, four available: a single unaligned 4-float load. */
+static void emit_load4f_4( struct x86_program *p, struct x86_reg dest, struct x86_reg arg0 )
+{
+   struct x86_function *fn = &p->func;
+   sse_movups(fn, dest, arg0);
+}
+
+/* Load three floats from arg0 and complete the vector from the identity register. */
+static void emit_load4f_3( struct x86_program *p, struct x86_reg dest, struct x86_reg arg0 )
+{
+   struct x86_function *fn = &p->func;
+
+   /* Have to jump through some hoops; dest after each instruction:
+    *   c 0 0 0
+    *   c 0 0 1
+    *   0 0 c 1
+    *   a b c 1
+    */
+   sse_movss(fn, dest, x86_make_disp(arg0, 8));
+   sse_shufps(fn, dest, get_identity(p), SHUF(X,Y,Z,W) );
+   sse_shufps(fn, dest, dest, SHUF(Y,Z,X,W) );
+   sse_movlps(fn, dest, arg0);
+}
+
+static void emit_load4f_2( struct x86_program *p, struct x86_reg dest, struct x86_reg arg0 )
+{
+   struct x86_function *fn = &p->func;
+
+   /* Start from the identity vector, then overwrite the low two
+    * words with the source values: */
+   sse_movups(fn, dest, get_identity(p));
+   sse_movlps(fn, dest, arg0);
+}
+
+static void emit_load4f_1( struct x86_program *p, struct x86_reg dest, struct x86_reg arg0 )
+{
+   struct x86_function *fn = &p->func;
+
+   /* Pull in the low word, then swizzle in the identity register */
+   sse_movss(fn, dest, arg0);
+   sse_shufps(fn, dest, get_identity(p), SHUF(X,Y,Z,W) );
+}
+
+
+
+static void emit_load3f_3( struct x86_program *p, struct x86_reg dest, struct x86_reg arg0 )
+{
+   struct x86_function *fn = &p->func;
+
+   if (p->inputs_safe) {
+      /* A full 4-float load over-reads by 1 dword - potential SEGV
+       * if input is a vertex array, hence only when flagged safe.
+       */
+      sse_movups(fn, dest, arg0);
+      return;
+   }
+
+   /* c 0 0 0
+    * c c c c
+    * a b c c
+    */
+   sse_movss(fn, dest, x86_make_disp(arg0, 8));
+   sse_shufps(fn, dest, dest, SHUF(X,X,X,X));
+   sse_movlps(fn, dest, arg0);
+}
+
+/* Two-component source for a three-component load: defer to the
+ * four-wide version, which starts from the identity register. */
+static void emit_load3f_2( struct x86_program *p, struct x86_reg dest, struct x86_reg arg0 )
+{
+   emit_load4f_2(p, dest, arg0);
+}
+
+/* One-component source for a three-component load: defer to the
+ * four-wide version, which swizzles in the identity register. */
+static void emit_load3f_1( struct x86_program *p, struct x86_reg dest, struct x86_reg arg0 )
+{
+   emit_load4f_1(p, dest, arg0);
+}
+
+/* Two floats wanted, two available: load only the low two words. */
+static void emit_load2f_2( struct x86_program *p, struct x86_reg dest, struct x86_reg arg0 )
+{
+   struct x86_function *fn = &p->func;
+   sse_movlps(fn, dest, arg0);
+}
+
+/* One-component source for a two-component load: defer to the
+ * four-wide version, which swizzles in the identity register. */
+static void emit_load2f_1( struct x86_program *p, struct x86_reg dest, struct x86_reg arg0 )
+{
+   emit_load4f_1(p, dest, arg0);
+}
+
+/* A single float: scalar load, whatever the source size. */
+static void emit_load1f_1( struct x86_program *p, struct x86_reg dest, struct x86_reg arg0 )
+{
+   struct x86_function *fn = &p->func;
+   sse_movss(fn, dest, arg0);
+}
+
+static void (*load[4][4])( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 ) = {
+   { emit_load1f_1, 	/* row 0: 1 component wanted; columns = source size 1..4 */
+     emit_load1f_1, 
+     emit_load1f_1, 
+     emit_load1f_1 },
+
+   { emit_load2f_1, 	/* row 1: 2 components wanted */
+     emit_load2f_2, 
+     emit_load2f_2, 
+     emit_load2f_2 },
+
+   { emit_load3f_1, 	/* row 2: 3 components wanted */
+     emit_load3f_2, 
+     emit_load3f_3, 
+     emit_load3f_3 },
+
+   { emit_load4f_1, 	/* row 3: 4 components wanted */
+     emit_load4f_2, 
+     emit_load4f_3, 
+     emit_load4f_4 } 
+};
+
+/* Emit a load of 'sz' components into dest from a source holding 'src_sz'. */
+static void emit_load( struct x86_program *p, struct x86_reg dest, unsigned sz,
+		       struct x86_reg src, unsigned src_sz)
+{
+   /* Both sizes are 1-based; the load[][] table is 0-based. */
+   void (*emit)( struct x86_program *, struct x86_reg, struct x86_reg ) = load[sz-1][src_sz-1];
+   emit(p, dest, src);
+}
+
+/* Store all four floats of arg0 with one unaligned 4-float store. */
+static void emit_store4f( struct x86_program *p, struct x86_reg dest, struct x86_reg arg0 )
+{
+   struct x86_function *fn = &p->func;
+   sse_movups(fn, dest, arg0);
+}
+
+static void emit_store3f( struct x86_program *p, struct x86_reg dest, struct x86_reg arg0 )
+{
+   struct x86_function *fn = &p->func;
+
+   if (!p->outputs_safe) {
+      /* Alternate strategy - emit two, shuffle, emit one.
+       * NOTE! the shuffle is destructive: arg0 gets clobbered.
+       */
+      sse_movlps(fn, dest, arg0);
+      sse_shufps(fn, arg0, arg0, SHUF(Z,Z,Z,Z) );
+      sse_movss(fn, x86_make_disp(dest,8), arg0);
+      return;
+   }
+   /* Emit the extra dword anyway.  This may hurt writecombining,
+    * may cause other problems.
+    */
+   sse_movups(fn, dest, arg0);
+}
+
+/* Store the low two floats of arg0. */
+static void emit_store2f( struct x86_program *p, struct x86_reg dest, struct x86_reg arg0 )
+{
+   struct x86_function *fn = &p->func;
+   sse_movlps(fn, dest, arg0);
+}
+
+/* Store the lowest float of arg0. */
+static void emit_store1f( struct x86_program *p, struct x86_reg dest, struct x86_reg arg0 )
+{
+   struct x86_function *fn = &p->func;
+   sse_movss(fn, dest, arg0);
+}
+
+
+static void (*store[4])( struct x86_program *p, 
+			 struct x86_reg dest,
+			 struct x86_reg arg0 ) = 
+{
+   emit_store1f, 	/* [0]: emit_store() indexes this table with sz-1 */
+   emit_store2f, 
+   emit_store3f, 
+   emit_store4f 
+};
+
+/* Emit a store of 'sz' floats from register temp to dest. */
+static void emit_store( struct x86_program *p, struct x86_reg dest, unsigned sz,
+			struct x86_reg temp )
+{
+   /* sz is 1-based; the store[] table is 0-based. */
+   void (*emit)( struct x86_program *, struct x86_reg, struct x86_reg ) = store[sz-1];
+   emit(p, dest, temp);
+}
+
+/* Convert the four floats in temp to bytes and store them at dest; temp is clobbered. */
+static void emit_pack_store_4ub( struct x86_program *p, struct x86_reg dest, struct x86_reg temp )
+{
+   struct x86_function *fn = &p->func;
+
+   /* Scale by 255.0 */
+   sse_mulps(fn, temp, p->chan0);
+
+   if (!p->have_sse2) {
+      /* No SSE2: convert two floats at a time through MMX registers. */
+      struct x86_reg lo = x86_make_reg(file_MMX, 0);
+      struct x86_reg hi = x86_make_reg(file_MMX, 1);
+      sse_cvtps2pi(fn, lo, temp);
+      sse_movhlps(fn, temp, temp);
+      sse_cvtps2pi(fn, hi, temp);
+      mmx_packssdw(fn, lo, hi);
+      mmx_packuswb(fn, lo, lo);
+      mmx_movd(fn, dest, lo);
+      return;
+   }
+   sse2_cvtps2dq(fn, temp, temp);
+   sse2_packssdw(fn, temp, temp);
+   sse2_packuswb(fn, temp, temp);
+   sse_movss(fn, dest, temp);
+}
+
+static int get_offset( const void *a, const void *b )
+{
+   return (const char *)b - (const char *)a;	/* byte distance from a to b, narrowed to int */
+}
+
+/* Not much happens here.  Eventually use this function to try and
+ * avoid saving/reloading the source pointers each vertex (if some of
+ * them can fit in registers).
+ */
+static void get_src_ptr( struct x86_program *p, struct x86_reg srcREG,
+			 struct x86_reg vfREG, struct draw_vf_attr *a )
+{
+   /* a->inputptr is addressed relative to the vf pointer that the
+    * generated code holds in vfREG (assumes 'a' is embedded in *p->vf).
+    */
+   const int inputptr_ofs = get_offset(p->vf, &a->inputptr);
+   struct x86_reg inputptr_mem = x86_make_disp(vfREG, inputptr_ofs);
+
+   /* Load current a[j].inputptr */
+   x86_mov(&p->func, srcREG, inputptr_mem);
+}
+
+static void update_src_ptr( struct x86_program *p, struct x86_reg srcREG,
+			    struct x86_reg vfREG, struct draw_vf_attr *a )
+{
+   struct x86_reg inputptr_mem;
+
+   /* Zero stride: the pointer never moves, so emit nothing. */
+   if (!a->inputstride)
+      return;
+
+   inputptr_mem = x86_make_disp(vfREG, get_offset(p->vf, &a->inputptr));
+
+   /* add a[j].inputstride (hardcoded value - could just as easily
+    * pull the stride value from memory each time).
+    */
+   x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));
+
+   /* save new value of a[j].inputptr */
+   x86_mov(&p->func, inputptr_mem, srcREG);
+}
+
+
+/* Generate x86/SSE code for vf->emit.  Returns FALSE if some format can't be emitted.
+ * Lots of hardcoding:
+ * EAX -- pointer to current output vertex
+ * ECX -- input pointer of the current attribute;  EBP -- remaining vertex count
+ * ESI -- the draw_vertex_fetch (argument 1 of the generated function)
+ */
+static boolean build_vertex_emit( struct x86_program *p )
+{
+   struct draw_vertex_fetch *vf = p->vf;
+   unsigned j = 0;
+
+   struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
+   struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
+   struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
+   struct x86_reg vfESI = x86_make_reg(file_REG32, reg_SI);
+   struct x86_reg temp = x86_make_reg(file_XMM, 0);
+   struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
+   struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
+   uint8_t *fixup, *label;
+
+   /* Save the two registers the generated code uses besides EAX/ECX:
+    */
+   x86_push(&p->func, countEBP);
+   x86_push(&p->func, vfESI);
+
+
+   /* Get vertex count, compare to zero
+    */
+   x86_xor(&p->func, srcECX, srcECX);
+   x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
+   x86_cmp(&p->func, countEBP, srcECX);
+   fixup = x86_jcc_forward(&p->func, cc_E);
+
+   /* Initialize destination register. 
+    */
+   x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));
+
+   /* Move argument 1 (vf) into a reg:
+    */
+   x86_mov(&p->func, vfESI, x86_fn_arg(&p->func, 1));
+
+   
+   /* Possibly load vp0, vp1 for viewport calcs:
+    */
+   if (vf->allow_viewport_emits) {
+      sse_movups(&p->func, vp0, x86_make_disp(vfESI, get_offset(vf, &vf->vp[0])));
+      sse_movups(&p->func, vp1, x86_make_disp(vfESI, get_offset(vf, &vf->vp[4])));
+   }
+
+   /* always load, needed or not:
+    */
+   sse_movups(&p->func, p->chan0, x86_make_disp(vfESI, get_offset(vf, &vf->chan_scale[0])));
+   sse_movups(&p->func, p->identity, x86_make_disp(vfESI, get_offset(vf, &vf->identity[0])));
+
+   /* Note address for loop jump */
+   label = x86_get_label(&p->func);
+
+   /* Emit code for each of the attributes.  Currently routes
+    * everything through SSE registers, even when it might be more
+    * efficient to stick with regular old x86.  No optimization or
+    * other tricks - enough new ground to cover here just getting
+    * things working.
+    */
+   while (j < vf->attr_count) {
+      struct draw_vf_attr *a = &vf->attr[j];
+      struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);
+
+      /* Now, load an XMM reg from src, perhaps transform, then save.
+       * Could be shortcircuited in specific cases:
+       */
+      switch (a->format) {
+      case EMIT_1F:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
+	 emit_store(p, dest, 1, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case EMIT_2F:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+	 emit_store(p, dest, 2, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case EMIT_3F:
+	 /* Potentially the worst case - hardcode 2+1 copying:
+	  */
+	 if (0) {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	    emit_store(p, dest, 3, temp);
+	    update_src_ptr(p, srcECX, vfESI, a);
+	 }
+	 else {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+	    emit_store(p, dest, 2, temp);
+	    if (a->inputsize > 2) {
+	       emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
+	       emit_store(p, x86_make_disp(dest,8), 1, temp);
+	    }
+	    else {
+	       sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
+	    }
+	    update_src_ptr(p, srcECX, vfESI, a);
+	 }
+	 break;
+      case EMIT_4F:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 emit_store(p, dest, 4, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case EMIT_2F_VIEWPORT: 
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+	 sse_mulps(&p->func, temp, vp0);
+	 sse_addps(&p->func, temp, vp1);
+	 emit_store(p, dest, 2, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case EMIT_3F_VIEWPORT: 
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	 sse_mulps(&p->func, temp, vp0);
+	 sse_addps(&p->func, temp, vp1);
+	 emit_store(p, dest, 3, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case EMIT_4F_VIEWPORT: 
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_mulps(&p->func, temp, vp0);
+	 sse_addps(&p->func, temp, vp1);
+	 emit_store(p, dest, 4, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case EMIT_3F_XYW:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
+	 emit_store(p, dest, 3, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+
+      case EMIT_1UB_1F:	 
+	 /* Test for PAD3 + 1UB:
+	  */
+	 if (j > 0 &&
+	     a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
+	 {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
+	    sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
+	    emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
+	    update_src_ptr(p, srcECX, vfESI, a);
+	 }
+	 else {	/* a[-1] exists only when j > 0, so don't read it for the first attribute */
+	    _mesa_printf("Can't emit 1ub %x %x %d\n", a->vertoffset, j ? a[-1].vertoffset : 0, j ? a[-1].vertattrsize : 0 );
+	    return FALSE;
+	 }
+	 break;
+      case EMIT_3UB_3F_RGB:
+      case EMIT_3UB_3F_BGR:
+	 /* Test for 3UB + PAD1:
+	  */
+	 if (j == vf->attr_count - 1 ||
+	     a[1].vertoffset >= a->vertoffset + 4) {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	    if (a->format == EMIT_3UB_3F_BGR)
+	       sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
+	    emit_pack_store_4ub(p, dest, temp);
+	    update_src_ptr(p, srcECX, vfESI, a);
+	 }
+	 /* Test for 3UB + 1UB:
+	  */
+	 else if (j < vf->attr_count - 1 &&
+		  a[1].format == EMIT_1UB_1F &&
+		  a[1].vertoffset == a->vertoffset + 3) {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	    update_src_ptr(p, srcECX, vfESI, a);
+
+	    /* Make room for incoming value:
+	     */
+	    sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
+
+	    get_src_ptr(p, srcECX, vfESI, &a[1]);
+	    emit_load(p, temp, 1, x86_deref(srcECX), a[1].inputsize);
+	    update_src_ptr(p, srcECX, vfESI, &a[1]);
+
+	    /* Rearrange and possibly do BGR conversion:
+	     */
+	    if (a->format == EMIT_3UB_3F_BGR)
+	       sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
+	    else
+	       sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));
+
+	    emit_pack_store_4ub(p, dest, temp);
+	    j++;		/* NOTE: two attrs consumed */
+	 }
+	 else {
+	    _mesa_printf("Can't emit 3ub\n");
+	 }
+	 return FALSE;	/* 3ub formats always fail for now - add this later */
+	 break;
+
+      case EMIT_4UB_4F_RGBA:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case EMIT_4UB_4F_BGRA:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case EMIT_4UB_4F_ARGB:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case EMIT_4UB_4F_ABGR:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case EMIT_4CHAN_4F_RGBA:
+	 switch (CHAN_TYPE) {
+	 case GL_UNSIGNED_BYTE:
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	    emit_pack_store_4ub(p, dest, temp);
+	    update_src_ptr(p, srcECX, vfESI, a);
+	    break;
+	 case GL_FLOAT:
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	    emit_store(p, dest, 4, temp);
+	    update_src_ptr(p, srcECX, vfESI, a);
+	    break;
+	 case GL_UNSIGNED_SHORT:
+	 default:
+	    _mesa_printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
+	    return FALSE;
+	 }
+	 break;
+      default:
+	 _mesa_printf("unknown a[%d].format %d\n", j, a->format);
+	 return FALSE;	/* catch any new opcodes */
+      }
+      
+      /* Increment j by at least 1 - may have been incremented above also:
+       */
+      j++;
+   }
+
+   /* Next vertex:
+    */
+   x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vf->vertex_stride));
+
+   /* decr count, loop if not zero
+    */
+   x86_dec(&p->func, countEBP);
+   x86_test(&p->func, countEBP, countEBP); 
+   x86_jcc(&p->func, cc_NZ, label);
+
+   /* Exit mmx state?
+    */
+   if (p->func.need_emms)
+      mmx_emms(&p->func);
+
+   /* Land forward jump here (taken when the vertex count is zero):
+    */
+   x86_fixup_fwd_jump(&p->func, fixup);
+
+   /* Pop regs and return
+    */
+   x86_pop(&p->func, x86_get_base_reg(vfESI));
+   x86_pop(&p->func, countEBP);
+   x86_ret(&p->func);
+
+   vf->emit = (draw_vf_emit_func)x86_get_func(&p->func);
+   return TRUE;
+}
+
+
+
+void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
+{
+   struct x86_program prog;
+   boolean built;
+
+   /* No SSE on this CPU: clear the codegen hook and bail out. */
+   if (!cpu_has_xmm) {
+      vf->codegen_emit = NULL;
+      return;
+   }
+
+   _mesa_memset(&prog, 0, sizeof(prog));
+
+   prog.vf = vf;
+   prog.inputs_safe = 0;		/* for now */
+   prog.outputs_safe = 1;		/* for now */
+   prog.have_sse2 = cpu_has_xmm2;
+   prog.identity = x86_make_reg(file_XMM, 6);
+   prog.chan0 = x86_make_reg(file_XMM, 7);
+
+   x86_init_func(&prog.func);
+   built = build_vertex_emit(&prog);
+
+   /* Register the outcome either way: noting a failure means we
+    * don't keep trying to codegen an impossible state.
+    */
+   draw_vf_register_fastpath( vf, built );
+
+   if (!built)
+      x86_release_func(&prog.func);
+}
+
+#else
+
+void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
+{
+   /* Dummy version for when USE_SSE_ASM is not defined: does nothing and leaves vf untouched */
+}
+
+#endif
diff --git a/src/mesa/sources b/src/mesa/sources
index 97ef7e1936..e31d8cc466 100644
--- a/src/mesa/sources
+++ b/src/mesa/sources
@@ -175,6 +175,9 @@ DRAW_SOURCES = \
 	pipe/draw/draw_vertex_fetch.c \
 	pipe/draw/draw_vertex_shader.c \
 	pipe/draw/draw_vertex_shader_llvm.c \
+	pipe/draw/draw_vf.c \
+	pipe/draw/draw_vf_generic.c \
+	pipe/draw/draw_vf_sse.c \
 	pipe/draw/draw_wide_prims.c
 
 TGSIEXEC_SOURCES = \
-- 
cgit v1.2.3


From 169a74196fdca320cabd5cde33fda17683cc823d Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Mon, 28 Jan 2008 18:46:21 +0900
Subject: First stab at hooking draw_vbuf & vf.

Emit disabled for now. Tested with softpipe. Only one vertex at a time for now (slow).
---
 src/mesa/pipe/draw/draw_vbuf.c       | 183 ++++++++++++++++++++++++++++++++++-
 src/mesa/pipe/draw/draw_vf.c         |  18 +++-
 src/mesa/pipe/draw/draw_vf.h         |  46 +++++----
 src/mesa/pipe/draw/draw_vf_generic.c |   2 +-
 src/mesa/pipe/draw/draw_vf_sse.c     |  38 ++++----
 5 files changed, 241 insertions(+), 46 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c
index 1e260c6156..a3d0b5bca3 100644
--- a/src/mesa/pipe/draw/draw_vbuf.c
+++ b/src/mesa/pipe/draw/draw_vbuf.c
@@ -35,12 +35,15 @@
 
 
 #include <assert.h>
+#include <stddef.h>
 
-#include "pipe/draw/draw_vbuf.h"
-#include "pipe/draw/draw_private.h"
-#include "pipe/draw/draw_vertex.h"
 #include "pipe/p_util.h"
 
+#include "draw_vbuf.h"
+#include "draw_private.h"
+#include "draw_vertex.h"
+#include "draw_vf.h"
+
 
 /**
  * Vertex buffer emit stage.
@@ -55,6 +58,8 @@ struct vbuf_stage {
    /** Vertex size in bytes */
    unsigned vertex_size;
 
+   struct draw_vertex_fetch *vf;
+   
    /* FIXME: we have no guarantee that 'unsigned' is 32bit */
 
    /** Vertices in hardware format */
@@ -121,6 +126,7 @@ static INLINE void
 emit_vertex( struct vbuf_stage *vbuf,
              struct vertex_header *vertex )
 {
+#if 0
    const struct vertex_info *vinfo = vbuf->vinfo;
 
    uint i;
@@ -151,9 +157,11 @@ emit_vertex( struct vbuf_stage *vbuf,
       case EMIT_ALL:
          /* just copy the whole vertex as-is to the vbuf */
          assert(i == 0);
+         assert(j == 0);
          memcpy(vbuf->vertex_ptr, vertex, vinfo->size * 4);
          vbuf->vertex_ptr += vinfo->size;
-         return;
+         count += vinfo->size;
+         break;
       case EMIT_1F:
          *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
          count++;
@@ -192,6 +200,156 @@ emit_vertex( struct vbuf_stage *vbuf,
       }
    }
    assert(count == vinfo->size);
+#else
+   if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
+      if(vertex->vertex_id < vbuf->nr_vertices)
+	 return;
+      else
+	 fprintf(stderr, "Bad vertex id 0x%04x (>= 0x%04x)\n", 
+	         vertex->vertex_id, vbuf->nr_vertices);
+      return;
+   }
+      
+   vertex->vertex_id = vbuf->nr_vertices++;
+
+   draw_vf_set_data(vbuf->vf, vertex->data);
+   draw_vf_emit_vertices(vbuf->vf, 1, vbuf->vertex_ptr);
+
+   vbuf->vertex_ptr += vbuf->vertex_size/4;
+#endif
+}
+
+
+static void
+vbuf_set_vf_attributes(struct vbuf_stage *vbuf ) 
+{
+   const struct vertex_info *vinfo = vbuf->vinfo;
+   struct draw_vf_attr_map attrs[PIPE_MAX_SHADER_INPUTS];
+   uint i;
+   uint count = 0;  /* for debug/sanity */
+   unsigned nr_attrs = 0;
+   
+//   fprintf(stderr, "emit vertex %d to %p\n", 
+//           vbuf->nr_vertices, vbuf->vertex_ptr);
+
+#if 0
+   if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
+      if(vertex->vertex_id < vbuf->nr_vertices)
+	 return;
+      else
+	 fprintf(stderr, "Bad vertex id 0x%04x (>= 0x%04x)\n", 
+	         vertex->vertex_id, vbuf->nr_vertices);
+      return;
+   }
+#endif
+   
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      uint j = vinfo->src_index[i];
+      switch (vinfo->emit[i]) {
+      case EMIT_OMIT:
+         /* no-op */
+         break;
+      case EMIT_ALL: {
+         /* map the whole vertex as-is: pad over the header, then emit the data */
+	 unsigned k, s = vinfo->size;
+         assert(i == 0);
+         assert(j == 0);
+         /* copy the vertex header */
+         /* XXX: we actually don't copy the header, just pad it */
+	 attrs[nr_attrs].attrib = 0;
+	 attrs[nr_attrs].format = DRAW_EMIT_PAD;
+	 attrs[nr_attrs].offset = offsetof(struct vertex_header, data);
+	 s -= offsetof(struct vertex_header, data)/4;
+         count += offsetof(struct vertex_header, data)/4;
+	 nr_attrs++;
+	 /* copy the vertex data */
+         for(k = 0; k < (s & ~0x3); k += 4) {
+      	    attrs[nr_attrs].attrib = k/4;
+      	    attrs[nr_attrs].format = DRAW_EMIT_4F;
+      	    attrs[nr_attrs].offset = 0;
+      	    nr_attrs++;
+            count += 4;
+         }
+         /* tail */
+         /* XXX: actually, this shouldn't be needed */
+ 	 attrs[nr_attrs].attrib = k/4;
+  	 attrs[nr_attrs].offset = 0;
+         switch(s & 0x3) {
+         case 0:
+            break;
+         case 1:
+      	    attrs[nr_attrs].format = DRAW_EMIT_1F;
+      	    nr_attrs++;
+            count += 1;
+            break;
+         case 2:
+      	    attrs[nr_attrs].format = DRAW_EMIT_2F;
+      	    nr_attrs++;
+            count += 2;
+            break;
+         case 3:
+      	    attrs[nr_attrs].format = DRAW_EMIT_3F;
+      	    nr_attrs++;
+            count += 3;
+            break;
+         }
+         break;
+      }
+      case EMIT_1F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_1F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count++;
+         break;
+      case EMIT_1F_PSIZE:
+	 /* FIXME */
+	 assert(0);
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_PAD;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count++;
+         break;
+      case EMIT_2F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_2F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 2;
+         break;
+      case EMIT_3F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_3F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 3;
+         break;
+      case EMIT_4F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_4F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 4;
+         break;
+      case EMIT_4UB:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_4UB_4F_BGRA;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 1;
+         break;
+      default:
+         assert(0);
+      }
+   }
+   
+   assert(count == vinfo->size);  
+   
+   draw_vf_set_vertex_attributes(vbuf->vf, 
+                                 attrs, 
+                                 nr_attrs, 
+                                 vbuf->vertex_size);
 }
 
 
@@ -269,6 +427,7 @@ vbuf_set_prim( struct vbuf_stage *vbuf, uint newprim )
 
    vbuf->vinfo = vinfo;
    vbuf->vertex_size = vertex_size;
+   vbuf_set_vf_attributes(vbuf);
    
    if (!vbuf->vertices)
       vbuf_alloc_vertices(vbuf);
@@ -423,7 +582,12 @@ static void vbuf_destroy( struct draw_stage *stage )
 {
    struct vbuf_stage *vbuf = vbuf_stage( stage );
 
-   align_free( vbuf->indices );
+   if(vbuf->indices)
+      align_free( vbuf->indices );
+   
+   if(vbuf->vf)
+      draw_vf_destroy( vbuf->vf );
+
    FREE( stage );
 }
 
@@ -436,6 +600,9 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
 {
    struct vbuf_stage *vbuf = CALLOC_STRUCT(vbuf_stage);
 
+   if(!vbuf)
+      return NULL;
+   
    vbuf->stage.draw = draw;
    vbuf->stage.point = vbuf_first_point;
    vbuf->stage.line = vbuf_first_line;
@@ -450,11 +617,17 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
    vbuf->max_indices = render->max_indices;
    vbuf->indices = (ushort *)
       align_malloc( vbuf->max_indices * sizeof(vbuf->indices[0]), 16 );
+   if(!vbuf->indices)
+      vbuf_destroy(&vbuf->stage);
    
    vbuf->vertices = NULL;
    vbuf->vertex_ptr = vbuf->vertices;
 
    vbuf->prim = ~0;
    
+   vbuf->vf = draw_vf_create(FALSE);
+   if(!vbuf->vf)
+      vbuf_destroy(&vbuf->stage);
+   
    return &vbuf->stage;
 }
diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index f758460b5f..675974c6bc 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -162,7 +162,7 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 
    for (j = 0, i = 0; i < nr; i++) {
       const unsigned format = map[i].format;
-      if (format == EMIT_PAD) {
+      if (format == DRAW_EMIT_PAD) {
 	 if (DBG)
 	    _mesa_printf("%d: pad %d, offset %d\n", i,  
 			 map[i].offset, offset);  
@@ -261,6 +261,22 @@ void draw_vf_set_sources( struct draw_vertex_fetch *vf,
 }
 
 
+/* Point each attribute at its row of 'data' (single vertex, stride 0):
+ */
+void draw_vf_set_data( struct draw_vertex_fetch *vf,
+                       float data[][4])
+{
+   struct draw_vf_attr *a = vf->attr;
+   unsigned j;
+   
+   for (j = 0; j < vf->attr_count; j++) {
+      a[j].inputstride = 0; /* XXX: one-vertex-max ATM */ 
+      a[j].inputsize = 4;
+      a[j].do_insert = a[j].insert[4 - 1]; 
+      a[j].inputptr = (uint8_t *)&data[a[j].attrib][0];
+   }
+}
+
 
 /* Emit count VB vertices to dest.  
  */
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
index 279570aad5..7619c0ee27 100644
--- a/src/mesa/pipe/draw/draw_vf.h
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -29,9 +29,11 @@
 #define DRAW_VF_H
 
 
-#include "pipe/p_compiler.h"
 #include "math/m_vector.h"
 
+#include "pipe/p_compiler.h"
+#include "draw_vertex.h"
+
 
 enum {
    DRAW_VF_ATTRIB_POS = 0,
@@ -67,24 +69,24 @@ enum {
 };
 
 enum draw_vf_attr_format {
-   EMIT_1F,
-   EMIT_2F,
-   EMIT_3F,
-   EMIT_4F,
-   EMIT_2F_VIEWPORT,		/**< do viewport transform and emit */
-   EMIT_3F_VIEWPORT,		/**< do viewport transform and emit */
-   EMIT_4F_VIEWPORT,		/**< do viewport transform and emit */
-   EMIT_3F_XYW,			/**< for projective texture */
-   EMIT_1UB_1F,			/**< for fog coordinate */
-   EMIT_3UB_3F_RGB,		/**< for specular color */
-   EMIT_3UB_3F_BGR,		/**< for specular color */
-   EMIT_4UB_4F_RGBA,		/**< for color */
-   EMIT_4UB_4F_BGRA,		/**< for color */
-   EMIT_4UB_4F_ARGB,		/**< for color */
-   EMIT_4UB_4F_ABGR,		/**< for color */
-   EMIT_4CHAN_4F_RGBA,		/**< for swrast color */
-   EMIT_PAD,			/**< leave a hole of 'offset' bytes */
-   EMIT_MAX
+   DRAW_EMIT_1F,
+   DRAW_EMIT_2F,
+   DRAW_EMIT_3F,
+   DRAW_EMIT_4F,
+   DRAW_EMIT_2F_VIEWPORT,		/**< do viewport transform and emit */
+   DRAW_EMIT_3F_VIEWPORT,		/**< do viewport transform and emit */
+   DRAW_EMIT_4F_VIEWPORT,		/**< do viewport transform and emit */
+   DRAW_EMIT_3F_XYW,			/**< for projective texture */
+   DRAW_EMIT_1UB_1F,			/**< for fog coordinate */
+   DRAW_EMIT_3UB_3F_RGB,		/**< for specular color */
+   DRAW_EMIT_3UB_3F_BGR,		/**< for specular color */
+   DRAW_EMIT_4UB_4F_RGBA,		/**< for color */
+   DRAW_EMIT_4UB_4F_BGRA,		/**< for color */
+   DRAW_EMIT_4UB_4F_ARGB,		/**< for color */
+   DRAW_EMIT_4UB_4F_ABGR,		/**< for color */
+   DRAW_EMIT_4CHAN_4F_RGBA,		/**< for swrast color */
+   DRAW_EMIT_PAD,			/**< leave a hole of 'offset' bytes */
+   DRAW_EMIT_MAX
 };
 
 struct draw_vf_attr_map {
@@ -116,6 +118,10 @@ draw_vf_set_sources( struct draw_vertex_fetch *vf,
 		     GLvector4f * const attrib[],
 		     unsigned start ); 
 
+void 
+draw_vf_set_data( struct draw_vertex_fetch *vf,
+                  float data[][4]);
+
 void 
 draw_vf_emit_vertices( struct draw_vertex_fetch *vf,
 		       unsigned count,
@@ -243,7 +249,7 @@ struct draw_vf_format_info {
    const unsigned attrsize;
 };
 
-const struct draw_vf_format_info draw_vf_format_info[EMIT_MAX];
+const struct draw_vf_format_info draw_vf_format_info[DRAW_EMIT_MAX];
 
 
 #endif
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
index 19e6c587e5..42effc0c65 100644
--- a/src/mesa/pipe/draw/draw_vf_generic.c
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -735,7 +735,7 @@ static void extract_1ub_1f( const struct draw_vf_attr *a, float *out, const uint
 }
 
 
-const struct draw_vf_format_info draw_vf_format_info[EMIT_MAX] = 
+const struct draw_vf_format_info draw_vf_format_info[DRAW_EMIT_MAX] = 
 {
    { "1f",
      extract_1f,
diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
index 2cf3a45ff9..a7019a47e6 100644
--- a/src/mesa/pipe/draw/draw_vf_sse.c
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -398,19 +398,19 @@ static boolean build_vertex_emit( struct x86_program *p )
        * Could be shortcircuited in specific cases:
        */
       switch (a->format) {
-      case EMIT_1F:
+      case DRAW_EMIT_1F:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
 	 emit_store(p, dest, 1, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_2F:
+      case DRAW_EMIT_2F:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
 	 emit_store(p, dest, 2, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_3F:
+      case DRAW_EMIT_3F:
 	 /* Potentially the worst case - hardcode 2+1 copying:
 	  */
 	 if (0) {
@@ -433,13 +433,13 @@ static boolean build_vertex_emit( struct x86_program *p )
 	    update_src_ptr(p, srcECX, vfESI, a);
 	 }
 	 break;
-      case EMIT_4F:
+      case DRAW_EMIT_4F:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 	 emit_store(p, dest, 4, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_2F_VIEWPORT: 
+      case DRAW_EMIT_2F_VIEWPORT: 
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
 	 sse_mulps(&p->func, temp, vp0);
@@ -447,7 +447,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 emit_store(p, dest, 2, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_3F_VIEWPORT: 
+      case DRAW_EMIT_3F_VIEWPORT: 
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
 	 sse_mulps(&p->func, temp, vp0);
@@ -455,7 +455,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 emit_store(p, dest, 3, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_4F_VIEWPORT: 
+      case DRAW_EMIT_4F_VIEWPORT: 
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 	 sse_mulps(&p->func, temp, vp0);
@@ -463,7 +463,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 emit_store(p, dest, 4, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_3F_XYW:
+      case DRAW_EMIT_3F_XYW:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 	 sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
@@ -471,7 +471,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
 
-      case EMIT_1UB_1F:	 
+      case DRAW_EMIT_1UB_1F:	 
 	 /* Test for PAD3 + 1UB:
 	  */
 	 if (j > 0 &&
@@ -488,15 +488,15 @@ static boolean build_vertex_emit( struct x86_program *p )
 	    return FALSE;
 	 }
 	 break;
-      case EMIT_3UB_3F_RGB:
-      case EMIT_3UB_3F_BGR:
+      case DRAW_EMIT_3UB_3F_RGB:
+      case DRAW_EMIT_3UB_3F_BGR:
 	 /* Test for 3UB + PAD1:
 	  */
 	 if (j == vf->attr_count - 1 ||
 	     a[1].vertoffset >= a->vertoffset + 4) {
 	    get_src_ptr(p, srcECX, vfESI, a);
 	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
-	    if (a->format == EMIT_3UB_3F_BGR)
+	    if (a->format == DRAW_EMIT_3UB_3F_BGR)
 	       sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
 	    emit_pack_store_4ub(p, dest, temp);
 	    update_src_ptr(p, srcECX, vfESI, a);
@@ -504,7 +504,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 /* Test for 3UB + 1UB:
 	  */
 	 else if (j < vf->attr_count - 1 &&
-		  a[1].format == EMIT_1UB_1F &&
+		  a[1].format == DRAW_EMIT_1UB_1F &&
 		  a[1].vertoffset == a->vertoffset + 3) {
 	    get_src_ptr(p, srcECX, vfESI, a);
 	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
@@ -520,7 +520,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 
 	    /* Rearrange and possibly do BGR conversion:
 	     */
-	    if (a->format == EMIT_3UB_3F_BGR)
+	    if (a->format == DRAW_EMIT_3UB_3F_BGR)
 	       sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
 	    else
 	       sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));
@@ -534,34 +534,34 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 return FALSE;	/* add this later */
 	 break;
 
-      case EMIT_4UB_4F_RGBA:
+      case DRAW_EMIT_4UB_4F_RGBA:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 	 emit_pack_store_4ub(p, dest, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_4UB_4F_BGRA:
+      case DRAW_EMIT_4UB_4F_BGRA:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 	 sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
 	 emit_pack_store_4ub(p, dest, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_4UB_4F_ARGB:
+      case DRAW_EMIT_4UB_4F_ARGB:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 	 sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
 	 emit_pack_store_4ub(p, dest, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_4UB_4F_ABGR:
+      case DRAW_EMIT_4UB_4F_ABGR:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 	 sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
 	 emit_pack_store_4ub(p, dest, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_4CHAN_4F_RGBA:
+      case DRAW_EMIT_4CHAN_4F_RGBA:
 	 switch (CHAN_TYPE) {
 	 case GL_UNSIGNED_BYTE:
 	    get_src_ptr(p, srcECX, vfESI, a);
-- 
cgit v1.2.3


From 25d2ffc6697fcd60edc9596f778d8901083f7755 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 28 Jan 2008 10:14:13 +0000
Subject: gallium: remove dead code from draw_vf*

---
 src/mesa/pipe/draw/Makefile          |   2 +
 src/mesa/pipe/draw/draw_vbuf.c       |   2 +-
 src/mesa/pipe/draw/draw_vf.c         |  90 +-------
 src/mesa/pipe/draw/draw_vf.h         |  18 +-
 src/mesa/pipe/draw/draw_vf_generic.c | 420 +----------------------------------
 src/mesa/pipe/draw/draw_vf_sse.c     |  51 -----
 6 files changed, 9 insertions(+), 574 deletions(-)
 create mode 100644 src/mesa/pipe/draw/Makefile

diff --git a/src/mesa/pipe/draw/Makefile b/src/mesa/pipe/draw/Makefile
new file mode 100644
index 0000000000..451911a354
--- /dev/null
+++ b/src/mesa/pipe/draw/Makefile
@@ -0,0 +1,2 @@
+default:
+	cd .. ; make
diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c
index a3d0b5bca3..8ca225c65a 100644
--- a/src/mesa/pipe/draw/draw_vbuf.c
+++ b/src/mesa/pipe/draw/draw_vbuf.c
@@ -625,7 +625,7 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
 
    vbuf->prim = ~0;
    
-   vbuf->vf = draw_vf_create(FALSE);
+   vbuf->vf = draw_vf_create();
    if(!vbuf->vf)
       vbuf_destroy(&vbuf->stage);
    
diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index 675974c6bc..deedfc7bc7 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -177,7 +177,6 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 	 vf->attr[j].attrib = map[i].attrib;
 	 vf->attr[j].format = format;
 	 vf->attr[j].insert = draw_vf_format_info[format].insert;
-	 vf->attr[j].extract = draw_vf_format_info[format].extract;
 	 vf->attr[j].vertattrsize = draw_vf_format_info[format].attrsize;
 	 vf->attr[j].vertoffset = offset;
 	 
@@ -201,41 +200,6 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 
 
-void draw_vf_set_vp_matrix( struct draw_vertex_fetch *vf,
-		       const float *viewport )
-{
-   assert(vf->allow_viewport_emits);
-
-   /* scale */
-   vf->vp[0] = viewport[MAT_SX];
-   vf->vp[1] = viewport[MAT_SY];
-   vf->vp[2] = viewport[MAT_SZ];
-   vf->vp[3] = 1.0;
-
-   /* translate */
-   vf->vp[4] = viewport[MAT_TX];
-   vf->vp[5] = viewport[MAT_TY];
-   vf->vp[6] = viewport[MAT_TZ];
-   vf->vp[7] = 0.0;
-}
-
-void draw_vf_set_vp_scale_translate( struct draw_vertex_fetch *vf,
-				const float *scale,
-				const float *translate )
-{
-   assert(vf->allow_viewport_emits);
-
-   vf->vp[0] = scale[0];
-   vf->vp[1] = scale[1];
-   vf->vp[2] = scale[2];
-   vf->vp[3] = scale[3];
-
-   vf->vp[4] = translate[0];
-   vf->vp[5] = translate[1];
-   vf->vp[6] = translate[2];
-   vf->vp[7] = translate[3];
-}
-
 
 /* Set attribute pointers, adjusted for start position:
  */
@@ -288,39 +252,10 @@ void draw_vf_emit_vertices( struct draw_vertex_fetch *vf,
 }
 
 
-/* Extract a named attribute from a hardware vertex.  Will have to
- * reverse any viewport transformation, swizzling or other conversions
- * which may have been applied.
- *
- * This is mainly required for on-the-fly vertex translations to
- * swrast format.
- */
-void draw_vf_get_attr( struct draw_vertex_fetch *vf,
-		  const void *vertex,
-		  GLenum attr, 
-		  const float *dflt,
-		  float *dest )
-{
-   const struct draw_vf_attr *a = vf->attr;
-   const unsigned attr_count = vf->attr_count;
-   unsigned j;
-
-   for (j = 0; j < attr_count; j++) {
-      if (a[j].attrib == attr) {
-	 a[j].extract( &a[j], dest, (uint8_t *)vertex + a[j].vertoffset );
-	 return;
-      }
-   }
-
-   /* Else return the value from ctx->Current.
-    */
-   _mesa_memcpy( dest, dflt, 4*sizeof(float));
-}
-
 
 
-struct draw_vertex_fetch *draw_vf_create( boolean allow_viewport_emits )
+struct draw_vertex_fetch *draw_vf_create( void )
 {
    struct draw_vertex_fetch *vf = CALLOC_STRUCT(draw_vertex_fetch);
    unsigned i;
@@ -328,29 +263,6 @@ struct draw_vertex_fetch *draw_vf_create( boolean allow_viewport_emits )
    for (i = 0; i < DRAW_VF_ATTRIB_MAX; i++)
       vf->attr[i].vf = vf;
 
-   vf->allow_viewport_emits = allow_viewport_emits;
-
-   switch(CHAN_TYPE) {
-   case GL_UNSIGNED_BYTE:
-      vf->chan_scale[0] = 255.0;
-      vf->chan_scale[1] = 255.0;
-      vf->chan_scale[2] = 255.0;
-      vf->chan_scale[3] = 255.0;
-      break;
-   case GL_UNSIGNED_SHORT:
-      vf->chan_scale[0] = 65535.0;
-      vf->chan_scale[1] = 65535.0;
-      vf->chan_scale[2] = 65535.0;
-      vf->chan_scale[3] = 65535.0;
-      break;
-   default:
-      vf->chan_scale[0] = 1.0;
-      vf->chan_scale[1] = 1.0;
-      vf->chan_scale[2] = 1.0;
-      vf->chan_scale[3] = 1.0;
-      break;
-   }
-
    vf->identity[0] = 0.0;
    vf->identity[1] = 0.0;
    vf->identity[2] = 0.0;
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
index 7619c0ee27..c6a8fe0d53 100644
--- a/src/mesa/pipe/draw/draw_vf.h
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -73,9 +73,6 @@ enum draw_vf_attr_format {
    DRAW_EMIT_2F,
    DRAW_EMIT_3F,
    DRAW_EMIT_4F,
-   DRAW_EMIT_2F_VIEWPORT,		/**< do viewport transform and emit */
-   DRAW_EMIT_3F_VIEWPORT,		/**< do viewport transform and emit */
-   DRAW_EMIT_4F_VIEWPORT,		/**< do viewport transform and emit */
    DRAW_EMIT_3F_XYW,			/**< for projective texture */
    DRAW_EMIT_1UB_1F,			/**< for fog coordinate */
    DRAW_EMIT_3UB_3F_RGB,		/**< for specular color */
@@ -84,7 +81,6 @@ enum draw_vf_attr_format {
    DRAW_EMIT_4UB_4F_BGRA,		/**< for color */
    DRAW_EMIT_4UB_4F_ARGB,		/**< for color */
    DRAW_EMIT_4UB_4F_ABGR,		/**< for color */
-   DRAW_EMIT_4CHAN_4F_RGBA,		/**< for swrast color */
    DRAW_EMIT_PAD,			/**< leave a hole of 'offset' bytes */
    DRAW_EMIT_MAX
 };
@@ -98,14 +94,6 @@ struct draw_vf_attr_map {
 struct draw_vertex_fetch;
 
 
-void 
-draw_vf_set_vp_matrix( struct draw_vertex_fetch *vf,
-                       const float *viewport );
-
-void 
-draw_vf_set_vp_scale_translate( struct draw_vertex_fetch *vf,
-				const float *scale,
-				const float *translate );
 
 unsigned 
 draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
@@ -135,7 +123,7 @@ draw_vf_get_attr( struct draw_vertex_fetch *vf,
 		  float *dest );
 
 struct draw_vertex_fetch *
-draw_vf_create( boolean allow_viewport_emits );
+draw_vf_create( void );
 
 void 
 draw_vf_destroy( struct draw_vertex_fetch *vf );
@@ -196,9 +184,6 @@ struct draw_vertex_fetch
 
    /* Parameters and constants for codegen:
     */
-   boolean allow_viewport_emits;
-   float vp[8];		
-   float chan_scale[4];
    float identity[4];
 
    struct draw_vf_fastpath *fastpath;
@@ -244,7 +229,6 @@ draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf );
 
 struct draw_vf_format_info {
    const char *name;
-   draw_vf_extract_func extract;
    draw_vf_insert_func insert[4];
    const unsigned attrsize;
 };
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
index 42effc0c65..343428d26c 100644
--- a/src/mesa/pipe/draw/draw_vf_generic.c
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -36,125 +36,6 @@
 #include "draw_vf.h"
 
 
-/*
- * These functions take the NDC coordinates pointed to by 'in', apply the
- * NDC->Viewport mapping and store the results at 'v'.
- */
-
-static INLINE void insert_4f_viewport_4( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] = scale[1] * in[1] + trans[1];
-   out[2] = scale[2] * in[2] + trans[2];
-   out[3] = in[3];
-}
-
-static INLINE void insert_4f_viewport_3( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] = scale[1] * in[1] + trans[1];
-   out[2] = scale[2] * in[2] + trans[2];
-   out[3] = 1;
-}
-
-static INLINE void insert_4f_viewport_2( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] = scale[1] * in[1] + trans[1];
-   out[2] =                    trans[2];
-   out[3] = 1;
-}
-
-static INLINE void insert_4f_viewport_1( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] =                    trans[1];
-   out[2] =                    trans[2];
-   out[3] = 1;
-}
-
-static INLINE void insert_3f_viewport_3( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] = scale[1] * in[1] + trans[1];
-   out[2] = scale[2] * in[2] + trans[2];
-}
-
-static INLINE void insert_3f_viewport_2( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] = scale[1] * in[1] + trans[1];
-   out[2] = scale[2] * in[2] + trans[2];
-}
-
-static INLINE void insert_3f_viewport_1( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] =                    trans[1];
-   out[2] =                    trans[2];
-}
-
-static INLINE void insert_2f_viewport_2( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] = scale[1] * in[1] + trans[1];
-}
-
-static INLINE void insert_2f_viewport_1( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] = trans[1];
-}
-
-
-/*
- * These functions do the same as above, except for the viewport mapping.
- */
 
 static INLINE void insert_4f_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
 {
@@ -278,50 +159,6 @@ static INLINE void insert_null( const struct draw_vf_attr *a, uint8_t *v, const
    (void) a; (void) v; (void) in;
 }
 
-static INLINE void insert_4chan_4f_rgba_4( const struct draw_vf_attr *a, uint8_t *v, 
-					   const float *in )
-{
-   GLchan *c = (GLchan *)v;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[2], in[2]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[3], in[3]);
-}
-
-static INLINE void insert_4chan_4f_rgba_3( const struct draw_vf_attr *a, uint8_t *v, 
-					   const float *in )
-{
-   GLchan *c = (GLchan *)v;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[2], in[2]); 
-   c[3] = CHAN_MAX;
-}
-
-static INLINE void insert_4chan_4f_rgba_2( const struct draw_vf_attr *a, uint8_t *v, 
-					   const float *in )
-{
-   GLchan *c = (GLchan *)v;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
-   c[2] = 0;
-   c[3] = CHAN_MAX;
-}
-
-static INLINE void insert_4chan_4f_rgba_1( const struct draw_vf_attr *a, uint8_t *v, 
-					   const float *in )
-{
-   GLchan *c = (GLchan *)v;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
-   c[1] = 0;
-   c[2] = 0;
-   c[3] = CHAN_MAX;
-}
-
 static INLINE void insert_4ub_4f_rgba_4( const struct draw_vf_attr *a, uint8_t *v, 
 					 const float *in )
 {
@@ -545,291 +382,64 @@ static INLINE void insert_1ub_1f_1( const struct draw_vf_attr *a, uint8_t *v,
 }
 
 
-/***********************************************************************
- * Functions to perform the reverse operations to the above, for
- * swrast translation and clip-interpolation.
- * 
- * Currently always extracts a full 4 floats.
- */
-
-static void extract_4f_viewport( const struct draw_vf_attr *a, float *out, 
-				 const uint8_t *v )
-{
-   const float *in = (const float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   /* Although included for completeness, the position coordinate is
-    * usually handled differently during clipping.
-    */
-   out[0] = (in[0] - trans[0]) / scale[0];
-   out[1] = (in[1] - trans[1]) / scale[1];
-   out[2] = (in[2] - trans[2]) / scale[2];
-   out[3] = in[3];
-}
-
-static void extract_3f_viewport( const struct draw_vf_attr *a, float *out, 
-				 const uint8_t *v )
-{
-   const float *in = (const float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = (in[0] - trans[0]) / scale[0];
-   out[1] = (in[1] - trans[1]) / scale[1];
-   out[2] = (in[2] - trans[2]) / scale[2];
-   out[3] = 1;
-}
-
-
-static void extract_2f_viewport( const struct draw_vf_attr *a, float *out, 
-				 const uint8_t *v )
-{
-   const float *in = (const float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = (in[0] - trans[0]) / scale[0];
-   out[1] = (in[1] - trans[1]) / scale[1];
-   out[2] = 0;
-   out[3] = 1;
-}
-
-
-static void extract_4f( const struct draw_vf_attr *a, float *out, const uint8_t *v  )
-{
-   const float *in = (const float *)v;
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = in[2];
-   out[3] = in[3];
-}
-
-static void extract_3f_xyw( const struct draw_vf_attr *a, float *out, const uint8_t *v )
-{
-   const float *in = (const float *)v;
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = 0;
-   out[3] = in[2];
-}
-
-
-static void extract_3f( const struct draw_vf_attr *a, float *out, const uint8_t *v )
-{
-   const float *in = (const float *)v;
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = in[2];
-   out[3] = 1;
-}
-
-
-static void extract_2f( const struct draw_vf_attr *a, float *out, const uint8_t *v )
-{
-   const float *in = (const float *)v;
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = 0;
-   out[3] = 1;
-}
-
-static void extract_1f( const struct draw_vf_attr *a, float *out, const uint8_t *v )
-{
-   const float *in = (const float *)v;
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = 0;
-   out[2] = 0;
-   out[3] = 1;
-}
-
-static void extract_4chan_4f_rgba( const struct draw_vf_attr *a, float *out, 
-				   const uint8_t *v )
-{
-   GLchan *c = (GLchan *)v;
-   (void) a;
-
-   out[0] = CHAN_TO_FLOAT(c[0]);
-   out[1] = CHAN_TO_FLOAT(c[1]);
-   out[2] = CHAN_TO_FLOAT(c[2]);
-   out[3] = CHAN_TO_FLOAT(c[3]);
-}
-
-static void extract_4ub_4f_rgba( const struct draw_vf_attr *a, float *out, 
-				 const uint8_t *v )
-{
-   (void) a;
-   out[0] = UBYTE_TO_FLOAT(v[0]);
-   out[1] = UBYTE_TO_FLOAT(v[1]);
-   out[2] = UBYTE_TO_FLOAT(v[2]);
-   out[3] = UBYTE_TO_FLOAT(v[3]);
-}
-
-static void extract_4ub_4f_bgra( const struct draw_vf_attr *a, float *out, 
-				 const uint8_t *v )
-{
-   (void) a;
-   out[2] = UBYTE_TO_FLOAT(v[0]);
-   out[1] = UBYTE_TO_FLOAT(v[1]);
-   out[0] = UBYTE_TO_FLOAT(v[2]);
-   out[3] = UBYTE_TO_FLOAT(v[3]);
-}
-
-static void extract_4ub_4f_argb( const struct draw_vf_attr *a, float *out, 
-				 const uint8_t *v )
-{
-   (void) a;
-   out[3] = UBYTE_TO_FLOAT(v[0]);
-   out[0] = UBYTE_TO_FLOAT(v[1]);
-   out[1] = UBYTE_TO_FLOAT(v[2]);
-   out[2] = UBYTE_TO_FLOAT(v[3]);
-}
-
-static void extract_4ub_4f_abgr( const struct draw_vf_attr *a, float *out, 
-				 const uint8_t *v )
-{
-   (void) a;
-   out[3] = UBYTE_TO_FLOAT(v[0]);
-   out[2] = UBYTE_TO_FLOAT(v[1]);
-   out[1] = UBYTE_TO_FLOAT(v[2]);
-   out[0] = UBYTE_TO_FLOAT(v[3]);
-}
-
-static void extract_3ub_3f_rgb( const struct draw_vf_attr *a, float *out, 
-				const uint8_t *v )
-{
-   (void) a;
-   out[0] = UBYTE_TO_FLOAT(v[0]);
-   out[1] = UBYTE_TO_FLOAT(v[1]);
-   out[2] = UBYTE_TO_FLOAT(v[2]);
-   out[3] = 1;
-}
-
-static void extract_3ub_3f_bgr( const struct draw_vf_attr *a, float *out, 
-				const uint8_t *v )
-{
-   (void) a;
-   out[2] = UBYTE_TO_FLOAT(v[0]);
-   out[1] = UBYTE_TO_FLOAT(v[1]);
-   out[0] = UBYTE_TO_FLOAT(v[2]);
-   out[3] = 1;
-}
-
-static void extract_1ub_1f( const struct draw_vf_attr *a, float *out, const uint8_t *v )
-{
-   (void) a;
-   out[0] = UBYTE_TO_FLOAT(v[0]);
-   out[1] = 0;
-   out[2] = 0;
-   out[3] = 1;
-}
-
-
 const struct draw_vf_format_info draw_vf_format_info[DRAW_EMIT_MAX] = 
 {
    { "1f",
-     extract_1f,
      { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
      sizeof(float) },
 
    { "2f",
-     extract_2f,
      { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
      2 * sizeof(float) },
 
    { "3f",
-     extract_3f,
      { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
      3 * sizeof(float) },
 
    { "4f",
-     extract_4f,
      { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
      4 * sizeof(float) },
 
-   { "2f_viewport",
-     extract_2f_viewport,
-     { insert_2f_viewport_1, insert_2f_viewport_2, insert_2f_viewport_2,
-       insert_2f_viewport_2 },
-     2 * sizeof(float) },
-
-   { "3f_viewport",
-     extract_3f_viewport,
-     { insert_3f_viewport_1, insert_3f_viewport_2, insert_3f_viewport_3,
-       insert_3f_viewport_3 },
-     3 * sizeof(float) },
-
-   { "4f_viewport",
-     extract_4f_viewport,
-     { insert_4f_viewport_1, insert_4f_viewport_2, insert_4f_viewport_3,
-       insert_4f_viewport_4 }, 
-     4 * sizeof(float) },
-
    { "3f_xyw",
-     extract_3f_xyw,
      { insert_3f_xyw_err, insert_3f_xyw_err, insert_3f_xyw_err, 
        insert_3f_xyw_4 },
      3 * sizeof(float) },
 
    { "1ub_1f",
-     extract_1ub_1f,
      { insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1 },
      sizeof(uint8_t) },
 
    { "3ub_3f_rgb",
-     extract_3ub_3f_rgb,
      { insert_3ub_3f_rgb_1, insert_3ub_3f_rgb_2, insert_3ub_3f_rgb_3,
        insert_3ub_3f_rgb_3 },
      3 * sizeof(uint8_t) },
 
    { "3ub_3f_bgr",
-     extract_3ub_3f_bgr,
      { insert_3ub_3f_bgr_1, insert_3ub_3f_bgr_2, insert_3ub_3f_bgr_3,
        insert_3ub_3f_bgr_3 },
      3 * sizeof(uint8_t) },
 
    { "4ub_4f_rgba",
-     extract_4ub_4f_rgba,
      { insert_4ub_4f_rgba_1, insert_4ub_4f_rgba_2, insert_4ub_4f_rgba_3, 
        insert_4ub_4f_rgba_4 },
      4 * sizeof(uint8_t) },
 
    { "4ub_4f_bgra",
-     extract_4ub_4f_bgra,
      { insert_4ub_4f_bgra_1, insert_4ub_4f_bgra_2, insert_4ub_4f_bgra_3,
        insert_4ub_4f_bgra_4 },
      4 * sizeof(uint8_t) },
 
    { "4ub_4f_argb",
-     extract_4ub_4f_argb,
      { insert_4ub_4f_argb_1, insert_4ub_4f_argb_2, insert_4ub_4f_argb_3,
        insert_4ub_4f_argb_4 },
      4 * sizeof(uint8_t) },
 
    { "4ub_4f_abgr",
-     extract_4ub_4f_abgr,
      { insert_4ub_4f_abgr_1, insert_4ub_4f_abgr_2, insert_4ub_4f_abgr_3,
        insert_4ub_4f_abgr_4 },
      4 * sizeof(uint8_t) },
 
-   { "4chan_4f_rgba",
-     extract_4chan_4f_rgba,
-     { insert_4chan_4f_rgba_1, insert_4chan_4f_rgba_2, insert_4chan_4f_rgba_3,
-       insert_4chan_4f_rgba_4 },
-     4 * sizeof(GLchan) },
-
    { "pad",
-     NULL,
      { NULL, NULL, NULL, NULL },
      0 }
 
@@ -889,16 +499,10 @@ static void NAME( struct draw_vertex_fetch *vf,				\
 				          insert_null, NAME)
    
 
-EMIT2(insert_3f_viewport_3, insert_4ub_4f_rgba_4, emit_viewport3_rgba4)
-EMIT2(insert_3f_viewport_3, insert_4ub_4f_bgra_4, emit_viewport3_bgra4)
 EMIT2(insert_3f_3, insert_4ub_4f_rgba_4, emit_xyz3_rgba4)
 
-EMIT3(insert_4f_viewport_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_viewport4_rgba4_st2)
-EMIT3(insert_4f_viewport_4, insert_4ub_4f_bgra_4, insert_2f_2,  emit_viewport4_bgra4_st2)
 EMIT3(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_xyzw4_rgba4_st2)
 
-EMIT4(insert_4f_viewport_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_viewport4_rgba4_st2_st2)
-EMIT4(insert_4f_viewport_4, insert_4ub_4f_bgra_4, insert_2f_2, insert_2f_2,  emit_viewport4_bgra4_st2_st2)
 EMIT4(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_xyzw4_rgba4_st2_st2)
 
 
@@ -914,42 +518,26 @@ void draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf )
     */
    switch (vf->attr_count) {
    case 2:
-      if (vf->attr[0].do_insert == insert_3f_viewport_3) {
-	 if (vf->attr[1].do_insert == insert_4ub_4f_bgra_4) 
-	    func = emit_viewport3_bgra4;
-	 else if (vf->attr[1].do_insert == insert_4ub_4f_rgba_4) 
-	    func = emit_viewport3_rgba4;
-      }
-      else if (vf->attr[0].do_insert == insert_3f_3 &&
-	       vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
+      if (vf->attr[0].do_insert == insert_3f_3 &&
+	  vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
  	 func = emit_xyz3_rgba4; 
       }
       break;
    case 3:
       if (vf->attr[2].do_insert == insert_2f_2) {
 	 if (vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
-	    if (vf->attr[0].do_insert == insert_4f_viewport_4)
-	       func = emit_viewport4_rgba4_st2;
-	    else if (vf->attr[0].do_insert == insert_4f_4) 
+	    if (vf->attr[0].do_insert == insert_4f_4) 
 	       func = emit_xyzw4_rgba4_st2;
 	 }
-	 else if (vf->attr[1].do_insert == insert_4ub_4f_bgra_4 &&
-		  vf->attr[0].do_insert == insert_4f_viewport_4)
-	    func = emit_viewport4_bgra4_st2;
       }
       break;
    case 4:
       if (vf->attr[2].do_insert == insert_2f_2 &&
 	  vf->attr[3].do_insert == insert_2f_2) {
 	 if (vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
-	    if (vf->attr[0].do_insert == insert_4f_viewport_4)
-	       func = emit_viewport4_rgba4_st2_st2;
-	    else if (vf->attr[0].do_insert == insert_4f_4) 
+	    if (vf->attr[0].do_insert == insert_4f_4) 
 	       func = emit_xyzw4_rgba4_st2_st2;
 	 }
-	 else if (vf->attr[1].do_insert == insert_4ub_4f_bgra_4 &&
-		  vf->attr[0].do_insert == insert_4f_viewport_4)
-	    func = emit_viewport4_bgra4_st2_st2;
       }
       break;
    }
diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
index a7019a47e6..b238b542e7 100644
--- a/src/mesa/pipe/draw/draw_vf_sse.c
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -369,13 +369,6 @@ static boolean build_vertex_emit( struct x86_program *p )
    x86_mov(&p->func, vfESI, x86_fn_arg(&p->func, 1));
 
    
-   /* Possibly load vp0, vp1 for viewport calcs:
-    */
-   if (vf->allow_viewport_emits) {
-      sse_movups(&p->func, vp0, x86_make_disp(vfESI, get_offset(vf, &vf->vp[0])));
-      sse_movups(&p->func, vp1, x86_make_disp(vfESI, get_offset(vf, &vf->vp[4])));
-   }
-
    /* always load, needed or not:
     */
    sse_movups(&p->func, p->chan0, x86_make_disp(vfESI, get_offset(vf, &vf->chan_scale[0])));
@@ -439,30 +432,6 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 emit_store(p, dest, 4, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case DRAW_EMIT_2F_VIEWPORT: 
-	 get_src_ptr(p, srcECX, vfESI, a);
-	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
-	 sse_mulps(&p->func, temp, vp0);
-	 sse_addps(&p->func, temp, vp1);
-	 emit_store(p, dest, 2, temp);
-	 update_src_ptr(p, srcECX, vfESI, a);
-	 break;
-      case DRAW_EMIT_3F_VIEWPORT: 
-	 get_src_ptr(p, srcECX, vfESI, a);
-	 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
-	 sse_mulps(&p->func, temp, vp0);
-	 sse_addps(&p->func, temp, vp1);
-	 emit_store(p, dest, 3, temp);
-	 update_src_ptr(p, srcECX, vfESI, a);
-	 break;
-      case DRAW_EMIT_4F_VIEWPORT: 
-	 get_src_ptr(p, srcECX, vfESI, a);
-	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	 sse_mulps(&p->func, temp, vp0);
-	 sse_addps(&p->func, temp, vp1);
-	 emit_store(p, dest, 4, temp);
-	 update_src_ptr(p, srcECX, vfESI, a);
-	 break;
       case DRAW_EMIT_3F_XYW:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
@@ -561,26 +530,6 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 emit_pack_store_4ub(p, dest, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case DRAW_EMIT_4CHAN_4F_RGBA:
-	 switch (CHAN_TYPE) {
-	 case GL_UNSIGNED_BYTE:
-	    get_src_ptr(p, srcECX, vfESI, a);
-	    emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	    emit_pack_store_4ub(p, dest, temp);
-	    update_src_ptr(p, srcECX, vfESI, a);
-	    break;
-	 case GL_FLOAT:
-	    get_src_ptr(p, srcECX, vfESI, a);
-	    emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	    emit_store(p, dest, 4, temp);
-	    update_src_ptr(p, srcECX, vfESI, a);
-	    break;
-	 case GL_UNSIGNED_SHORT:
-	 default:
-	    _mesa_printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
-	    return FALSE;
-	 }
-	 break;
       default:
 	 _mesa_printf("unknown a[%d].format %d\n", j, a->format);
 	 return FALSE;	/* catch any new opcodes */
-- 
cgit v1.2.3


From 01ab6472cce1a5ff0186eb606ed3077d9008a53f Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 28 Jan 2008 10:46:52 +0000
Subject: gallium: fill in missing formats for vertex_fetch

---
 src/mesa/pipe/draw/draw_vertex_fetch.c | 220 ++++++++++++++++++++++++++++++---
 1 file changed, 203 insertions(+), 17 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index fb64723a19..0789dc8e8c 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -62,50 +62,236 @@ fetch_##NAME(const void *ptr, float *attrib)		\
    }							\
 }
 
+#define CVT_64_FLOAT   ((double *) ptr)[i]
 #define CVT_32_FLOAT   ((float *) ptr)[i]
+
+#define CVT_8_USCALED  (float) ((unsigned char *) ptr)[i]
+#define CVT_16_USCALED (float) ((unsigned short *) ptr)[i]
+#define CVT_32_USCALED (float) ((unsigned int *) ptr)[i]
+
+#define CVT_8_SSCALED  (float) ((signed char *) ptr)[i]
+#define CVT_16_SSCALED (float) ((short *) ptr)[i]
 #define CVT_32_SSCALED (float) ((int *) ptr)[i]
+
 #define CVT_8_UNORM    (float) ((unsigned char *) ptr)[i] / 255.0f
+#define CVT_16_UNORM   (float) ((unsigned short *) ptr)[i] / 65535.0f
+#define CVT_32_UNORM   (float) ((unsigned int *) ptr)[i] / 4294967295.0f
+
+#define CVT_8_SNORM    (float) ((signed char *) ptr)[i] / 127.0f
+#define CVT_16_SNORM   (float) ((short *) ptr)[i] / 32767.0f
+#define CVT_32_SNORM   (float) ((int *) ptr)[i] / 2147483647.0f
+
+FETCH_ATTRIB( R64G64B64A64_FLOAT,   4, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64B64_FLOAT,      3, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64_FLOAT,         2, CVT_64_FLOAT )
+FETCH_ATTRIB( R64_FLOAT,            1, CVT_64_FLOAT )
 
 FETCH_ATTRIB( R32G32B32A32_FLOAT,   4, CVT_32_FLOAT )
 FETCH_ATTRIB( R32G32B32_FLOAT,      3, CVT_32_FLOAT )
 FETCH_ATTRIB( R32G32_FLOAT,         2, CVT_32_FLOAT )
 FETCH_ATTRIB( R32_FLOAT,            1, CVT_32_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_USCALED, 4, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32B32_USCALED,    3, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32_USCALED,       2, CVT_32_USCALED )
+FETCH_ATTRIB( R32_USCALED,          1, CVT_32_USCALED )
+
 FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED )
 FETCH_ATTRIB( R32G32B32_SSCALED,    3, CVT_32_SSCALED )
 FETCH_ATTRIB( R32G32_SSCALED,       2, CVT_32_SSCALED )
 FETCH_ATTRIB( R32_SSCALED,          1, CVT_32_SSCALED )
+
+FETCH_ATTRIB( R32G32B32A32_UNORM, 4, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32B32_UNORM,    3, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32_UNORM,       2, CVT_32_UNORM )
+FETCH_ATTRIB( R32_UNORM,          1, CVT_32_UNORM )
+
+FETCH_ATTRIB( R32G32B32A32_SNORM, 4, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32B32_SNORM,    3, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32_SNORM,       2, CVT_32_SNORM )
+FETCH_ATTRIB( R32_SNORM,          1, CVT_32_SNORM )
+
+FETCH_ATTRIB( R16G16B16A16_USCALED, 4, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16B16_USCALED,    3, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16_USCALED,       2, CVT_16_USCALED )
+FETCH_ATTRIB( R16_USCALED,          1, CVT_16_USCALED )
+
+FETCH_ATTRIB( R16G16B16A16_SSCALED, 4, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16B16_SSCALED,    3, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16_SSCALED,       2, CVT_16_SSCALED )
+FETCH_ATTRIB( R16_SSCALED,          1, CVT_16_SSCALED )
+
+FETCH_ATTRIB( R16G16B16A16_UNORM, 4, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16B16_UNORM,    3, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16_UNORM,       2, CVT_16_UNORM )
+FETCH_ATTRIB( R16_UNORM,          1, CVT_16_UNORM )
+
+FETCH_ATTRIB( R16G16B16A16_SNORM, 4, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16B16_SNORM,    3, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16_SNORM,       2, CVT_16_SNORM )
+FETCH_ATTRIB( R16_SNORM,          1, CVT_16_SNORM )
+
+FETCH_ATTRIB( R8G8B8A8_USCALED,   4, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8B8_USCALED,     3, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8_USCALED,       2, CVT_8_USCALED )
+FETCH_ATTRIB( R8_USCALED,         1, CVT_8_USCALED )
+
+FETCH_ATTRIB( R8G8B8A8_SSCALED,  4, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8B8_SSCALED,    3, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8_SSCALED,      2, CVT_8_SSCALED )
+FETCH_ATTRIB( R8_SSCALED,        1, CVT_8_SSCALED )
+
+FETCH_ATTRIB( R8G8B8A8_UNORM,  4, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8B8_UNORM,    3, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8_UNORM,      2, CVT_8_UNORM )
+FETCH_ATTRIB( R8_UNORM,        1, CVT_8_UNORM )
+
+FETCH_ATTRIB( R8G8B8A8_SNORM,  4, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8B8_SNORM,    3, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8_SNORM,      2, CVT_8_SNORM )
+FETCH_ATTRIB( R8_SNORM,        1, CVT_8_SNORM )
+
 FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
-FETCH_ATTRIB( R8G8B8A8_UNORM,       4, CVT_8_UNORM )
+//FETCH_ATTRIB( R8G8B8A8_UNORM,       4, CVT_8_UNORM )
 
 
 static fetch_func get_fetch_func( enum pipe_format format )
 {
    switch (format) {
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      return fetch_R32G32B32A32_FLOAT;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      return fetch_R32G32B32_FLOAT;
-   case PIPE_FORMAT_R32G32_FLOAT:
-      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R64_FLOAT:
+      return fetch_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return fetch_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return fetch_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return fetch_R64G64B64A64_FLOAT;
+
    case PIPE_FORMAT_R32_FLOAT:
       return fetch_R32_FLOAT;
-   case PIPE_FORMAT_R32G32B32A32_SSCALED:
-      return fetch_R32G32B32A32_SSCALED;
-   case PIPE_FORMAT_R32G32B32_SSCALED:
-      return fetch_R32G32B32_SSCALED;
-   case PIPE_FORMAT_R32G32_SSCALED:
-      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return fetch_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return fetch_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return fetch_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return fetch_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return fetch_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return fetch_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return fetch_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return fetch_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return fetch_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return fetch_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return fetch_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return fetch_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return fetch_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return fetch_R32G32B32A32_SNORM;
+
    case PIPE_FORMAT_R32_SSCALED:
       return fetch_R32_SSCALED;
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return fetch_A8R8G8B8_UNORM;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return fetch_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return fetch_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return fetch_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return fetch_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return fetch_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return fetch_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return fetch_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return fetch_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return fetch_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return fetch_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return fetch_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return fetch_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return fetch_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return fetch_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return fetch_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return fetch_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return fetch_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return fetch_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return fetch_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return fetch_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return fetch_R8G8B8_UNORM;
    case PIPE_FORMAT_R8G8B8A8_UNORM:
       return fetch_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return fetch_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return fetch_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return fetch_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return fetch_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return fetch_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return fetch_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return fetch_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return fetch_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return fetch_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return fetch_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return fetch_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return fetch_R8G8B8A8_SSCALED;
+
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return fetch_A8R8G8B8_UNORM;
+
    case 0:
-      return NULL;
+      return NULL;		/* not sure why this is needed */
+
    default:
-      /* Lots of missing cases! */
       assert(0);
       return NULL;
    }
-- 
cgit v1.2.3


From 85d7e7ceeecde86621e3d999c475c1e9d97091f0 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 28 Jan 2008 10:48:22 +0000
Subject: gallium: explicitly cast double to float in vertex fetch

---
 src/mesa/pipe/draw/draw_vertex_fetch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index 0789dc8e8c..af3983b7f0 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -62,7 +62,7 @@ fetch_##NAME(const void *ptr, float *attrib)		\
    }							\
 }
 
-#define CVT_64_FLOAT   ((double *) ptr)[i]
+#define CVT_64_FLOAT   (float) ((double *) ptr)[i]
 #define CVT_32_FLOAT   ((float *) ptr)[i]
 
 #define CVT_8_USCALED  (float) ((unsigned char *) ptr)[i]
-- 
cgit v1.2.3


From f7e64c323fe6a646ee60c55ba2552923a7670c53 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 28 Jan 2008 11:12:11 +0000
Subject: gallium: only call vertex/prim queue flush when there is something to
 flush

---
 src/mesa/pipe/draw/draw_prim.c          |  9 +++++----
 src/mesa/pipe/draw/draw_vertex_fetch.c  | 10 ++++++++++
 src/mesa/pipe/draw/draw_vertex_shader.c |  2 ++
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_prim.c b/src/mesa/pipe/draw/draw_prim.c
index 243381aec0..2a612a1673 100644
--- a/src/mesa/pipe/draw/draw_prim.c
+++ b/src/mesa/pipe/draw/draw_prim.c
@@ -63,8 +63,7 @@ static void draw_prim_queue_flush( struct draw_context *draw )
       fprintf(stdout,"Flushing with %d prims, %d verts\n",
              draw->pq.queue_nr, draw->vs.queue_nr);
 
-   if (draw->pq.queue_nr == 0)
-      return;
+   assert (draw->pq.queue_nr != 0);
 
    /* NOTE: we cannot save draw->pipeline->first in a local var because
     * draw->pipeline->first is often changed by the first call to tri(),
@@ -109,10 +108,12 @@ void draw_do_flush( struct draw_context *draw, unsigned flags )
 
 
    if (flags >= DRAW_FLUSH_SHADER_QUEUE) {
-      draw_vertex_shader_queue_flush(draw);
+      if (draw->vs.queue_nr)
+	 draw_vertex_shader_queue_flush(draw);
 
       if (flags >= DRAW_FLUSH_PRIM_QUEUE) {
-         draw_prim_queue_flush(draw);
+	 if (draw->pq.queue_nr)
+	    draw_prim_queue_flush(draw);
 
 	 if (flags >= DRAW_FLUSH_VERTEX_CACHE) {
             draw_vertex_cache_invalidate(draw);
diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index af3983b7f0..143acdd3b4 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -158,6 +158,14 @@ FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
 
 static fetch_func get_fetch_func( enum pipe_format format )
 {
+#if 0
+   {
+      char tmp[80];
+      pf_sprint_name(tmp, format);
+      _mesa_printf("%s: %s\n", __FUNCTION__, tmp);
+   }
+#endif
+
    switch (format) {
    case PIPE_FORMAT_R64_FLOAT:
       return fetch_R64_FLOAT;
@@ -317,6 +325,8 @@ void draw_update_vertex_fetch( struct draw_context *draw )
 {
    unsigned nr_attrs, i;
 
+//   _mesa_printf("%s\n", __FUNCTION__);
+   
    /* this may happend during context init */
    if (!draw->vertex_shader)
       return;
diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index 3041974b9a..289c35c7ae 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -201,6 +201,8 @@ draw_vertex_shader_queue_flush(struct draw_context *draw)
 {
    unsigned i, j;
 
+   assert(draw->vs.queue_nr != 0);
+
    /* XXX: do this on statechange: 
     */
    draw_update_vertex_fetch( draw );
-- 
cgit v1.2.3


From bb37e7f5917dba3f2ad84ecf0b6c95bf58205faf Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 28 Jan 2008 12:40:29 +0000
Subject: gallium: add a couple of hardwired vertex fetch functions

---
 src/mesa/pipe/draw/draw_private.h       |   9 +-
 src/mesa/pipe/draw/draw_vertex_fetch.c  | 150 +++++++++++++++++++++++++++-----
 src/mesa/pipe/draw/draw_vertex_shader.c |  12 ++-
 3 files changed, 141 insertions(+), 30 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_private.h b/src/mesa/pipe/draw/draw_private.h
index 1e59f5bd8d..21de400676 100644
--- a/src/mesa/pipe/draw/draw_private.h
+++ b/src/mesa/pipe/draw/draw_private.h
@@ -141,6 +141,10 @@ struct draw_vertex_shader {
 /* Internal function for vertex fetch.
  */
 typedef void (*fetch_func)(const void *ptr, float *attrib);
+typedef void (*full_fetch_func)( struct draw_context *draw,
+				 struct tgsi_exec_machine *machine,
+				 const unsigned *elts,
+				 unsigned count );
 
 
@@ -210,6 +214,7 @@ struct draw_context
       unsigned pitch[PIPE_ATTRIB_MAX];
       fetch_func fetch[PIPE_ATTRIB_MAX];
       unsigned nr_attrs;
+      full_fetch_func fetch_func;
    } vertex_fetch;
 
    /* Post-tnl vertex cache:
@@ -287,10 +292,6 @@ extern void draw_vertex_shader_queue_flush_llvm( struct draw_context *draw );
 struct tgsi_exec_machine;
 
 extern void draw_update_vertex_fetch( struct draw_context *draw );
-extern void draw_vertex_fetch( struct draw_context *draw,
-			       struct tgsi_exec_machine *machine,
-			       const unsigned *elts,
-			       unsigned count );
 
 
 #define DRAW_FLUSH_SHADER_QUEUE              0x1 /* sized not to overflow, never raised */
diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index 143acdd3b4..afdf1971d2 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -320,42 +320,101 @@ transpose_4x4( float *out, const float *in )
 }
 
 
-			       
-void draw_update_vertex_fetch( struct draw_context *draw )
+
+static void fetch_xyz_rgb( struct draw_context *draw,
+			   struct tgsi_exec_machine *machine,
+			   const unsigned *elts,
+			   unsigned count )
 {
-   unsigned nr_attrs, i;
+   assert(count <= 4);
 
 //   _mesa_printf("%s\n", __FUNCTION__);
-   
-   /* this may happend during context init */
-   if (!draw->vertex_shader)
-      return;
 
-   nr_attrs = draw->vertex_shader->state->num_inputs;
+   /* Copy two 3-float inputs (xyz, rgb) for all 4 vertex slots, w = 1.
+    * NOTE(review): ignores count; assumes elts[0..3] are valid - confirm. */
 
-   for (i = 0; i < nr_attrs; i++) {
-      unsigned buf = draw->vertex_element[i].vertex_buffer_index;
-      enum pipe_format format  = draw->vertex_element[i].src_format;
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
+
+   for (i = 0; i < 4; i++) {
+      {
+	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+	 float *out = &machine->Inputs[0].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+   }
+}
 
-      draw->vertex_fetch.src_ptr[i] = (const ubyte *) draw->user.vbuffer[buf] + 
-						       draw->vertex_buffer[buf].buffer_offset + 
-						       draw->vertex_element[i].src_offset;
 
-      draw->vertex_fetch.pitch[i] = draw->vertex_buffer[buf].pitch;
-      draw->vertex_fetch.fetch[i] = get_fetch_func( format );
-   }
 
-   draw->vertex_fetch.nr_attrs = nr_attrs;
+
+static void fetch_xyz_rgb_st( struct draw_context *draw,
+			      struct tgsi_exec_machine *machine,
+			      const unsigned *elts,
+			      unsigned count )
+{
+   assert(count <= 4);
+
+   /* Copy xyz, rgb (3 floats) and st (2 floats, z = 0) for all 4 slots,
+    * w = 1.  NOTE(review): ignores count; assumes elts[0..3] valid. */
+
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
+
+   for (i = 0; i < 4; i++) {
+      {
+	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+	 float *out = &machine->Inputs[0].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[2] + elts[i] * pitch[2]);
+	 float *out = &machine->Inputs[2].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = 0.0f;
+ 	 out[12] = 1.0f;
+      }
+   }
 }
 
 
+
+
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
-void draw_vertex_fetch( struct draw_context *draw,
-			struct tgsi_exec_machine *machine,
-			const unsigned *elts,
-			unsigned count )
+static void generic_vertex_fetch( struct draw_context *draw,
+				  struct tgsi_exec_machine *machine,
+				  const unsigned *elts,
+				  unsigned count )
 {
    unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
    unsigned attr;
@@ -402,3 +461,50 @@ void draw_vertex_fetch( struct draw_context *draw,
    }
 }
 
+
+			       
+void draw_update_vertex_fetch( struct draw_context *draw )
+{
+   unsigned nr_attrs, i;
+
+//   _mesa_printf("%s\n", __FUNCTION__);
+   
+   /* this may happen during context init */
+   if (!draw->vertex_shader)
+      return;
+
+   nr_attrs = draw->vertex_shader->state->num_inputs;
+
+   for (i = 0; i < nr_attrs; i++) {
+      unsigned buf = draw->vertex_element[i].vertex_buffer_index;
+      enum pipe_format format  = draw->vertex_element[i].src_format;
+
+      draw->vertex_fetch.src_ptr[i] = (const ubyte *) draw->user.vbuffer[buf] + 
+						       draw->vertex_buffer[buf].buffer_offset + 
+						       draw->vertex_element[i].src_offset;
+
+      draw->vertex_fetch.pitch[i] = draw->vertex_buffer[buf].pitch;
+      draw->vertex_fetch.fetch[i] = get_fetch_func( format );
+   }
+
+   draw->vertex_fetch.nr_attrs = nr_attrs;
+
+   draw->vertex_fetch.fetch_func = generic_vertex_fetch;
+
+   switch (nr_attrs) {
+   case 2:
+      if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT)
+	 draw->vertex_fetch.fetch_func = fetch_xyz_rgb;
+      break;
+   case 3:
+      if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[2].src_format == PIPE_FORMAT_R32G32_FLOAT)
+	 draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
+      break;
+   default:
+      break;
+   }
+
+}
diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index 289c35c7ae..0806e23d6c 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -110,7 +110,7 @@ run_vertex_program(struct draw_context *draw,
    machine->Inputs = ALIGN16_ASSIGN(inputs);
    machine->Outputs = ALIGN16_ASSIGN(outputs);
 
-   draw_vertex_fetch( draw, machine, elts, count );
+   draw->vertex_fetch.fetch_func( draw, machine, elts, count );
 
    /* run shader */
 #if defined(__i386__) || defined(__386__)
@@ -219,14 +219,18 @@ draw_vertex_shader_queue_flush(struct draw_context *draw)
    for (i = 0; i < draw->vs.queue_nr; i += 4) {
       struct vertex_header *dests[4];
       unsigned elts[4];
-      int n;
+      int n = MIN2(4, draw->vs.queue_nr - i);
 
-      for (j = 0; j < 4; j++) {
+      for (j = 0; j < n; j++) {
          elts[j] = draw->vs.queue[i + j].elt;
          dests[j] = draw->vs.queue[i + j].dest;
       }
 
-      n = MIN2(4, draw->vs.queue_nr - i);
+      for ( ; j < 4; j++) {
+	 elts[j] = elts[0];
+	 dests[j] = dests[0];
+      }
+
       assert(n > 0);
       assert(n <= 4);
 
-- 
cgit v1.2.3


From af2ccd4c0c58e6565c2c6c6f9464db2cf4e0baab Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 28 Jan 2008 15:50:02 +0000
Subject: gallium: handle flatshading explicitly in clipper stage

We can do a better job in the clip stage than just relying on the
brute-force approach of copying colors to all incoming vertices applied
in the flatshade stage.

At the very least, it is only necessary to do this in the clipper when a
primitive is actually being clipped.
---
 src/mesa/pipe/draw/draw_clip.c     | 136 +++++++++++++++++++++++++------------
 src/mesa/pipe/draw/draw_validate.c |  10 +--
 2 files changed, 99 insertions(+), 47 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_clip.c b/src/mesa/pipe/draw/draw_clip.c
index 2d410e3244..da20028904 100644
--- a/src/mesa/pipe/draw/draw_clip.c
+++ b/src/mesa/pipe/draw/draw_clip.c
@@ -33,6 +33,8 @@
 
 
 #include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+
 #include "draw_context.h"
 #include "draw_private.h"
 
@@ -54,6 +56,12 @@
 struct clipper {
    struct draw_stage stage;      /**< base class */
 
+   /* Basically duplicate some of the flatshading logic here:
+    */
+   boolean flat;
+   uint num_color_attribs;
+   uint color_attribs[4];  /* front/back primary/secondary colors */
+
    float (*plane)[4];
 };
 
@@ -82,6 +90,17 @@ static void interp_attr( float *fdst,
    fdst[3] = LINTERP( t, fout[3], fin[3] );
 }
 
+static void copy_colors( struct draw_stage *stage,
+			 struct vertex_header *dst,
+			 const struct vertex_header *src )
+{
+   const struct clipper *clipper = clipper_stage(stage);
+   uint i;
+   for (i = 0; i < clipper->num_color_attribs; i++) {
+      const uint attr = clipper->color_attribs[i];
+      COPY_4FV(dst->data[attr], src->data[attr]);
+   }
+}
 
 
@@ -134,27 +153,11 @@ static void interp( const struct clipper *clip,
    }
 }
 
-#if 0   
-static INLINE void do_tri( struct draw_stage *next,
-			   struct prim_header *header )
-{
-   unsigned i;
-   for (i = 0; i < 3; i++) {
-      float *ndc = header->v[i]->data[0];
-      _mesa_printf("ndc %f %f %f\n", ndc[0], ndc[1], ndc[2]);
-      assert(ndc[0] >= -1 && ndc[0] <= 641);
-      assert(ndc[1] >= 30 && ndc[1] <= 481);
-   }
-   _mesa_printf("\n");
-   next->tri(next, header);
-}
-#endif
-
 
 static void emit_poly( struct draw_stage *stage,
 		       struct vertex_header **inlist,
 		       unsigned n,
-                       const struct prim_header *origPrim)
+		       const struct prim_header *origPrim)
 {
    struct prim_header header;
    unsigned i;
@@ -163,16 +166,16 @@ static void emit_poly( struct draw_stage *stage,
    header.det = origPrim->det;
 
    for (i = 2; i < n; i++) {
-      header.v[0] = inlist[0];
-      header.v[1] = inlist[i-1];
-      header.v[2] = inlist[i];
+      header.v[0] = inlist[i-1];
+      header.v[1] = inlist[i];
+      header.v[2] = inlist[0];	/* keep in v[2] for flatshading */
 	
       {
-	 unsigned tmp0 = header.v[0]->edgeflag;
+	 unsigned tmp1 = header.v[1]->edgeflag;
 	 unsigned tmp2 = header.v[2]->edgeflag;
 
-	 if (i != 2)   header.v[0]->edgeflag = 0;
-	 if (i != n-1) header.v[2]->edgeflag = 0;
+	 if (i != n-1) header.v[1]->edgeflag = 0;
+	 if (i != 2)   header.v[2]->edgeflag = 0;
 
          header.edgeflags = ((header.v[0]->edgeflag << 0) | 
                              (header.v[1]->edgeflag << 1) | 
@@ -180,27 +183,13 @@ static void emit_poly( struct draw_stage *stage,
 
 	 stage->next->tri( stage->next, &header );
 
-	 header.v[0]->edgeflag = tmp0;
+	 header.v[1]->edgeflag = tmp1;
 	 header.v[2]->edgeflag = tmp2;
       }
    }
 }
 
 
-#if 0
-static void emit_poly( struct draw_stage *stage )
-{
-   unsigned i;
-
-   for (i = 2; i < n; i++) {
-      header->v[0] = inlist[0];
-      header->v[1] = inlist[i-1];
-      header->v[2] = inlist[i];
-	 
-      stage->next->tri( stage->next, header );
-   }
-}
-#endif
 
 
 /* Clip a triangle against the viewport and user clip planes.
@@ -281,6 +270,18 @@ do_clip_tri( struct draw_stage *stage,
       }
    }
 
+   /* If flat-shading, copy color to new provoking vertex.
+    */
+   if (clipper->flat && inlist[0] != header->v[2]) {
+      if (1) {
+	 inlist[0] = dup_vert(stage, inlist[0], tmpnr++);
+      }
+
+      copy_colors(stage, inlist[0], header->v[2]);
+   }
+
+
+
    /* Emit the polygon as triangles to the setup stage:
     */
    if (n >= 3)
@@ -328,6 +329,10 @@ do_clip_line( struct draw_stage *stage,
 
    if (v0->clipmask) {
       interp( clipper, stage->tmp[0], t0, v0, v1 );
+
+      if (clipper->flat)
+	 copy_colors(stage, stage->tmp[0], v0);
+
       newprim.v[0] = stage->tmp[0];
    }
    else {
@@ -393,8 +398,55 @@ clip_tri( struct draw_stage *stage,
    }
 }
 
-static void clip_flush( struct draw_stage *stage, unsigned flags )
+/* Update state.  Could further delay this until we hit the first
+ * primitive that really requires clipping.
+ */
+static void 
+clip_init_state( struct draw_stage *stage )
+{
+   struct clipper *clipper = clipper_stage( stage );
+
+   clipper->flat = stage->draw->rasterizer->flatshade;
+
+   if (clipper->flat) {
+      const struct pipe_shader_state *vs = stage->draw->vertex_shader->state;
+      uint i;
+
+      clipper->num_color_attribs = 0;
+      for (i = 0; i < vs->num_outputs; i++) {
+	 if (vs->output_semantic_name[i] == TGSI_SEMANTIC_COLOR ||
+	     vs->output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) {
+	    clipper->color_attribs[clipper->num_color_attribs++] = i;
+	 }
+      }
+   }
+   
+   stage->tri = clip_tri;
+   stage->line = clip_line;
+}
+
+
+
+static void clip_first_tri( struct draw_stage *stage,
+			    struct prim_header *header )
+{
+   clip_init_state( stage );
+   stage->tri( stage, header );
+}
+
+static void clip_first_line( struct draw_stage *stage,
+			     struct prim_header *header )
+{
+   clip_init_state( stage );
+   stage->line( stage, header );
+}
+
+
+static void clip_flush( struct draw_stage *stage, 
+			     unsigned flags )
 {
+   stage->tri = clip_first_tri;
+   stage->line = clip_first_line;
    stage->next->flush( stage->next, flags );
 }
 
@@ -420,12 +472,12 @@ struct draw_stage *draw_clip_stage( struct draw_context *draw )
 {
    struct clipper *clipper = CALLOC_STRUCT(clipper);
 
-   draw_alloc_tmps( &clipper->stage, MAX_CLIPPED_VERTICES );
+   draw_alloc_tmps( &clipper->stage, MAX_CLIPPED_VERTICES+1 );
 
    clipper->stage.draw = draw;
    clipper->stage.point = clip_point;
-   clipper->stage.line = clip_line;
-   clipper->stage.tri = clip_tri;
+   clipper->stage.line = clip_first_line;
+   clipper->stage.tri = clip_first_tri;
    clipper->stage.flush = clip_flush;
    clipper->stage.reset_stipple_counter = clip_reset_stipple_counter;
    clipper->stage.destroy = clip_destroy;
diff --git a/src/mesa/pipe/draw/draw_validate.c b/src/mesa/pipe/draw/draw_validate.c
index 86d5a5f814..4375ebabbc 100644
--- a/src/mesa/pipe/draw/draw_validate.c
+++ b/src/mesa/pipe/draw/draw_validate.c
@@ -78,6 +78,11 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
       precalc_flat = 1;		/* only needed for triangles really */
       need_det = 1;
    }
+
+   if (draw->rasterizer->flatshade && precalc_flat) {
+      draw->pipeline.flatshade->next = next;
+      next = draw->pipeline.flatshade;
+   }
 	 
    if (draw->rasterizer->offset_cw ||
        draw->rasterizer->offset_ccw) {
@@ -110,13 +115,8 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
    {
       draw->pipeline.clip->next = next;
       next = draw->pipeline.clip;
-      precalc_flat = 1;		/* XXX: FIX ME! Only needed for clipped prims */
    }
 
-   if (draw->rasterizer->flatshade && precalc_flat) {
-      draw->pipeline.flatshade->next = next;
-      next = draw->pipeline.flatshade;
-   }
    
    draw->pipeline.first = next;
    return next;
-- 
cgit v1.2.3


From 72b671bd3986cc655fbe5df76349bc0989b1c083 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 09:23:29 -0700
Subject: gallium: check if surface has defined status in
 check_clear_depth_with_quad()

This was part of Keith's patch from Friday.
---
 src/mesa/state_tracker/st_cb_clear.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_cb_clear.c b/src/mesa/state_tracker/st_cb_clear.c
index 758d4a4086..0cd469c156 100644
--- a/src/mesa/state_tracker/st_cb_clear.c
+++ b/src/mesa/state_tracker/st_cb_clear.c
@@ -408,7 +408,9 @@ check_clear_depth_with_quad(GLcontext *ctx, struct gl_renderbuffer *rb)
    const struct st_renderbuffer *strb = st_renderbuffer(rb);
    const GLboolean isDS = is_depth_stencil_format(strb->surface->format);
    return  ctx->Scissor.Enabled
-      || (isDS && ctx->DrawBuffer->Visual.stencilBits > 0);
+      || (isDS && 
+	  strb->surface->status == PIPE_SURFACE_STATUS_DEFINED &&
+	  ctx->DrawBuffer->Visual.stencilBits > 0);
 }
 
 
-- 
cgit v1.2.3


From 5c7c0675a70b32f159e3a972279535554aa7f4d9 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Sun, 27 Jan 2008 12:01:47 -0700
Subject: Cell: generalize the batch buffer code for vertex buffers...

---
 src/mesa/pipe/cell/common.h           |  8 ++--
 src/mesa/pipe/cell/ppu/cell_batch.c   | 84 ++++++++++++++++++++---------------
 src/mesa/pipe/cell/ppu/cell_batch.h   |  3 ++
 src/mesa/pipe/cell/ppu/cell_context.c |  5 ++-
 src/mesa/pipe/cell/ppu/cell_context.h | 10 +++--
 src/mesa/pipe/cell/ppu/cell_spu.c     |  4 +-
 src/mesa/pipe/cell/spu/spu_main.c     | 22 ++++-----
 7 files changed, 79 insertions(+), 57 deletions(-)

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 0b63ed39be..ce9c381907 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -81,8 +81,8 @@
 #define CELL_CMD_STATE_VERTEX_INFO   13
 
 
-#define CELL_NUM_BATCH_BUFFERS 3
-#define CELL_BATCH_BUFFER_SIZE 1024  /**< 16KB would be the max */
+#define CELL_NUM_BUFFERS 4
+#define CELL_BUFFER_SIZE (4*1024)  /**< 16KB would be the max */
 
 #define CELL_BUFFER_STATUS_FREE 10
 #define CELL_BUFFER_STATUS_USED 20
@@ -147,7 +147,9 @@ struct cell_init_info
    unsigned id;
    unsigned num_spus;
    struct cell_command *cmd;
-   ubyte *batch_buffers[CELL_NUM_BATCH_BUFFERS];
+
+   /** Buffers for command batches, vertex/index data */
+   ubyte *buffers[CELL_NUM_BUFFERS];
    uint *buffer_status;  /**< points at cell_context->buffer_status */
 } ALIGN16_ATTRIB;
 
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.c b/src/mesa/pipe/cell/ppu/cell_batch.c
index c894ef8608..178caa74e1 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.c
+++ b/src/mesa/pipe/cell/ppu/cell_batch.c
@@ -31,12 +31,46 @@
 #include "cell_spu.h"
 
 
+
+uint
+cell_get_empty_buffer(struct cell_context *cell)
+{
+   uint buf = 0;
+
+   /* Find a buffer that's marked as free by all SPUs */
+   while (1) {
+      uint spu, num_free = 0;
+
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         if (cell->buffer_status[spu][buf][0] == CELL_BUFFER_STATUS_FREE) {
+            num_free++;
+
+            if (num_free == cell->num_spus) {
+               /* found a free buffer, now mark status as used */
+               for (spu = 0; spu < cell->num_spus; spu++) {
+                  cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
+               }
+               return buf;
+            }
+         }
+         else {
+            break;
+         }
+      }
+
+      /* try next buf */
+      buf = (buf + 1) % CELL_NUM_BUFFERS;
+   }
+}
+
+
+
 void
 cell_batch_flush(struct cell_context *cell)
 {
    static boolean flushing = FALSE;
    uint batch = cell->cur_batch;
-   const uint size = cell->batch_buffer_size[batch];
+   const uint size = cell->buffer_size[batch];
    uint spu, cmd_word;
 
    assert(!flushing);
@@ -46,7 +80,7 @@ cell_batch_flush(struct cell_context *cell)
 
    flushing = TRUE;
 
-   assert(batch < CELL_NUM_BATCH_BUFFERS);
+   assert(batch < CELL_NUM_BUFFERS);
 
    /*
    printf("cell_batch_dispatch: buf %u at %p, size %u\n",
@@ -68,28 +102,9 @@ cell_batch_flush(struct cell_context *cell)
     * array indicating that the PPU can re-use the buffer.
     */
 
+   batch = cell_get_empty_buffer(cell);
 
-   /* Find a buffer that's marked as free by all SPUs */
-   while (1) {
-      uint num_free = 0;
-
-      batch = (batch + 1) % CELL_NUM_BATCH_BUFFERS;
-
-      for (spu = 0; spu < cell->num_spus; spu++) {
-         if (cell->buffer_status[spu][batch][0] == CELL_BUFFER_STATUS_FREE)
-            num_free++;
-      }
-
-      if (num_free == cell->num_spus) {
-         /* found a free buffer, now mark status as used */
-         for (spu = 0; spu < cell->num_spus; spu++) {
-            cell->buffer_status[spu][batch][0] = CELL_BUFFER_STATUS_USED;
-         }
-         break;
-      }
-   }
-
-   cell->batch_buffer_size[batch] = 0;  /* empty */
+   cell->buffer_size[batch] = 0;  /* empty */
    cell->cur_batch = batch;
 
    flushing = FALSE;
@@ -99,8 +114,7 @@ cell_batch_flush(struct cell_context *cell)
 uint
 cell_batch_free_space(const struct cell_context *cell)
 {
-   uint free = CELL_BATCH_BUFFER_SIZE
-      - cell->batch_buffer_size[cell->cur_batch];
+   uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch];
    return free;
 }
 
@@ -117,18 +131,18 @@ cell_batch_append(struct cell_context *cell, const void *cmd, uint length)
    assert(length % 4 == 0);
    assert(cell->cur_batch >= 0);
 
-   size = cell->batch_buffer_size[cell->cur_batch];
+   size = cell->buffer_size[cell->cur_batch];
 
-   if (size + length > CELL_BATCH_BUFFER_SIZE) {
+   if (size + length > CELL_BUFFER_SIZE) {
       cell_batch_flush(cell);
       size = 0;
    }
 
-   assert(size + length <= CELL_BATCH_BUFFER_SIZE);
+   assert(size + length <= CELL_BUFFER_SIZE);
 
-   memcpy(cell->batch_buffer[cell->cur_batch] + size, cmd, length);
+   memcpy(cell->buffer[cell->cur_batch] + size, cmd, length);
 
-   cell->batch_buffer_size[cell->cur_batch] = size + length;
+   cell->buffer_size[cell->cur_batch] = size + length;
 }
 
 
@@ -142,18 +156,18 @@ cell_batch_alloc(struct cell_context *cell, uint bytes)
 
    assert(cell->cur_batch >= 0);
 
-   size = cell->batch_buffer_size[cell->cur_batch];
+   size = cell->buffer_size[cell->cur_batch];
 
-   if (size + bytes > CELL_BATCH_BUFFER_SIZE) {
+   if (size + bytes > CELL_BUFFER_SIZE) {
       cell_batch_flush(cell);
       size = 0;
    }
 
-   assert(size + bytes <= CELL_BATCH_BUFFER_SIZE);
+   assert(size + bytes <= CELL_BUFFER_SIZE);
 
-   pos = (void *) (cell->batch_buffer[cell->cur_batch] + size);
+   pos = (void *) (cell->buffer[cell->cur_batch] + size);
 
-   cell->batch_buffer_size[cell->cur_batch] = size + bytes;
+   cell->buffer_size[cell->cur_batch] = size + bytes;
 
    return pos;
 }
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.h b/src/mesa/pipe/cell/ppu/cell_batch.h
index c4ba7feb3d..b4c96f465a 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.h
+++ b/src/mesa/pipe/cell/ppu/cell_batch.h
@@ -35,6 +35,9 @@
 struct cell_context;
 
 
+extern uint
+cell_get_empty_buffer(struct cell_context *cell);
+
 extern void
 cell_batch_flush(struct cell_context *cell);
 
diff --git a/src/mesa/pipe/cell/ppu/cell_context.c b/src/mesa/pipe/cell/ppu/cell_context.c
index 8cb0c48f40..e8020a49bc 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.c
+++ b/src/mesa/pipe/cell/ppu/cell_context.c
@@ -254,8 +254,9 @@ cell_create_context(struct pipe_winsys *winsys, struct cell_winsys *cws)
 
    cell_start_spus(cell);
 
-   for (buf = 0; buf < CELL_NUM_BATCH_BUFFERS; buf++) {
-      cell->batch_buffer_size[buf] = 0;
+   /* init command, vertex/index buffer info */
+   for (buf = 0; buf < CELL_NUM_BUFFERS; buf++) {
+      cell->buffer_size[buf] = 0;
 
       /* init batch buffer status values,
        * mark 0th buffer as used, rest as free.
diff --git a/src/mesa/pipe/cell/ppu/cell_context.h b/src/mesa/pipe/cell/ppu/cell_context.h
index 3bd88bfd5b..de65fb5e9a 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.h
+++ b/src/mesa/pipe/cell/ppu/cell_context.h
@@ -102,12 +102,14 @@ struct cell_context
 
    uint num_spus;
 
-   uint batch_buffer_size[CELL_NUM_BATCH_BUFFERS];
-   ubyte batch_buffer[CELL_NUM_BATCH_BUFFERS][CELL_BATCH_BUFFER_SIZE] ALIGN16_ATTRIB;
-   int cur_batch;  /**< which batch buffer is being filled */
+   /** Buffers for command batches, vertex/index data */
+   uint buffer_size[CELL_NUM_BUFFERS];
+   ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+
+   int cur_batch;  /**< which buffer is being filled w/ commands */
 
    /** [4] to ensure 16-byte alignment for each status word */
-   uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BATCH_BUFFERS][4] ALIGN16_ATTRIB;
+   uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
 
 };
 
diff --git a/src/mesa/pipe/cell/ppu/cell_spu.c b/src/mesa/pipe/cell/ppu/cell_spu.c
index 4627bc8d1f..7c83a47e57 100644
--- a/src/mesa/pipe/cell/ppu/cell_spu.c
+++ b/src/mesa/pipe/cell/ppu/cell_spu.c
@@ -111,8 +111,8 @@ cell_start_spus(struct cell_context *cell)
       cell_global.inits[i].id = i;
       cell_global.inits[i].num_spus = cell->num_spus;
       cell_global.inits[i].cmd = &cell_global.command[i];
-      for (j = 0; j < CELL_NUM_BATCH_BUFFERS; j++) {
-         cell_global.inits[i].batch_buffers[j] = cell->batch_buffer[j];
+      for (j = 0; j < CELL_NUM_BUFFERS; j++) {
+         cell_global.inits[i].buffers[j] = cell->buffer[j];
       }
       cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0];
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 0c83900a18..2097683b82 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -473,22 +473,22 @@ cmd_finish(void)
 
 
 /**
- * Tell the PPU that this SPU has finished copying a batch buffer to
+ * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
  * This is done by writting a 16-byte batch-buffer-status block back into
- * main memory (in cell_contex->buffer_status[]).
+ * main memory (in cell_context->buffer_status[]).
  */
 static void
-release_batch_buffer(uint buffer)
+release_buffer(uint buffer)
 {
    /* Evidently, using less than a 16-byte status doesn't work reliably */
    static const uint status[4] ALIGN16_ATTRIB
       = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
 
-   const uint index = 4 * (spu.init.id * CELL_NUM_BATCH_BUFFERS + buffer);
+   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
    uint *dst = spu.init.buffer_status + index;
 
-   ASSERT(buffer < CELL_NUM_BATCH_BUFFERS);
+   ASSERT(buffer < CELL_NUM_BUFFERS);
 
    /*
    printf("SPU %u: Set batch status buf=%u, index %u, at %p to FREE\n",
@@ -513,24 +513,24 @@ cmd_batch(uint opcode)
 {
    const uint buf = (opcode >> 8) & 0xff;
    uint size = (opcode >> 16);
-   uint buffer[CELL_BATCH_BUFFER_SIZE / 4] ALIGN16_ATTRIB;
+   uint buffer[CELL_BUFFER_SIZE / 4] ALIGN16_ATTRIB;
    const uint usize = size / sizeof(uint);
    uint pos;
 
    if (Debug)
       printf("SPU %u: BATCH buffer %u, len %u, from %p\n",
-             spu.init.id, buf, size, spu.init.batch_buffers[buf]);
+             spu.init.id, buf, size, spu.init.buffers[buf]);
 
    ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
 
-   ASSERT_ALIGN16(spu.init.batch_buffers[buf]);
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
 
    size = ROUNDUP16(size);
 
-   ASSERT_ALIGN16(spu.init.batch_buffers[buf]);
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
 
    mfc_get(buffer,  /* dest */
-           (unsigned int) spu.init.batch_buffers[buf],  /* src */
+           (unsigned int) spu.init.buffers[buf],  /* src */
            size,
            TAG_BATCH_BUFFER,
            0, /* tid */
@@ -538,7 +538,7 @@ cmd_batch(uint opcode)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    /* Tell PPU we're done copying the buffer to local store */
-   release_batch_buffer(buf);
+   release_buffer(buf);
 
    for (pos = 0; pos < usize; /* no incr */) {
       switch (buffer[pos]) {
-- 
cgit v1.2.3


From 87c8f9c5834b7345615257d0faf5200f191e8eca Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 09:57:13 -0700
Subject: Cell: additional debug code, misc clean-up

---
 src/mesa/pipe/cell/ppu/cell_batch.c | 52 +++++++++++++++++++++++++++++--------
 src/mesa/pipe/cell/ppu/cell_batch.h |  2 +-
 2 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/src/mesa/pipe/cell/ppu/cell_batch.c b/src/mesa/pipe/cell/ppu/cell_batch.c
index 178caa74e1..2d032fc902 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.c
+++ b/src/mesa/pipe/cell/ppu/cell_batch.c
@@ -35,7 +35,7 @@
 uint
 cell_get_empty_buffer(struct cell_context *cell)
 {
-   uint buf = 0;
+   uint buf = 0, tries = 0;
 
    /* Find a buffer that's marked as free by all SPUs */
    while (1) {
@@ -50,6 +50,9 @@ cell_get_empty_buffer(struct cell_context *cell)
                for (spu = 0; spu < cell->num_spus; spu++) {
                   cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
                }
+               /*
+               printf("PPU: ALLOC BUFFER %u\n", buf);
+               */
                return buf;
             }
          }
@@ -60,11 +63,17 @@ cell_get_empty_buffer(struct cell_context *cell)
 
       /* try next buf */
       buf = (buf + 1) % CELL_NUM_BUFFERS;
+
+      tries++;
+      if (tries == 100) {
+         /*
+         printf("PPU WAITING for buffer...\n");
+         */
+      }
    }
 }
 
 
-
 void
 cell_batch_flush(struct cell_context *cell)
 {
@@ -120,29 +129,39 @@ cell_batch_free_space(const struct cell_context *cell)
 
 
 /**
- * \param cmd  command to append
- * \param length  command size in bytes
+ * Append data to current batch.
  */
 void
-cell_batch_append(struct cell_context *cell, const void *cmd, uint length)
+cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
 {
    uint size;
 
-   assert(length % 4 == 0);
-   assert(cell->cur_batch >= 0);
+   ASSERT(bytes % 4 == 0);
+   ASSERT(bytes <= CELL_BUFFER_SIZE);
+   ASSERT(cell->cur_batch >= 0);
+
+#ifdef ASSERT
+   {
+      uint spu;
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         ASSERT(cell->buffer_status[spu][cell->cur_batch][0]
+                 == CELL_BUFFER_STATUS_USED);
+      }
+   }
+#endif
 
    size = cell->buffer_size[cell->cur_batch];
 
-   if (size + length > CELL_BUFFER_SIZE) {
+   if (size + bytes > CELL_BUFFER_SIZE) {
       cell_batch_flush(cell);
       size = 0;
    }
 
-   assert(size + length <= CELL_BUFFER_SIZE);
+   assert(size + bytes <= CELL_BUFFER_SIZE);
 
-   memcpy(cell->buffer[cell->cur_batch] + size, cmd, length);
+   memcpy(cell->buffer[cell->cur_batch] + size, data, bytes);
 
-   cell->buffer_size[cell->cur_batch] = size + length;
+   cell->buffer_size[cell->cur_batch] = size + bytes;
 }
 
 
@@ -153,9 +172,20 @@ cell_batch_alloc(struct cell_context *cell, uint bytes)
    uint size;
 
    ASSERT(bytes % 4 == 0);
+   ASSERT(bytes <= CELL_BUFFER_SIZE);
 
    assert(cell->cur_batch >= 0);
 
+#ifdef ASSERT
+   {
+      uint spu;
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         ASSERT(cell->buffer_status[spu][cell->cur_batch][0]
+                 == CELL_BUFFER_STATUS_USED);
+      }
+   }
+#endif
+
    size = cell->buffer_size[cell->cur_batch];
 
    if (size + bytes > CELL_BUFFER_SIZE) {
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.h b/src/mesa/pipe/cell/ppu/cell_batch.h
index b4c96f465a..f4f37314a4 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.h
+++ b/src/mesa/pipe/cell/ppu/cell_batch.h
@@ -45,7 +45,7 @@ extern uint
 cell_batch_free_space(const struct cell_context *cell);
 
 extern void
-cell_batch_append(struct cell_context *cell, const void *cmd, uint length);
+cell_batch_append(struct cell_context *cell, const void *data, uint bytes);
 
 extern void *
 cell_batch_alloc(struct cell_context *cell, uint bytes);
-- 
cgit v1.2.3


From 4f0906a18a0067d7e16c4fc7602dfb280e60f420 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 09:57:51 -0700
Subject: Cell: If flushing for swapbuffers, wait for frame completion

---
 src/mesa/pipe/cell/ppu/cell_flush.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/mesa/pipe/cell/ppu/cell_flush.c b/src/mesa/pipe/cell/ppu/cell_flush.c
index b98bb566b1..cf4e676645 100644
--- a/src/mesa/pipe/cell/ppu/cell_flush.c
+++ b/src/mesa/pipe/cell/ppu/cell_flush.c
@@ -39,6 +39,9 @@ cell_flush(struct pipe_context *pipe, unsigned flags)
 {
    struct cell_context *cell = cell_context(pipe);
 
+   if (flags & PIPE_FLUSH_SWAPBUFFERS)
+      flags |= PIPE_FLUSH_WAIT;
+
    draw_flush( cell->draw );
    cell_flush_int(pipe, flags);
 }
-- 
cgit v1.2.3


From 9abbaacea6f340486a3b2bf68fbd4efa0d15a5d3 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 10:00:27 -0700
Subject: Cell: checkpoint commit: always inline prim indexes into batch buffer

Also, explicit release-vertex-buffer command.
Lots of debug/stale code still in place...
---
 src/mesa/pipe/cell/common.h        |  12 ++++
 src/mesa/pipe/cell/ppu/cell_vbuf.c | 113 ++++++++++++++++++++++++++-----------
 src/mesa/pipe/cell/spu/spu_main.c  | 110 +++++++++++++++++++++++++-----------
 src/mesa/pipe/cell/spu/spu_main.h  |   2 +
 4 files changed, 171 insertions(+), 66 deletions(-)

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index ce9c381907..31637ed1cc 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -75,6 +75,7 @@
 #define CELL_CMD_FINISH               3
 #define CELL_CMD_RENDER               4
 #define CELL_CMD_BATCH                5
+#define CELL_CMD_RELEASE_VERTS        6
 #define CELL_CMD_STATE_FRAMEBUFFER   10
 #define CELL_CMD_STATE_DEPTH_STENCIL 11
 #define CELL_CMD_STATE_SAMPLER       12
@@ -124,7 +125,11 @@ struct cell_command_render
    uint vertex_size;  /**< bytes per vertex */
    uint dummy;        /* XXX this dummy field works around a compiler bug */
    uint num_indexes;
+#if 0
    const void *vertex_data;
+#else
+   uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
+#endif
    const ushort *index_data;
    float xmin, ymin, xmax, ymax;
    boolean inline_indexes;
@@ -132,6 +137,13 @@ struct cell_command_render
 } ALIGN16_ATTRIB;
 
 
+struct cell_command_release_verts
+{
+   int opcode;         /**< CELL_CMD_RELEASE_VERTS */
+   uint vertex_buf;    /**< in [0, CELL_NUM_BUFFERS-1] */
+};
+
+
 /** XXX unions don't seem to work */
 struct cell_command
 {
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index ee572b3a51..6e12e16fe0 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -40,8 +40,8 @@
 
 
 /** Allow prim indexes, verts to be inlined after RENDER command */
-#define ALLOW_INLINE_INDEXES 1
-#define ALLOW_INLINE_VERTS 1
+#define ALLOW_INLINE_INDEXES 01
+#define ALLOW_INLINE_VERTS 0
 
 
 /**
@@ -55,6 +55,9 @@ struct cell_vbuf_render
    uint prim;
    uint vertex_size;
    void *vertex_buffer;
+#if 1
+   uint vertex_buf;
+#endif
 };
 
 
@@ -81,13 +84,52 @@ cell_vbuf_allocate_vertices(struct vbuf_render *vbr,
 {
    struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
    /*printf("Alloc verts %u * %u\n", vertex_size, nr_vertices);*/
+#if 0
    assert(!cvbr->vertex_buffer);
    cvbr->vertex_buffer = align_malloc(vertex_size * nr_vertices, 16);
+#else
+   assert(cvbr->vertex_buf == ~0);
+   cvbr->vertex_buf = cell_get_empty_buffer(cvbr->cell);
+   cvbr->vertex_buffer = cvbr->cell->buffer[cvbr->vertex_buf];
+   printf("%s vertex_buf = %u\n", __FUNCTION__, cvbr->vertex_buf);
+#endif
    cvbr->vertex_size = vertex_size;
    return cvbr->vertex_buffer;
 }
 
 
+static void
+cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, 
+                           unsigned vertex_size, unsigned vertices_used)
+{
+   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
+   struct cell_context *cell = cvbr->cell;
+
+   /*printf("Free verts %u * %u\n", vertex_size, vertices_used);*/
+#if 0
+   align_free(vertices);
+#else
+   printf("%s vertex_buf = %u  count = %u\n",
+          __FUNCTION__, cvbr->vertex_buf, vertices_used);
+
+   {
+      struct cell_command_release_verts *release
+         = (struct cell_command_release_verts *)
+         cell_batch_alloc(cell, sizeof(struct cell_command_release_verts));
+      release->opcode = CELL_CMD_RELEASE_VERTS;
+      release->vertex_buf = cvbr->vertex_buf;
+   }
+
+   cvbr->vertex_buf = ~0;
+   cell_flush_int(&cell->pipe, 0x0);/*NEW*/
+#endif
+
+   assert(vertices == cvbr->vertex_buffer);
+   cvbr->vertex_buffer = NULL;
+}
+
+
+
 static void
 cell_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
 {
@@ -124,7 +166,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       printf("%u %u %u, ", indices[i+0], indices[i+1], indices[i+2]);
    }
    printf("\n");
-#elif 0
+#elif 01
    printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u  indexes = [%u %u %u ...]\n",
           nr_indices, nr_vertices,
           indices[0], indices[1], indices[2]);
@@ -157,28 +199,26 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       const uint index_bytes = ROUNDUP4(nr_indices * 2);
       const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size;
 
+      const uint batch_size = sizeof(struct cell_command_render)
+         + index_bytes;
+
       struct cell_command_render *render
          = (struct cell_command_render *)
-         cell_batch_alloc(cell, sizeof(*render));
+         cell_batch_alloc(cell, batch_size);
+
       render->opcode = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
 
       render->num_indexes = nr_indices;
-      if (ALLOW_INLINE_INDEXES &&
-          index_bytes <= cell_batch_free_space(cell)) {
-         /* indices inlined, right after render cmd */
-         void *dst = cell_batch_alloc(cell, index_bytes);
-         memcpy(dst, indices, nr_indices * 2);
-         render->inline_indexes = TRUE;
-         render->index_data = NULL;
-      }
-      else {
-         /* indices in separate buffer */
-         render->inline_indexes = FALSE;
-         render->index_data = indices;
-         ASSERT_ALIGN16(render->index_data);
-      }
 
+      /* append indices after render command */
+      memcpy(render + 1, indices, nr_indices * 2);
+      render->inline_indexes = TRUE;
+      render->index_data = NULL;
+
+      /* if there's room, append vertices after the indices, else leave
+       * vertices in the original/separate buffer.
+       */
       render->vertex_size = 4 * cell->vertex_info.size;
       render->num_verts = nr_vertices;
       if (ALLOW_INLINE_VERTS &&
@@ -188,12 +228,21 @@ cell_vbuf_draw(struct vbuf_render *vbr,
          void *dst = cell_batch_alloc(cell, vertex_bytes);
          memcpy(dst, vertices, vertex_bytes);
          render->inline_verts = TRUE;
+#if 0
          render->vertex_data = NULL;
+#else
+         render->vertex_buf = ~0;
+#endif
       }
       else {
          render->inline_verts = FALSE;
+#if 0
          render->vertex_data = vertices;
          ASSERT_ALIGN16(render->vertex_data);
+#else
+         ASSERT(cvbr->vertex_buf >= 0);
+         render->vertex_buf = cvbr->vertex_buf;
+#endif
       }
 
 
@@ -203,27 +252,13 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       render->ymax = ymax;
    }
 
-#if 01
+#if 0
    /* XXX this is temporary */
    cell_flush_int(&cell->pipe, PIPE_FLUSH_WAIT);
 #endif
 }
 
 
-static void
-cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, 
-                           unsigned vertex_size, unsigned vertices_used)
-{
-   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
-
-   /*printf("Free verts %u * %u\n", vertex_size, vertices_used);*/
-   align_free(vertices);
-
-   assert(vertices == cvbr->vertex_buffer);
-   cvbr->vertex_buffer = NULL;
-}
-
-
 static void
 cell_vbuf_destroy(struct vbuf_render *vbr)
 {
@@ -244,8 +279,17 @@ cell_init_vbuf(struct cell_context *cell)
 
    cell->vbuf_render = CALLOC_STRUCT(cell_vbuf_render);
 
+#if 0
    cell->vbuf_render->base.max_indices = CELL_MAX_VBUF_INDEXES;
    cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_MAX_VBUF_SIZE;
+#else
+   cell->vbuf_render->base.max_indices
+      = (CELL_BUFFER_SIZE
+         - sizeof(struct cell_command_render)
+         - sizeof(struct cell_command_release_verts))
+      / sizeof(ushort);
+   cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_BUFFER_SIZE;
+#endif
 
    cell->vbuf_render->base.get_vertex_info = cell_vbuf_get_vertex_info;
    cell->vbuf_render->base.allocate_vertices = cell_vbuf_allocate_vertices;
@@ -255,6 +299,9 @@ cell_init_vbuf(struct cell_context *cell)
    cell->vbuf_render->base.destroy = cell_vbuf_destroy;
 
    cell->vbuf_render->cell = cell;
+#if 1
+   cell->vbuf_render->vertex_buf = ~0;
+#endif
 
    cell->vbuf = draw_vbuf_stage(cell->draw, &cell->vbuf_render->base);
 }
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 2097683b82..eb979718f8 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -69,6 +69,32 @@ wait_on_mask_all(unsigned tagMask)
 }
 
 
+/**
+ * Tell the PPU that this SPU has finished copying a buffer to
+ * local store and that it may be reused by the PPU.
+ * This is done by writing a 16-byte batch-buffer-status block back into
+ * main memory (in cell_context->buffer_status[]).
+ */
+static void
+release_buffer(uint buffer)
+{
+   /* Evidently, using less than a 16-byte status doesn't work reliably */
+   static const uint status[4] ALIGN16_ATTRIB
+      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
+
+   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
+   uint *dst = spu.init.buffer_status + index;
+
+   ASSERT(buffer < CELL_NUM_BUFFERS);
+
+   mfc_put((void *) &status,    /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           sizeof(status),      /* size */
+           TAG_MISC,            /* tag is unimportant */
+           0, /* tid */
+           0  /* rid */);
+}
+
 
 /**
  * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled
@@ -237,13 +263,18 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       printf("       bound: %g, %g .. %g, %g\n",
              render->xmin, render->ymin, render->xmax, render->ymax);
       */
+      /*
       printf("SPU %u: indices at %p  vertices at %p\n",
              spu.init.id,
              render->index_data, render->vertex_data);
+      */
    }
 
    ASSERT(sizeof(*render) % 4 == 0);
+#if 0
    ASSERT_ALIGN16(render->vertex_data);
+#else
+#endif
    ASSERT_ALIGN16(render->index_data);
 
 
@@ -251,10 +282,18 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
     ** Get vertex, index buffers if not inlined
     **/
    if (!render->inline_verts) {
+      void *src;
       ASSERT(total_vertex_bytes % 16 == 0);
 
+#if 0
+      src = render->vertex_data;
+#else
+      spu.cur_vertex_buf = render->vertex_buf;
+      src = spu.init.buffers[render->vertex_buf];
+#endif
+
       mfc_get(vertex_data,  /* dest */
-              (unsigned int) render->vertex_data,  /* src */
+              (unsigned int) src,
               total_vertex_bytes,  /* size */
               TAG_VERTEX_BUFFER,
               0, /* tid */
@@ -298,6 +337,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          /* vertices are after indexes, if inlined */
          vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
          *pos_incr = *pos_incr + total_vertex_bytes / 4;
+         spu.cur_vertex_buf = ~0;
       }
    }
 
@@ -310,6 +350,12 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       mask |= (1 << TAG_INDEX_BUFFER);
    wait_on_mask_all(mask);
 
+#if 0
+   if (!render->inline_verts) {
+      printf("SPU %u: release vbuf %u\n", spu.init.id, render->vertex_buf);
+      release_buffer(render->vertex_buf);
+   }
+#endif
 
    /**
     ** find tiles which intersect the prim bounding box
@@ -359,6 +405,14 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       for (j = 0; j < render->num_indexes; j += 3) {
          const float *v0, *v1, *v2;
 
+         if (indexes[j] == 0xffff) {
+            printf("index[%u] = 0xffff\n", j);
+         }
+
+         ASSERT(indexes[j] != 0xffff);
+         ASSERT(indexes[j+1] != 0xffff);
+         ASSERT(indexes[j+2] != 0xffff);
+
          v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
@@ -391,6 +445,17 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 }
 
 
+static void
+cmd_release_verts(const struct cell_command_release_verts *release)
+{
+   if (Debug)
+      printf("SPU %u: RELEASE VERTS %u\n",
+             spu.init.id, spu.cur_vertex_buf);
+   ASSERT(spu.cur_vertex_buf == release->vertex_buf);
+   release_buffer(release->vertex_buf);
+}
+
+
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
@@ -472,38 +537,6 @@ cmd_finish(void)
 }
 
 
-/**
- * Tell the PPU that this SPU has finished copying a buffer to
- * local store and that it may be reused by the PPU.
- * This is done by writting a 16-byte batch-buffer-status block back into
- * main memory (in cell_context->buffer_status[]).
- */
-static void
-release_buffer(uint buffer)
-{
-   /* Evidently, using less than a 16-byte status doesn't work reliably */
-   static const uint status[4] ALIGN16_ATTRIB
-      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
-
-   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
-   uint *dst = spu.init.buffer_status + index;
-
-   ASSERT(buffer < CELL_NUM_BUFFERS);
-
-   /*
-   printf("SPU %u: Set batch status buf=%u, index %u, at %p to FREE\n",
-          spu.init.id, buffer, index, dst);
-   */
-
-   mfc_put((void *) &status,    /* src in local memory */
-           (unsigned int) dst,  /* dst in main memory */
-           sizeof(status),      /* size */
-           TAG_MISC,            /* tag is unimportant */
-           0, /* tid */
-           0  /* rid */);
-}
-
-
 /**
  * Execute a batch of commands
  * The opcode param encodes the location of the buffer and its size.
@@ -538,6 +571,8 @@ cmd_batch(uint opcode)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    /* Tell PPU we're done copying the buffer to local store */
+   if (Debug)
+      printf("SPU %u: release batch buf %u\n", spu.init.id, buf);
    release_buffer(buf);
 
    for (pos = 0; pos < usize; /* no incr */) {
@@ -567,6 +602,15 @@ cmd_batch(uint opcode)
             pos += sizeof(*render) / 4 + pos_incr;
          }
          break;
+      case CELL_CMD_RELEASE_VERTS:
+         {
+            struct cell_command_release_verts *release
+               = (struct cell_command_release_verts *) &buffer[pos];
+            cmd_release_verts(release);
+            ASSERT(sizeof(*release) == 8);
+            pos += sizeof(*release) / 4;
+         }
+         break;
       case CELL_CMD_FINISH:
          cmd_finish();
          pos += 1;
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 5bc5d9fa99..68c7263b7f 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -65,6 +65,8 @@ struct spu_global
 
    /* XXX more state to come */
 
+   uint cur_vertex_buf;
+
 } ALIGN16_ATTRIB;
 
 
-- 
cgit v1.2.3


From c4ef36dec0aa5b8cd0293a6b12689bb68ad67ac5 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 10:41:27 -0700
Subject: Cell: clean-up of render path

Finally removed a number of unneeded flush commands.  Vertex buffers are
allocated from the general buffer pool, freed by SPUs when done.
Still an occasional failed assertion (invalid batch buffer command)...
---
 src/mesa/pipe/cell/common.h        |  12 +---
 src/mesa/pipe/cell/ppu/cell_vbuf.c |  60 ++++++--------------
 src/mesa/pipe/cell/spu/spu_main.c  | 112 +++++++------------------------------
 src/mesa/pipe/cell/spu/spu_main.h  |   2 -
 4 files changed, 38 insertions(+), 148 deletions(-)

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 31637ed1cc..d6e1dd4f7d 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -68,7 +68,7 @@
  * The low byte of a mailbox word contains the command opcode.
  * Remaining higher bytes are command specific.
  */
-#define CELL_CMD_OPCODE_MASK 0xf
+#define CELL_CMD_OPCODE_MASK 0xff
 
 #define CELL_CMD_EXIT                 1
 #define CELL_CMD_CLEAR_SURFACE        2
@@ -113,10 +113,6 @@ struct cell_command_clear_surface
 } ALIGN16_ATTRIB;
 
 
-#define CELL_MAX_VBUF_SIZE    (16 * 1024)
-#define CELL_MAX_VBUF_INDEXES 1024
-
-
 struct cell_command_render
 {
    uint opcode;       /**< CELL_CMD_RENDER */
@@ -125,14 +121,8 @@ struct cell_command_render
    uint vertex_size;  /**< bytes per vertex */
    uint dummy;        /* XXX this dummy field works around a compiler bug */
    uint num_indexes;
-#if 0
-   const void *vertex_data;
-#else
    uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
-#endif
-   const ushort *index_data;
    float xmin, ymin, xmax, ymax;
-   boolean inline_indexes;
    boolean inline_verts;
 } ALIGN16_ATTRIB;
 
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index 6e12e16fe0..b2a25d767b 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -39,9 +39,8 @@
 #include "pipe/draw/draw_vbuf.h"
 
 
-/** Allow prim indexes, verts to be inlined after RENDER command */
-#define ALLOW_INLINE_INDEXES 01
-#define ALLOW_INLINE_VERTS 0
+/** Allow vertex data to be inlined after RENDER command */
+#define ALLOW_INLINE_VERTS 1
 
 
 /**
@@ -52,12 +51,10 @@ struct cell_vbuf_render
 {
    struct vbuf_render base;
    struct cell_context *cell;
-   uint prim;
-   uint vertex_size;
-   void *vertex_buffer;
-#if 1
-   uint vertex_buf;
-#endif
+   uint prim;            /**< PIPE_PRIM_x */
+   uint vertex_size;     /**< in bytes */
+   void *vertex_buffer;  /**< just for debug, really */
+   uint vertex_buf;      /**< in [0, CELL_NUM_BUFFERS-1] */
 };
 
 
@@ -84,15 +81,10 @@ cell_vbuf_allocate_vertices(struct vbuf_render *vbr,
 {
    struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
    /*printf("Alloc verts %u * %u\n", vertex_size, nr_vertices);*/
-#if 0
-   assert(!cvbr->vertex_buffer);
-   cvbr->vertex_buffer = align_malloc(vertex_size * nr_vertices, 16);
-#else
+
    assert(cvbr->vertex_buf == ~0);
    cvbr->vertex_buf = cell_get_empty_buffer(cvbr->cell);
    cvbr->vertex_buffer = cvbr->cell->buffer[cvbr->vertex_buf];
-   printf("%s vertex_buf = %u\n", __FUNCTION__, cvbr->vertex_buf);
-#endif
    cvbr->vertex_size = vertex_size;
    return cvbr->vertex_buffer;
 }
@@ -105,14 +97,13 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
    struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
    struct cell_context *cell = cvbr->cell;
 
-   /*printf("Free verts %u * %u\n", vertex_size, vertices_used);*/
-#if 0
-   align_free(vertices);
-#else
+   /*
    printf("%s vertex_buf = %u  count = %u\n",
           __FUNCTION__, cvbr->vertex_buf, vertices_used);
+   */
 
-   {
+   /* Tell SPUs they can release the vert buf */
+   if (cvbr->vertex_buf != ~0U) {
       struct cell_command_release_verts *release
          = (struct cell_command_release_verts *)
          cell_batch_alloc(cell, sizeof(struct cell_command_release_verts));
@@ -121,8 +112,7 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
    }
 
    cvbr->vertex_buf = ~0;
-   cell_flush_int(&cell->pipe, 0x0);/*NEW*/
-#endif
+   cell_flush_int(&cell->pipe, 0x0);
 
    assert(vertices == cvbr->vertex_buffer);
    cvbr->vertex_buffer = NULL;
@@ -166,7 +156,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       printf("%u %u %u, ", indices[i+0], indices[i+1], indices[i+2]);
    }
    printf("\n");
-#elif 01
+#elif 0
    printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u  indexes = [%u %u %u ...]\n",
           nr_indices, nr_vertices,
           indices[0], indices[1], indices[2]);
@@ -213,8 +203,6 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
       /* append indices after render command */
       memcpy(render + 1, indices, nr_indices * 2);
-      render->inline_indexes = TRUE;
-      render->index_data = NULL;
 
       /* if there's room, append vertices after the indices, else leave
        * vertices in the original/separate buffer.
@@ -222,30 +210,20 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       render->vertex_size = 4 * cell->vertex_info.size;
       render->num_verts = nr_vertices;
       if (ALLOW_INLINE_VERTS &&
-         render->inline_indexes &&
           vertex_bytes <= cell_batch_free_space(cell)) {
          /* vertex data inlined, after indices */
          void *dst = cell_batch_alloc(cell, vertex_bytes);
          memcpy(dst, vertices, vertex_bytes);
          render->inline_verts = TRUE;
-#if 0
-         render->vertex_data = NULL;
-#else
          render->vertex_buf = ~0;
-#endif
       }
       else {
+         /* vertex data in separate buffer */
          render->inline_verts = FALSE;
-#if 0
-         render->vertex_data = vertices;
-         ASSERT_ALIGN16(render->vertex_data);
-#else
          ASSERT(cvbr->vertex_buf >= 0);
          render->vertex_buf = cvbr->vertex_buf;
-#endif
       }
 
-
       render->xmin = xmin;
       render->ymin = ymin;
       render->xmax = xmax;
@@ -253,7 +231,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
    }
 
 #if 0
-   /* XXX this is temporary */
+   /* helpful for debug */
    cell_flush_int(&cell->pipe, PIPE_FLUSH_WAIT);
 #endif
 }
@@ -279,17 +257,15 @@ cell_init_vbuf(struct cell_context *cell)
 
    cell->vbuf_render = CALLOC_STRUCT(cell_vbuf_render);
 
-#if 0
-   cell->vbuf_render->base.max_indices = CELL_MAX_VBUF_INDEXES;
-   cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_MAX_VBUF_SIZE;
-#else
+   /* The max number of indexes is what can fit into a batch buffer,
+    * minus the render and release-verts commands.
+    */
    cell->vbuf_render->base.max_indices
       = (CELL_BUFFER_SIZE
          - sizeof(struct cell_command_render)
          - sizeof(struct cell_command_release_verts))
       / sizeof(ushort);
    cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_BUFFER_SIZE;
-#endif
 
    cell->vbuf_render->base.get_vertex_info = cell_vbuf_get_vertex_info;
    cell->vbuf_render->base.allocate_vertices = cell_vbuf_allocate_vertices;
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index eb979718f8..5b50ec6953 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -239,59 +239,45 @@ static void
 cmd_render(const struct cell_command_render *render, uint *pos_incr)
 {
    /* we'll DMA into these buffers */
-   ubyte vertex_data[CELL_MAX_VBUF_SIZE] ALIGN16_ATTRIB;
-   ushort index_data[CELL_MAX_VBUF_INDEXES] ALIGN16_ATTRIB;
+   ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
    const uint vertex_size = render->vertex_size; /* in bytes */
    const uint total_vertex_bytes = render->num_verts * vertex_size;
    const ubyte *vertices;
    const ushort *indexes;
-   uint mask;
    uint i, j;
 
 
    if (Debug) {
       printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
-             "inline_vert=%u  inline_ind=%u\n",
+             "inline_vert=%u\n",
              spu.init.id,
              render->prim_type,
              render->num_verts,
              render->num_indexes,
-             render->inline_verts,
-             render->inline_indexes);
+             render->inline_verts);
 
       /*
       printf("       bound: %g, %g .. %g, %g\n",
              render->xmin, render->ymin, render->xmax, render->ymax);
       */
-      /*
-      printf("SPU %u: indices at %p  vertices at %p\n",
-             spu.init.id,
-             render->index_data, render->vertex_data);
-      */
    }
 
    ASSERT(sizeof(*render) % 4 == 0);
-#if 0
-   ASSERT_ALIGN16(render->vertex_data);
-#else
-#endif
-   ASSERT_ALIGN16(render->index_data);
+   ASSERT(total_vertex_bytes % 16 == 0);
 
+   /* indexes are right after the render command in the batch buffer */
+   indexes = (const ushort *) (render + 1);
+   *pos_incr = (render->num_indexes * 2 + 3) / 4;
 
-   /**
-    ** Get vertex, index buffers if not inlined
-    **/
-   if (!render->inline_verts) {
-      void *src;
-      ASSERT(total_vertex_bytes % 16 == 0);
-
-#if 0
-      src = render->vertex_data;
-#else
-      spu.cur_vertex_buf = render->vertex_buf;
-      src = spu.init.buffers[render->vertex_buf];
-#endif
 
+   if (render->inline_verts) {
+      /* Vertices are right after indexes in batch buffer */
+      vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
+      *pos_incr = *pos_incr + total_vertex_bytes / 4;
+   }
+   else {
+      /* Begin DMA fetch of vertex buffer */
+      void *src = spu.init.buffers[render->vertex_buf];
       mfc_get(vertex_data,  /* dest */
               (unsigned int) src,
               total_vertex_bytes,  /* size */
@@ -300,63 +286,11 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
               0  /* rid */);
 
       vertices = vertex_data;
-   }
-
-   if (!render->inline_indexes) {
-      uint total_index_bytes;
-
-      *pos_incr = 0;
-
-      total_index_bytes = render->num_indexes * sizeof(ushort);
-      if (total_index_bytes < 16)
-         total_index_bytes = 16;
-      else
-         total_index_bytes = ROUNDUP16(total_index_bytes);
 
-      indexes = index_data;
-
-      /* get index data from main memory */
-      mfc_get(index_data,  /* dest */
-              (unsigned int) render->index_data,  /* src */
-              total_index_bytes,
-              TAG_INDEX_BUFFER,
-              0, /* tid */
-              0  /* rid */);
-   }
-
-
-   /**
-    ** Get pointers to inlined indexes, verts, if present
-    **/
-   if (render->inline_indexes) {
-      /* indexes are right after the render command in the batch buffer */
-      indexes = (ushort *) (render + 1);
-      *pos_incr = (render->num_indexes * 2 + 3) / 4;
-
-      if (render->inline_verts) {
-         /* vertices are after indexes, if inlined */
-         vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
-         *pos_incr = *pos_incr + total_vertex_bytes / 4;
-         spu.cur_vertex_buf = ~0;
-      }
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
    }
 
 
-   /* wait for vertex and/or index buffers if not inlined */
-   mask = 0x0;
-   if (!render->inline_verts)
-      mask |= (1 << TAG_VERTEX_BUFFER);
-   if (!render->inline_indexes)
-      mask |= (1 << TAG_INDEX_BUFFER);
-   wait_on_mask_all(mask);
-
-#if 0
-   if (!render->inline_verts) {
-      printf("SPU %u: release vbuf %u\n", spu.init.id, render->vertex_buf);
-      release_buffer(render->vertex_buf);
-   }
-#endif
-
    /**
     ** find tiles which intersect the prim bounding box
     **/
@@ -372,7 +306,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 #endif
 
    /* make sure any pending clears have completed */
-   wait_on_mask(1 << TAG_SURFACE_CLEAR);
+   wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
 
 
    /**
@@ -405,14 +339,6 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       for (j = 0; j < render->num_indexes; j += 3) {
          const float *v0, *v1, *v2;
 
-         if (indexes[j] == 0xffff) {
-            printf("index[%u] = 0xffff\n", j);
-         }
-
-         ASSERT(indexes[j] != 0xffff);
-         ASSERT(indexes[j+1] != 0xffff);
-         ASSERT(indexes[j+2] != 0xffff);
-
          v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
@@ -450,8 +376,8 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 {
    if (Debug)
       printf("SPU %u: RELEASE VERTS %u\n",
-             spu.init.id, spu.cur_vertex_buf);
-   ASSERT(spu.cur_vertex_buf == release->vertex_buf);
+             spu.init.id, release->vertex_buf);
+   ASSERT(release->vertex_buf != ~0U);
    release_buffer(release->vertex_buf);
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 68c7263b7f..5bc5d9fa99 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -65,8 +65,6 @@ struct spu_global
 
    /* XXX more state to come */
 
-   uint cur_vertex_buf;
-
 } ALIGN16_ATTRIB;
 
 
-- 
cgit v1.2.3


From 1cbe803922e1129d1077bcc1eb0640bf9204641d Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 10:41:51 -0700
Subject: Cell: remove unneeded flush(), dead code

---
 src/mesa/pipe/cell/ppu/cell_clear.c | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/src/mesa/pipe/cell/ppu/cell_clear.c b/src/mesa/pipe/cell/ppu/cell_clear.c
index e01640b994..e61bfd9b0f 100644
--- a/src/mesa/pipe/cell/ppu/cell_clear.c
+++ b/src/mesa/pipe/cell/ppu/cell_clear.c
@@ -48,7 +48,6 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
                    unsigned clearValue)
 {
    struct cell_context *cell = cell_context(pipe);
-   /*uint i;*/
    uint surfIndex;
 
    if (!cell->cbuf_map[0])
@@ -61,29 +60,7 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
       surfIndex = 0;
    }
 
-#if 0
-   for (i = 0; i < cell->num_spus; i++) {
-#if 1
-      uint clr = clearValue;
-      if (surfIndex == 0) {
-         /* XXX debug: clear color varied per-SPU to visualize tiles */
-         if ((clr & 0xff) == 0)
-            clr |= 64 + i * 8;
-         if ((clr & 0xff00) == 0)
-            clr |= (64 + i * 8) << 8;
-         if ((clr & 0xff0000) == 0)
-            clr |= (64 + i * 8) << 16;
-         if ((clr & 0xff000000) == 0)
-            clr |= (64 + i * 8) << 24;
-      }
-      cell_global.command[i].clear.value = clr;
-#else
-      cell_global.command[i].clear.value = clearValue;
-#endif
-      cell_global.command[i].clear.surface = surfIndex;
-      send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_CLEAR_SURFACE);
-   }
-#else
+
    {
       struct cell_command_clear_surface *clr
          = (struct cell_command_clear_surface *)
@@ -92,9 +69,4 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
       clr->surface = surfIndex;
       clr->value = clearValue;
    }
-#endif
-
-   /* XXX temporary */
-   cell_flush(&cell->pipe, 0x0);
-
 }
-- 
cgit v1.2.3


From ca85eed771c1fb1c662fb8eea2535601e73c437d Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 11:20:47 -0700
Subject: Cell: make sure state commands aren't split across batches

---
 src/mesa/pipe/cell/ppu/cell_state_emit.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index dbca900c35..6776ec88c7 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -33,6 +33,17 @@
 
 
+static void
+emit_state_cmd(struct cell_context *cell, uint cmd,
+               const void *state, uint state_size)
+{
+   uint *dst = (uint *) cell_batch_alloc(cell, sizeof(uint) + state_size);
+   *dst = cmd;
+   memcpy(dst + 1, state, state_size);
+}
+
+
+
 void
 cell_emit_state(struct cell_context *cell)
 {
@@ -51,22 +62,18 @@ cell_emit_state(struct cell_context *cell)
    }
 
    if (cell->dirty & CELL_NEW_DEPTH_STENCIL) {
-      uint cmd = CELL_CMD_STATE_DEPTH_STENCIL;
-      cell_batch_append(cell, &cmd, 4);
-      cell_batch_append(cell, cell->depth_stencil,
-                        sizeof(struct pipe_depth_stencil_alpha_state));
+      emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL,
+                     cell->depth_stencil,
+                     sizeof(struct pipe_depth_stencil_alpha_state));
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
-      uint cmd = CELL_CMD_STATE_SAMPLER;
-      cell_batch_append(cell, &cmd, 4);
-      cell_batch_append(cell, cell->sampler[0],
-                        sizeof(struct pipe_sampler_state));
+      emit_state_cmd(cell, CELL_CMD_STATE_SAMPLER,
+                     cell->sampler[0], sizeof(struct pipe_sampler_state));
    }
 
    if (cell->dirty & CELL_NEW_VERTEX_INFO) {
-      uint cmd = CELL_CMD_STATE_VERTEX_INFO;
-      cell_batch_append(cell, &cmd, 4);
-      cell_batch_append(cell, &cell->vertex_info, sizeof(struct vertex_info));
+      emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO,
+                     &cell->vertex_info, sizeof(struct vertex_info));
    }
 }
-- 
cgit v1.2.3


From 5b4d14bf1c6363f82660a53ca9505e55696084b0 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 11:21:25 -0700
Subject: Cell: additional assertions

---
 src/mesa/pipe/cell/spu/spu_main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 5b50ec6953..62f6a357ba 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -441,9 +441,12 @@ cmd_state_sampler(const struct pipe_sampler_state *state)
 static void
 cmd_state_vertex_info(const struct vertex_info *vinfo)
 {
-   if (Debug)
+   if (Debug) {
       printf("SPU %u: VERTEX_INFO num_attribs=%u\n", spu.init.id,
              vinfo->num_attribs);
+   }
+   ASSERT(vinfo->num_attribs >= 1);
+   ASSERT(vinfo->num_attribs <= 8);
    memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
 }
 
-- 
cgit v1.2.3


From ab36a9346cb1263fdeac492c0df986ab8cfb38b3 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 12:35:20 -0700
Subject: Added d/D keys to change viewing distance, 'a' to toggle animation

---
 progs/demos/gears.c | 35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/progs/demos/gears.c b/progs/demos/gears.c
index ab9bc00742..7abbd3670b 100644
--- a/progs/demos/gears.c
+++ b/progs/demos/gears.c
@@ -27,6 +27,9 @@ static GLint T0 = 0;
 static GLint Frames = 0;
 static GLint autoexit = 0;
 static GLint win = 0;
+static GLboolean Visible = GL_TRUE;
+static GLboolean Animate = GL_TRUE;
+static GLfloat viewDist = 40.0;
 
 
 /**
@@ -179,6 +182,9 @@ draw(void)
   glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
 
   glPushMatrix();
+
+    glTranslatef(0.0, 0.0, -viewDist);
+
     glRotatef(view_rotx, 1.0, 0.0, 0.0);
     glRotatef(view_roty, 0.0, 1.0, 0.0);
     glRotatef(view_rotz, 0.0, 0.0, 1.0);
@@ -240,6 +246,15 @@ idle(void)
   glutPostRedisplay();
 }
 
+static void
+update_idle_func(void)
+{
+  if (Visible && Animate)
+    glutIdleFunc(idle);
+  else
+    glutIdleFunc(NULL);
+}
+
 /* change view angle, exit upon ESC */
 /* ARGSUSED1 */
 static void
@@ -252,6 +267,16 @@ key(unsigned char k, int x, int y)
   case 'Z':
     view_rotz -= 5.0;
     break;
+  case 'd':
+     viewDist += 1.0;
+     break;
+  case 'D':
+     viewDist -= 1.0;
+     break;
+  case 'a':
+     Animate = !Animate;
+     update_idle_func();
+     break;
   case 27:  /* Escape */
     cleanup();
     exit(0);
@@ -297,8 +322,6 @@ reshape(int width, int height)
   glLoadIdentity();
   glFrustum(-1.0, 1.0, -h, h, 5.0, 60.0);
   glMatrixMode(GL_MODELVIEW);
-  glLoadIdentity();
-  glTranslatef(0.0, 0.0, -40.0);
 }
 
 static void
@@ -351,13 +374,12 @@ init(int argc, char *argv[])
   }
 }
 
+
 static void 
 visible(int vis)
 {
-  if (vis == GLUT_VISIBLE)
-    glutIdleFunc(idle);
-  else
-    glutIdleFunc(NULL);
+   Visible = vis;
+   update_idle_func();
 }
 
 int main(int argc, char *argv[])
@@ -375,6 +397,7 @@ int main(int argc, char *argv[])
   glutKeyboardFunc(key);
   glutSpecialFunc(special);
   glutVisibilityFunc(visible);
+  update_idle_func();
 
   glutMainLoop();
   return 0;             /* ANSI C requires main to return int. */
-- 
cgit v1.2.3


From 0abef84995acd01d7155f4fc851c9c528a1cfa73 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 12:41:47 -0700
Subject: push out far clip plane to 200

---
 progs/demos/gears.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/progs/demos/gears.c b/progs/demos/gears.c
index 7abbd3670b..2a9fefefb5 100644
--- a/progs/demos/gears.c
+++ b/progs/demos/gears.c
@@ -320,7 +320,7 @@ reshape(int width, int height)
   glViewport(0, 0, (GLint) width, (GLint) height);
   glMatrixMode(GL_PROJECTION);
   glLoadIdentity();
-  glFrustum(-1.0, 1.0, -h, h, 5.0, 60.0);
+  glFrustum(-1.0, 1.0, -h, h, 5.0, 200.0);
   glMatrixMode(GL_MODELVIEW);
 }
 
-- 
cgit v1.2.3


From 3c9e26e0fa6d97f50b74745ffad9215f68607fe3 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 28 Jan 2008 20:57:16 +0000
Subject: gallium: fix typos in hardwired fetch path

---
 src/mesa/pipe/draw/draw_vertex_fetch.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index afdf1971d2..89e4c256a7 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -396,7 +396,7 @@ static void fetch_xyz_rgb_st( struct draw_context *draw,
 
       {
 	 const float *in = (const float *)(src[2] + elts[i] * pitch[2]);
-	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 float *out = &machine->Inputs[2].xyzw[0].f[i];
 	 out[0] = in[0];
 	 out[4] = in[1];
 	 out[8] = 0.0f;
@@ -500,7 +500,7 @@ void draw_update_vertex_fetch( struct draw_context *draw )
    case 3:
       if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
 	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
-	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32_FLOAT)
+	  draw->vertex_element[2].src_format == PIPE_FORMAT_R32G32_FLOAT)
 	 draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
       break;
    default:
-- 
cgit v1.2.3


From eb085014035a55e2be5e582dcc1e0bfcae771ba5 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 28 Jan 2008 20:57:58 +0000
Subject: gallium: remove dead vars, code

---
 src/mesa/pipe/draw/draw_vf_sse.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
index b238b542e7..066d6c0b7b 100644
--- a/src/mesa/pipe/draw/draw_vf_sse.c
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -343,8 +343,6 @@ static boolean build_vertex_emit( struct x86_program *p )
    struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
    struct x86_reg vfESI = x86_make_reg(file_REG32, reg_SI);
    struct x86_reg temp = x86_make_reg(file_XMM, 0);
-   struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
-   struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
    uint8_t *fixup, *label;
 
    /* Push a few regs?
@@ -371,7 +369,6 @@ static boolean build_vertex_emit( struct x86_program *p )
    
    /* always load, needed or not:
     */
-   sse_movups(&p->func, p->chan0, x86_make_disp(vfESI, get_offset(vf, &vf->chan_scale[0])));
    sse_movups(&p->func, p->identity, x86_make_disp(vfESI, get_offset(vf, &vf->identity[0])));
 
    /* Note address for loop jump */
-- 
cgit v1.2.3


From 5ec70aa03b59c514cba1fe8dae09118250fb15d6 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 29 Jan 2008 09:34:09 +0900
Subject: gallium: Remove direct dependencies to mesa internals.

_mesa_exec_free is still being called. More invasive refactoring is necessary to clean it out.
---
 src/mesa/pipe/draw/draw_vf.c         | 20 ++++++++------
 src/mesa/pipe/draw/draw_vf.h         | 52 +++++-------------------------------
 src/mesa/pipe/draw/draw_vf_generic.c |  9 ++++---
 src/mesa/pipe/draw/draw_vf_sse.c     | 14 +++++-----
 4 files changed, 31 insertions(+), 64 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index deedfc7bc7..d36f6293b1 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -25,17 +25,20 @@
  *    Keith Whitwell <keithw@tungstengraphics.com>
  */
 
-#include "glheader.h"
-#include "context.h"
-#include "colormac.h"
 
 #include "pipe/p_compiler.h"
+#include "pipe/p_util.h"
 
 #include "draw_vf.h"
 
+
 #define DBG 0
 
 
+/* TODO: remove this */
+extern void 
+_mesa_exec_free( void *addr );
+
 
 static boolean match_fastpath( struct draw_vertex_fetch *vf,
 				 const struct draw_vf_fastpath *fp)
@@ -88,7 +91,7 @@ void draw_vf_register_fastpath( struct draw_vertex_fetch *vf,
    fastpath->match_strides = match_strides;
    fastpath->func = vf->emit;
    fastpath->attr = (struct draw_vf_attr_type *)
-      _mesa_malloc(vf->attr_count * sizeof(fastpath->attr[0]));
+      MALLOC(vf->attr_count * sizeof(fastpath->attr[0]));
 
    for (i = 0; i < vf->attr_count; i++) {
       fastpath->attr[i].format = vf->attr[i].format;
@@ -156,7 +159,7 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
    unsigned offset = 0;
    unsigned i, j;
 
-   assert(nr < DRAW_VF_ATTRIB_MAX);
+   assert(nr < PIPE_ATTRIB_MAX);
 
    memset(vf->lookup, 0, sizeof(vf->lookup));
 
@@ -200,7 +203,7 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 
 
-
+#if 0
 /* Set attribute pointers, adjusted for start position:
  */
 void draw_vf_set_sources( struct draw_vertex_fetch *vf,
@@ -223,6 +226,7 @@ void draw_vf_set_sources( struct draw_vertex_fetch *vf,
       a[j].inputptr = ((uint8_t *)vptr->data) + start * vptr->stride;
    }
 }
+#endif
 
 
 /* Set attribute pointers, adjusted for start position:
@@ -260,7 +264,7 @@ struct draw_vertex_fetch *draw_vf_create( void )
    struct draw_vertex_fetch *vf = CALLOC_STRUCT(draw_vertex_fetch);
    unsigned i;
 
-   for (i = 0; i < DRAW_VF_ATTRIB_MAX; i++)
+   for (i = 0; i < PIPE_ATTRIB_MAX; i++)
       vf->attr[i].vf = vf;
 
    vf->identity[0] = 0.0;
@@ -271,7 +275,7 @@ struct draw_vertex_fetch *draw_vf_create( void )
    vf->codegen_emit = NULL;
 
 #ifdef USE_SSE_ASM
-   if (!_mesa_getenv("MESA_NO_CODEGEN"))
+   if (!GETENV("MESA_NO_CODEGEN"))
       vf->codegen_emit = draw_vf_generate_sse_emit;
 #endif
 
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
index c6a8fe0d53..7d90f35b0f 100644
--- a/src/mesa/pipe/draw/draw_vf.h
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -29,44 +29,11 @@
 #define DRAW_VF_H
 
 
-#include "math/m_vector.h"
-
 #include "pipe/p_compiler.h"
-#include "draw_vertex.h"
+#include "pipe/p_state.h"
 
+#include "draw_vertex.h"
 
-enum {
-   DRAW_VF_ATTRIB_POS = 0,
-   DRAW_VF_ATTRIB_WEIGHT = 1,
-   DRAW_VF_ATTRIB_NORMAL = 2,
-   DRAW_VF_ATTRIB_COLOR0 = 3,
-   DRAW_VF_ATTRIB_COLOR1 = 4,
-   DRAW_VF_ATTRIB_FOG = 5,
-   DRAW_VF_ATTRIB_COLOR_INDEX = 6,
-   DRAW_VF_ATTRIB_EDGEFLAG = 7,
-   DRAW_VF_ATTRIB_TEX0 = 8,
-   DRAW_VF_ATTRIB_TEX1 = 9,
-   DRAW_VF_ATTRIB_TEX2 = 10,
-   DRAW_VF_ATTRIB_TEX3 = 11,
-   DRAW_VF_ATTRIB_TEX4 = 12,
-   DRAW_VF_ATTRIB_TEX5 = 13,
-   DRAW_VF_ATTRIB_TEX6 = 14,
-   DRAW_VF_ATTRIB_TEX7 = 15,
-   DRAW_VF_ATTRIB_VAR0 = 16,
-   DRAW_VF_ATTRIB_VAR1 = 17,
-   DRAW_VF_ATTRIB_VAR2 = 18,
-   DRAW_VF_ATTRIB_VAR3 = 19,
-   DRAW_VF_ATTRIB_VAR4 = 20,
-   DRAW_VF_ATTRIB_VAR5 = 21,
-   DRAW_VF_ATTRIB_VAR6 = 22,
-   DRAW_VF_ATTRIB_VAR7 = 23,
-   DRAW_VF_ATTRIB_POINTSIZE = 24,
-   DRAW_VF_ATTRIB_BFC0 = 25,
-   DRAW_VF_ATTRIB_BFC1 = 26,
-   DRAW_VF_ATTRIB_CLIP_POS = 27,
-   DRAW_VF_ATTRIB_VERTEX_HEADER = 28,
-   DRAW_VF_ATTRIB_MAX = 29
-};
 
 enum draw_vf_attr_format {
    DRAW_EMIT_1F,
@@ -101,10 +68,12 @@ draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
                                unsigned nr, 
                                unsigned vertex_stride );
 
+#if 0
 void 
 draw_vf_set_sources( struct draw_vertex_fetch *vf,
 		     GLvector4f * const attrib[],
-		     unsigned start ); 
+		     unsigned start );
+#endif
 
 void 
 draw_vf_set_data( struct draw_vertex_fetch *vf,
@@ -115,13 +84,6 @@ draw_vf_emit_vertices( struct draw_vertex_fetch *vf,
 		       unsigned count,
 		       void *dest );
 
-void 
-draw_vf_get_attr( struct draw_vertex_fetch *vf,
-		  const void *vertex,
-		  GLenum attr, 
-		  const float *dflt,
-		  float *dest );
-
 struct draw_vertex_fetch *
 draw_vf_create( void );
 
@@ -174,11 +136,11 @@ struct draw_vf_attr
 
 struct draw_vertex_fetch
 {
-   struct draw_vf_attr attr[DRAW_VF_ATTRIB_MAX];
+   struct draw_vf_attr attr[PIPE_ATTRIB_MAX];
    unsigned attr_count;
    unsigned vertex_stride;
 
-   struct draw_vf_attr *lookup[DRAW_VF_ATTRIB_MAX];
+   struct draw_vf_attr *lookup[PIPE_ATTRIB_MAX];
    
    draw_vf_emit_func emit;
 
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
index 343428d26c..a16eb456b7 100644
--- a/src/mesa/pipe/draw/draw_vf_generic.c
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -26,12 +26,13 @@
  *    Keith Whitwell <keithw@tungstengraphics.com>
  */
 
-#include "glheader.h"
-#include "context.h"
-#include "colormac.h"
+
+#include <assert.h>
+
 #include "simple_list.h"
 
 #include "pipe/p_compiler.h"
+#include "pipe/p_util.h"
 
 #include "draw_vf.h"
 
@@ -94,7 +95,7 @@ static INLINE void insert_3f_xyw_4( const struct draw_vf_attr *a, uint8_t *v, co
 static INLINE void insert_3f_xyw_err( const struct draw_vf_attr *a, uint8_t *v, const float *in )
 {
    (void) a; (void) v; (void) in;
-   _mesa_exit(1);
+   assert(0);
 }
 
 static INLINE void insert_3f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
index 066d6c0b7b..4036ded1d8 100644
--- a/src/mesa/pipe/draw/draw_vf_sse.c
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -25,15 +25,14 @@
  *    Keith Whitwell <keithw@tungstengraphics.com>
  */
 
-#include "glheader.h"
-#include "colormac.h"
+
 #include "simple_list.h"
-#include "enums.h"
 
 #include "pipe/p_compiler.h"
 
 #include "draw_vf.h"
 
+
 #if defined(USE_SSE_ASM)
 
 #include "x86/rtasm/x86sse.h"
@@ -450,7 +449,8 @@ static boolean build_vertex_emit( struct x86_program *p )
 	    update_src_ptr(p, srcECX, vfESI, a);
 	 }
 	 else {
-	    _mesa_printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
+	    fprintf(stderr, "Can't emit 1ub %x %x %d\n", 
+	            a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
 	    return FALSE;
 	 }
 	 break;
@@ -495,7 +495,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	    j++;		/* NOTE: two attrs consumed */
 	 }
 	 else {
-	    _mesa_printf("Can't emit 3ub\n");
+	    fprintf(stderr, "Can't emit 3ub\n");
 	 }
 	 return FALSE;	/* add this later */
 	 break;
@@ -528,7 +528,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
       default:
-	 _mesa_printf("unknown a[%d].format %d\n", j, a->format);
+	 fprintf(stderr, "unknown a[%d].format %d\n", j, a->format);
 	 return FALSE;	/* catch any new opcodes */
       }
       
@@ -577,7 +577,7 @@ void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
       return;
    }
 
-   _mesa_memset(&p, 0, sizeof(p));
+   memset(&p, 0, sizeof(p));
 
    p.vf = vf;
    p.inputs_safe = 0;		/* for now */
-- 
cgit v1.2.3


From d5dd52aea826c4b6a417d102ecdeae8c713e81f6 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 29 Jan 2008 09:41:21 +0900
Subject: gallium: Use CALLOC for pb_buffer to ensure that all fields of
 pipe_buffer are initialized.

---
 src/mesa/pipe/pipebuffer/pb_buffer_malloc.c | 3 +--
 src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c     | 2 ++
 src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c   | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c b/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c
index fc83a00f36..2151f1d691 100644
--- a/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c
+++ b/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c
@@ -107,10 +107,9 @@ pb_malloc_buffer_create(size_t size,
 {
    struct malloc_buffer *buf;
    
-   /* TODO: accept an alignment parameter */
    /* TODO: do a single allocation */
    
-   buf = (struct malloc_buffer *)MALLOC(sizeof(struct malloc_buffer));
+   buf = CALLOC_STRUCT(malloc_buffer);
    if(!buf)
       return NULL;
    
diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
index 2694f57bca..a2657dac59 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
@@ -399,6 +399,8 @@ mm_buffer_destroy(struct pb_buffer *buf)
    struct mm_buffer *mm_buf = mm_buffer(buf);
    struct mm_pb_manager *mm = mm_buf->mgr;
    
+   assert(buf->base.refcount == 0);
+   
    _glthread_LOCK_MUTEX(mm->mutex);
    mmFreeMem(mm_buf->block);
    FREE(buf);
diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
index 7c29954112..f80c7e34c0 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
@@ -258,7 +258,7 @@ pool_bufmgr_create(struct pb_manager *provider,
    if(!pool->map)
       goto failure;
 
-   pool->bufs = (struct pool_buffer *) MALLOC(numBufs * sizeof(*pool->bufs));
+   pool->bufs = (struct pool_buffer *)CALLOC(numBufs, sizeof(*pool->bufs));
    if (!pool->bufs)
       goto failure;
 
-- 
cgit v1.2.3


From d6667171dca5999bef0693963634ecda74c32d5a Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 29 Jan 2008 09:42:03 +0900
Subject: gallium: Use GALLIUM_ prefix for env vars.

---
 src/mesa/pipe/draw/draw_vf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index d36f6293b1..4fc2312ad1 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -275,7 +275,7 @@ struct draw_vertex_fetch *draw_vf_create( void )
    vf->codegen_emit = NULL;
 
 #ifdef USE_SSE_ASM
-   if (!GETENV("MESA_NO_CODEGEN"))
+   if (!GETENV("GALLIUM_NO_CODEGEN"))
       vf->codegen_emit = draw_vf_generate_sse_emit;
 #endif
 
-- 
cgit v1.2.3


From deaa895fe241cfeab6f390791d462390ff1d1560 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 12:46:05 -0700
Subject: Cell: re-enable bounding boxes

The geometry bounding box is used to restrict rasterization to just those
tiles that are relevant.
Note another dummy field had to be added to the cell_command_render struct.
Apparently, every 4th word in a struct is susceptible to corruption in some
circumstances.  Might be a compiler bug.
---
 src/mesa/pipe/cell/common.h        |  2 +-
 src/mesa/pipe/cell/ppu/cell_vbuf.c |  4 ++++
 src/mesa/pipe/cell/spu/spu_main.c  | 30 +++++++++++++++++++-----------
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index d6e1dd4f7d..5e32b209e6 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -122,7 +122,7 @@ struct cell_command_render
    uint dummy;        /* XXX this dummy field works around a compiler bug */
    uint num_indexes;
    uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
-   float xmin, ymin, xmax, ymax;
+   float xmin, dummy2, ymin, xmax, ymax;  /* XXX another dummy field */
    boolean inline_verts;
 } ALIGN16_ATTRIB;
 
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index b2a25d767b..9f737287ad 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -180,6 +180,10 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       if (v[1] > ymax)
          ymax = v[1];
    }
+#if 0
+   printf("PPU Bounds %g, %g .. %g, %g\n", xmin, ymin, xmax, ymax);
+   fflush(stdout);
+#endif
 
    if (cvbr->prim != PIPE_PRIM_TRIANGLES)
       return; /* only render tris for now */
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 62f6a357ba..c2b05ed5a2 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -200,7 +200,7 @@ tile_bounding_box(const struct cell_command_render *render,
                   uint *txmin, uint *tymin,
                   uint *box_num_tiles, uint *box_width_tiles)
 {
-#if 1
+#if 0
    /* Debug: full-window bounding box */
    uint txmax = spu.fb.width_tiles - 1;
    uint tymax = spu.fb.height_tiles - 1;
@@ -223,13 +223,24 @@ tile_bounding_box(const struct cell_command_render *render,
    *box_num_tiles = *box_width_tiles * box_height_tiles;
 #endif
 #if 0
-   printf("Render bounds: %g, %g  ...  %g, %g\n",
+   printf("SPU %u: bounds: %g, %g  ...  %g, %g\n", spu.init.id,
           render->xmin, render->ymin, render->xmax, render->ymax);
-   printf("Render tiles:  %u, %u .. %u, %u\n", *txmin, *tymin, txmax, tymax);
+   printf("SPU %u: tiles:  %u, %u .. %u, %u\n",
+           spu.init.id, *txmin, *tymin, txmax, tymax);
+   ASSERT(render->xmin <= render->xmax);
+   ASSERT(render->ymin <= render->ymax);
 #endif
 }
 
 
+/** Check if the tile at (tx,ty) belongs to this SPU */
+static INLINE boolean
+my_tile(uint tx, uint ty)
+{
+   return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id;
+}
+
+
 /**
  * Render primitives
  * \param pos_incr  returns value indicating how may words to skip after
@@ -295,15 +306,9 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
     ** find tiles which intersect the prim bounding box
     **/
    uint txmin, tymin, box_width_tiles, box_num_tiles;
-#if 0
    tile_bounding_box(render, &txmin, &tymin,
                      &box_num_tiles, &box_width_tiles);
-#else
-   txmin = 0;
-   tymin = 0;
-   box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-   box_width_tiles = spu.fb.width_tiles;
-#endif
+
 
    /* make sure any pending clears have completed */
    wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
@@ -312,13 +317,16 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    /**
     ** loop over tiles, rendering tris
     **/
-   for (i = spu.init.id; i < box_num_tiles; i += spu.init.num_spus) {
+   for (i = 0; i < box_num_tiles; i++) {
       const uint tx = txmin + i % box_width_tiles;
       const uint ty = tymin + i / box_width_tiles;
 
       ASSERT(tx < spu.fb.width_tiles);
       ASSERT(ty < spu.fb.height_tiles);
 
+      if (!my_tile(tx, ty))
+         continue;
+
       /* Start fetching color/z tiles.  We'll wait for completion when
        * we need read/write to them later in triangle rasterization.
        */
-- 
cgit v1.2.3


From 7c596b80110da42435f8f0714d6f21b760f11c4f Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 13:02:11 -0700
Subject: Cell: emit state in cell_clear_surface() if dirty.

Without this a program that does nothing but glClear() doesn't work.  We need
the framebuffer state.
---
 src/mesa/pipe/cell/ppu/cell_clear.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/mesa/pipe/cell/ppu/cell_clear.c b/src/mesa/pipe/cell/ppu/cell_clear.c
index e61bfd9b0f..07b908eec5 100644
--- a/src/mesa/pipe/cell/ppu/cell_clear.c
+++ b/src/mesa/pipe/cell/ppu/cell_clear.c
@@ -50,6 +50,10 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
    struct cell_context *cell = cell_context(pipe);
    uint surfIndex;
 
+   if (cell->dirty)
+      cell_update_derived(cell);
+
+
    if (!cell->cbuf_map[0])
       cell->cbuf_map[0] = pipe_surface_map(ps);
 
-- 
cgit v1.2.3


From 2f868411a209d909f3ea8f29a317b7327fe6f88a Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 17:22:12 -0700
Subject: Cell: initial texture cache/sampling code

---
 src/mesa/pipe/cell/spu/spu_texture.c | 139 +++++++++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_texture.h |  43 +++++++++++
 2 files changed, 182 insertions(+)
 create mode 100644 src/mesa/pipe/cell/spu/spu_texture.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_texture.h

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
new file mode 100644
index 0000000000..6d566a5006
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -0,0 +1,139 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "pipe/p_compiler.h"
+#include "spu_main.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+
+
+/**
+ * Number of texture tiles to cache.
+ * Note that this will probably be the largest consumer of SPU local store/
+ * memory for this driver!
+ */
+#define CACHE_SIZE 16
+
+static tile_t tex_tiles[CACHE_SIZE]  ALIGN16_ATTRIB;
+
+static int tex_tile_x[CACHE_SIZE], tex_tile_y[CACHE_SIZE];
+
+
+
+/**
+ * Mark all tex cache entries as invalid.
+ */
+void
+invalidate_tex_cache(void)
+{
+   /* XXX memset? */
+   uint i;
+   for (i = 0; i < CACHE_SIZE; i++)
+      tex_tile_x[i] = tex_tile_y[i] = -1;
+}
+
+
+/**
+ * Return the cache pos/index which corresponds to texel (i,j)
+ */
+static INLINE uint
+cache_pos(uint i, uint j)
+{
+   uint tx = i / TILE_SIZE;
+   uint ty = j / TILE_SIZE;
+   uint pos = (tx + ty * 4) % CACHE_SIZE;
+   return pos;
+}
+
+
+/**
+ * Make sure the tile for texel (i,j) is present, return its position/index
+ * in the cache.
+ */
+static uint
+get_tex_tile(uint i, uint j)
+{
+   const int tx = i / TILE_SIZE;
+   const int ty = j / TILE_SIZE;
+   const uint pos = cache_pos(i, j);
+
+   if (tex_tile_x[pos] != tx || tex_tile_y[pos] != ty) {
+      /* texture cache miss, fetch tile from main memory */
+      const uint tiles_per_row = spu.texture.width / TILE_SIZE;
+      const uint bytes_per_tile = sizeof(tile_t);
+      const void *src = (const ubyte *) spu.texture.start
+         + (ty * tiles_per_row + tx) * bytes_per_tile;
+
+      printf("SPU %u: tex cache miss at %d, %d  pos=%u  old=%d,%d\n",
+             spu.init.id, tx, ty, pos,
+             tex_tile_x[pos], tex_tile_y[pos]);
+#if 0
+      printf("SPU %u: get tex tile from %p to %p\n",
+             spu.init.id, src, tex_tiles[pos].t32);
+#endif
+
+      ASSERT_ALIGN16(tex_tiles[pos].t32);
+      ASSERT_ALIGN16(src);
+
+      mfc_get(tex_tiles[pos].t32,  /* dest */
+              (unsigned int) src,
+              bytes_per_tile,      /* size */
+              TAG_TEXTURE_TILE,
+              0, /* tid */
+              0  /* rid */);
+
+      wait_on_mask(1 << TAG_TEXTURE_TILE);
+
+      tex_tile_x[pos] = tx;
+      tex_tile_y[pos] = ty;
+   }
+   else {
+#if 0
+      printf("SPU %u: tex cache HIT at %d, %d\n",
+             spu.init.id, tx, ty);
+#endif
+   }
+
+   return pos;
+}
+
+
+/**
+ * Get texture sample at texcoord.
+ * XXX this is extremely primitive for now.
+ */
+uint
+sample_texture(const float *texcoord)
+{
+   /* wrap/repeat */
+   uint i = (uint) (texcoord[0] * spu.texture.width) % spu.texture.width;
+   uint j = (uint) (texcoord[1] * spu.texture.height) % spu.texture.height;
+   uint pos = get_tex_tile(i, j);
+   uint texel = tex_tiles[pos].t32[j % TILE_SIZE][i % TILE_SIZE];
+   return texel;
+}
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
new file mode 100644
index 0000000000..b75b7ac44f
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -0,0 +1,43 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_TEXTURE_H
+#define SPU_TEXTURE_H
+
+
+#include "pipe/p_compiler.h"
+
+
+extern void
+invalidate_tex_cache(void);
+
+
+extern uint
+sample_texture(const float *texcoord);
+
+
+#endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From 948dc8ad24d554ab23bea97aa3e405c4f6ad47c6 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 17:23:44 -0700
Subject: Cell: basic texture mapping

Texture images are tiled in PPU code.  SPUs use a texture cache for getting
texels from textures.
This is very rough code, but demos/texcyl.c works.
---
 src/mesa/pipe/cell/common.h                 | 10 +++-
 src/mesa/pipe/cell/ppu/cell_context.h       |  5 +-
 src/mesa/pipe/cell/ppu/cell_state_emit.c    | 12 +++-
 src/mesa/pipe/cell/ppu/cell_state_sampler.c | 10 +++-
 src/mesa/pipe/cell/ppu/cell_texture.c       | 87 +++++++++++++++++++++++++++++
 src/mesa/pipe/cell/ppu/cell_texture.h       |  6 ++
 src/mesa/pipe/cell/spu/Makefile             |  1 +
 src/mesa/pipe/cell/spu/spu_main.c           | 17 ++++++
 src/mesa/pipe/cell/spu/spu_main.h           |  3 +
 src/mesa/pipe/cell/spu/spu_tri.c            | 60 ++++++++++++--------
 10 files changed, 183 insertions(+), 28 deletions(-)

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 5e32b209e6..f0d48ff403 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -79,7 +79,8 @@
 #define CELL_CMD_STATE_FRAMEBUFFER   10
 #define CELL_CMD_STATE_DEPTH_STENCIL 11
 #define CELL_CMD_STATE_SAMPLER       12
-#define CELL_CMD_STATE_VERTEX_INFO   13
+#define CELL_CMD_STATE_TEXTURE       13
+#define CELL_CMD_STATE_VERTEX_INFO   14
 
 
 #define CELL_NUM_BUFFERS 4
@@ -134,6 +135,13 @@ struct cell_command_release_verts
 };
 
 
+struct cell_command_texture
+{
+   void *start;         /**< Address in main memory */
+   uint width, height;
+};
+
+
 /** XXX unions don't seem to work */
 struct cell_command
 {
diff --git a/src/mesa/pipe/cell/ppu/cell_context.h b/src/mesa/pipe/cell/ppu/cell_context.h
index de65fb5e9a..7d234f3e45 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.h
+++ b/src/mesa/pipe/cell/ppu/cell_context.h
@@ -76,7 +76,7 @@ struct cell_context
    struct pipe_framebuffer_state framebuffer;
    struct pipe_poly_stipple poly_stipple;
    struct pipe_scissor_state scissor;
-   struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
+   struct cell_texture *texture[PIPE_MAX_SAMPLERS];
    struct pipe_viewport_state viewport;
    struct pipe_vertex_buffer vertex_buffer[PIPE_ATTRIB_MAX];
    struct pipe_vertex_element vertex_element[PIPE_ATTRIB_MAX];
@@ -84,6 +84,9 @@ struct cell_context
    ubyte *cbuf_map[PIPE_MAX_COLOR_BUFS];
    ubyte *zsbuf_map;
 
+   struct pipe_surface *tex_surf;
+   uint *tex_map;
+
    uint dirty;
 
    /** The primitive drawing context */
diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index 6776ec88c7..391ff454ac 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -30,7 +30,7 @@
 #include "cell_state.h"
 #include "cell_state_emit.h"
 #include "cell_batch.h"
-
+#include "cell_texture.h"
 
 
 static void
@@ -72,6 +72,16 @@ cell_emit_state(struct cell_context *cell)
                      cell->sampler[0], sizeof(struct pipe_sampler_state));
    }
 
+   if (cell->dirty & CELL_NEW_TEXTURE) {
+      struct cell_command_texture texture;
+      texture.start = cell->texture[0]->tiled_data;
+      texture.width = cell->texture[0]->base.width[0];
+      texture.height = cell->texture[0]->base.height[0];
+
+      emit_state_cmd(cell, CELL_CMD_STATE_TEXTURE,
+                     &texture, sizeof(struct cell_command_texture));
+   }
+
    if (cell->dirty & CELL_NEW_VERTEX_INFO) {
       emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO,
                      &cell->vertex_info, sizeof(struct vertex_info));
diff --git a/src/mesa/pipe/cell/ppu/cell_state_sampler.c b/src/mesa/pipe/cell/ppu/cell_state_sampler.c
index ae1eeb4620..317f7603bb 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_sampler.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_sampler.c
@@ -30,12 +30,10 @@
  */
 
 #include "pipe/p_util.h"
+#include "pipe/draw/draw_context.h"
 #include "cell_context.h"
 #include "cell_state.h"
-#if 0
 #include "cell_texture.h"
-#include "cell_tile_cache.h"
-#endif
 
 
 void *
@@ -53,6 +51,8 @@ cell_bind_sampler_state(struct pipe_context *pipe,
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    assert(unit < PIPE_MAX_SAMPLERS);
    cell->sampler[unit] = (struct pipe_sampler_state *)sampler;
 
@@ -76,7 +76,11 @@ cell_set_sampler_texture(struct pipe_context *pipe,
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    cell->texture[sampler] = texture;
 
+   cell_update_texture_mapping(cell);
+
    cell->dirty |= CELL_NEW_TEXTURE;
 }
diff --git a/src/mesa/pipe/cell/ppu/cell_texture.c b/src/mesa/pipe/cell/ppu/cell_texture.c
index 0a8190d983..acbe4c79f0 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.c
+++ b/src/mesa/pipe/cell/ppu/cell_texture.c
@@ -163,3 +163,90 @@ cell_get_tex_surface(struct pipe_context *pipe,
    }
    return ps;
 }
+
+
+
+/* Copy the linear w x h image src into dst as consecutive tile_size x tile_size
+ * tiles (tile rows first); w / tile_size and h / tile_size round down. */
+static void
+tile_copy_data(uint w, uint h, uint tile_size, uint *dst, const uint *src)
+{
+   const uint tile_size2 = tile_size * tile_size;
+   const uint h_t = h / tile_size, w_t = w / tile_size;
+   uint it, jt, i, j;  /* (it, jt): tile counters; (i, j): intra-tile counters */
+
+   for (it = 0; it < h_t; it++) {
+      for (jt = 0; jt < w_t; jt++) {
+         /* fill in tile (it, jt) */
+         uint *tdst = dst + (it * w_t + jt) * tile_size2;
+         for (i = 0; i < tile_size; i++) {
+            const uint srci = it * tile_size + i;   /* source row */
+            for (j = 0; j < tile_size; j++) {
+               const uint srcj = jt * tile_size + j;   /* source column */
+               *tdst++ = src[srci * w + srcj];   /* row stride is w, not h */
+            }
+         }
+      }
+   }
+}
+
+
+
+/**
+ * Convert linear texture image data to tiled format for SPU usage.
+ */
+static void
+cell_tile_texture(struct cell_context *cell,
+                  struct cell_texture *texture)
+{
+   uint face = 0, level = 0, zslice = 0;
+   struct pipe_surface *surf;
+   const uint w = texture->base.width[0], h = texture->base.height[0];
+   const uint *src;
+
+   /* temporary restrictions: */
+   assert(w >= TILE_SIZE);
+   assert(h >= TILE_SIZE);
+   assert(w % TILE_SIZE == 0);
+   assert(h % TILE_SIZE == 0);
+
+   surf = cell_get_tex_surface(&cell->pipe, &texture->base, face, level, zslice);
+   ASSERT(surf);
+
+   src = (const uint *) pipe_surface_map(surf);
+
+   if (texture->tiled_data) {
+      align_free(texture->tiled_data);
+   }
+   texture->tiled_data = align_malloc(w * h * 4, 16);
+
+   tile_copy_data(w, h, TILE_SIZE, texture->tiled_data, src);
+
+   pipe_surface_unmap(surf);
+
+   pipe_surface_reference(&surf, NULL);
+}
+
+
+
+/** Re-tile the texture bound to unit 0 for SPU use; no-op if none is bound. */
+void
+cell_update_texture_mapping(struct cell_context *cell)
+{
+   uint face = 0, level = 0, zslice = 0;
+
+   if (cell->texture[0])   /* unit 0 may be unset when another unit is bound */
+      cell_tile_texture(cell, cell->texture[0]);
+#if 0
+   if (cell->tex_surf && cell->tex_map) {
+      pipe_surface_unmap(cell->tex_surf);
+      cell->tex_map = NULL;
+   }
+   /* XXX free old surface */
+   cell->tex_surf = cell_get_tex_surface(&cell->pipe,
+                                         &cell->texture[0]->base,
+                                         face, level, zslice);
+
+   cell->tex_map = pipe_surface_map(cell->tex_surf);
+#endif
+}
diff --git a/src/mesa/pipe/cell/ppu/cell_texture.h b/src/mesa/pipe/cell/ppu/cell_texture.h
index ef5808c086..bd434c8776 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.h
+++ b/src/mesa/pipe/cell/ppu/cell_texture.h
@@ -46,6 +46,8 @@ struct cell_texture
     */
    struct pipe_buffer *buffer;
    unsigned long buffer_size;
+
+   void *tiled_data;  /* XXX this may be temporary */ /*ALIGN16*/
 };
 
 
@@ -70,4 +72,8 @@ cell_get_tex_surface(struct pipe_context *pipe,
                      unsigned face, unsigned level, unsigned zslice);
 
 
+extern void
+cell_update_texture_mapping(struct cell_context *cell);
+
+
 #endif /* CELL_TEXTURE */
diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index 417ae1b072..011fdcefe3 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -17,6 +17,7 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 SOURCES = \
 	spu_main.c \
+	spu_texture.c \
 	spu_tile.c \
 	spu_tri.c
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index c2b05ed5a2..5a5b17dd89 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -34,6 +34,7 @@
 #include <spu_mfcio.h>
 
 #include "spu_main.h"
+#include "spu_texture.h"
 #include "spu_tri.h"
 #include "spu_tile.h"
 #include "pipe/cell/common.h"
@@ -446,6 +447,17 @@ cmd_state_sampler(const struct pipe_sampler_state *state)
 }
 
 
+static void
+cmd_state_texture(const struct cell_command_texture *texture)
+{
+   if (Debug)
+      printf("SPU %u: TEXTURE at %p  size %u x %u\n",
+             spu.init.id, texture->start, texture->width, texture->height);
+
+   memcpy(&spu.texture, texture, sizeof(*texture));  /* keep a copy of the texture command in the SPU global state */
+}
+
+
 static void
 cmd_state_vertex_info(const struct vertex_info *vinfo)
 {
@@ -561,6 +573,10 @@ cmd_batch(uint opcode)
          cmd_state_sampler((struct pipe_sampler_state *) &buffer[pos+1]);
          pos += (1 + sizeof(struct pipe_sampler_state) / 4);
          break;
+      case CELL_CMD_STATE_TEXTURE:
+         cmd_state_texture((struct cell_command_texture *) &buffer[pos+1]);
+         pos += (1 + sizeof(struct cell_command_texture) / 4);
+         break;
       case CELL_CMD_STATE_VERTEX_INFO:
          cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
          pos += (1 + sizeof(struct vertex_info) / 4);
@@ -656,6 +672,7 @@ one_time_init(void)
 {
    memset(tile_status, TILE_STATUS_DEFINED, sizeof(tile_status));
    memset(tile_status_z, TILE_STATUS_DEFINED, sizeof(tile_status_z));
+   invalidate_tex_cache();
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 5bc5d9fa99..480c54ebd0 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -60,6 +60,7 @@ struct spu_global
    struct pipe_depth_stencil_alpha_state depth_stencil;
    struct pipe_blend_state blend;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
+   struct cell_command_texture texture;
 
    struct vertex_info vertex_info;
 
@@ -84,6 +85,8 @@ extern struct spu_global spu;
 #define TAG_INDEX_BUFFER      16
 #define TAG_BATCH_BUFFER      17
 #define TAG_MISC              18
+#define TAG_TEXTURE_TILE      19
+
 
 
 extern void
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 3d0d106c10..aad28f1036 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -33,6 +33,7 @@
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
 #include "spu_main.h"
+#include "spu_texture.h"
 #include "spu_tile.h"
 #include "spu_tri.h"
 
@@ -362,9 +363,24 @@ emit_quad( struct setup_stage *setup, int x, int y, unsigned mask )
    /* Cell: "write" quad fragments to the tile by setting prim color */
    const int ix = x - setup->cliprect_minx;
    const int iy = y - setup->cliprect_miny;
-   float colors[4][4];
-
-   eval_coeff(setup, 1, (float) x, (float) y, colors);
+   uint colors[4];  /* indexed by QUAD_x */
+
+   if (spu.texture.start) {
+      float texcoords[4][4];
+      uint i;
+      eval_coeff(setup, 2, (float) x, (float) y, texcoords);
+      for (i = 0; i < 4; i++) {
+         colors[i] = sample_texture(texcoords[i]);
+      }
+   }
+   else {
+      float fcolors[4][4];
+      eval_coeff(setup, 1, (float) x, (float) y, fcolors);
+      colors[QUAD_TOP_LEFT] = pack_color(fcolors[QUAD_TOP_LEFT]);
+      colors[QUAD_TOP_RIGHT] = pack_color(fcolors[QUAD_TOP_RIGHT]);
+      colors[QUAD_BOTTOM_LEFT] = pack_color(fcolors[QUAD_BOTTOM_LEFT]);
+      colors[QUAD_BOTTOM_RIGHT] = pack_color(fcolors[QUAD_BOTTOM_RIGHT]);
+   }
 
    if (spu.depth_stencil.depth.enabled) {
       mask &= do_depth_test(setup, x, y, mask);
@@ -382,13 +398,13 @@ emit_quad( struct setup_stage *setup, int x, int y, unsigned mask )
       tile_status[setup->ty][setup->tx] = TILE_STATUS_DIRTY;
 
       if (mask & MASK_TOP_LEFT)
-         ctile.t32[iy][ix] = pack_color(colors[QUAD_TOP_LEFT]);
+         ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
       if (mask & MASK_TOP_RIGHT)
-         ctile.t32[iy][ix+1] = pack_color(colors[QUAD_TOP_RIGHT]);
+         ctile.t32[iy][ix+1] = colors[QUAD_TOP_RIGHT];
       if (mask & MASK_BOTTOM_LEFT)
-         ctile.t32[iy+1][ix] = pack_color(colors[QUAD_BOTTOM_LEFT]);
+         ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (mask & MASK_BOTTOM_RIGHT)
-         ctile.t32[iy+1][ix+1] = pack_color(colors[QUAD_BOTTOM_RIGHT]);
+         ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
    }
 #endif
 }
@@ -606,7 +622,6 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
 }
 
 
-#if 0
 /**
  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
  * The value value comes from vertex->data[slot][i].
@@ -614,21 +629,20 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
  * \param slot  which attribute slot 
  * \param i  which component of the slot (0..3)
  */
-static void const_coeff( struct setup_stage *setup,
-			 unsigned slot,
-			 unsigned i )
+static void const_coeff(struct setup_stage *setup, uint slot)
 {
-   assert(slot < PIPE_MAX_SHADER_INPUTS);
-   assert(i <= 3);
+   uint i;
+   ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 
-   setup->coef[slot].dadx[i] = 0;
-   setup->coef[slot].dady[i] = 0;
+   for (i = 0; i < 4; i++) {
+      setup->coef[slot].dadx[i] = 0;
+      setup->coef[slot].dady[i] = 0;
 
-   /* need provoking vertex info!
-    */
-   setup->coef[slot].a0[i] = setup->vprovoke->data[slot][i];
+      /* need provoking vertex info!
+       */
+      setup->coef[slot].a0[i] = setup->vprovoke->data[slot][i];
+   }
 }
-#endif
 
 
 /**
@@ -735,15 +749,17 @@ static void setup_tri_coefficients( struct setup_stage *setup )
       case INTERP_NONE:
          break;
       case INTERP_POS:
-         tri_linear_coeff(setup, i, 2, 3);  /* slot 0, z */
+         tri_linear_coeff(setup, i, 2, 3);
          /* XXX interp W if PERSPECTIVE... */
          break;
       case INTERP_CONSTANT:
-         /* fall-through */
+         const_coeff(setup, i);
+         break;
       case INTERP_LINEAR:
-         tri_linear_coeff(setup, i, 0, 4);  /* slot 1, color */
+         tri_linear_coeff(setup, i, 0, 4);
          break;
       case INTERP_PERSPECTIVE:
+         tri_linear_coeff(setup, i, 0, 4); /* XXX temporary */
          break;
       default:
          ASSERT(0);
-- 
cgit v1.2.3


From 5f54cfaba16c2ac268472e148f1e788a9d7b2a6a Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 17:32:23 -0700
Subject: Cell: minor optimization for flat shading

---
 src/mesa/pipe/cell/spu/spu_tri.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index aad28f1036..19a231d9c4 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -200,16 +200,35 @@ static INLINE void
 eval_coeff( struct setup_stage *setup, uint slot,
             float x, float y, float result[4][4])
 {
-   uint i;
-   const float *dadx = setup->coef[slot].dadx;
-   const float *dady = setup->coef[slot].dady;
+   switch (spu.vertex_info.interp_mode[slot]) {
+   case INTERP_CONSTANT:
+      {
+         uint i;
+         for (i = 0; i < 4; i++) {
+            result[QUAD_TOP_LEFT][i] =
+            result[QUAD_TOP_RIGHT][i] =
+            result[QUAD_BOTTOM_LEFT][i] =
+            result[QUAD_BOTTOM_RIGHT][i] = setup->coef[slot].a0[i];
+         }
+      }
+      break;
 
-   /* loop over XYZW comps */
-   for (i = 0; i < 4; i++) {
-      result[QUAD_TOP_LEFT][i] = setup->coef[slot].a0[i] + x * dadx[i] + y * dady[i];
-      result[QUAD_TOP_RIGHT][i] = result[0][i] + dadx[i];
-      result[QUAD_BOTTOM_LEFT][i] = result[0][i] + dady[i];
-      result[QUAD_BOTTOM_RIGHT][i] = result[0][i] + dadx[i] + dady[i];
+   case INTERP_LINEAR:
+      /* fall-through, for now */
+   default:
+      {
+         uint i;
+         const float *dadx = setup->coef[slot].dadx;
+         const float *dady = setup->coef[slot].dady;
+
+         /* loop over XYZW comps */
+         for (i = 0; i < 4; i++) {
+            result[QUAD_TOP_LEFT][i] = setup->coef[slot].a0[i] + x * dadx[i] + y * dady[i];
+            result[QUAD_TOP_RIGHT][i] = result[0][i] + dadx[i];
+            result[QUAD_BOTTOM_LEFT][i] = result[0][i] + dady[i];
+            result[QUAD_BOTTOM_RIGHT][i] = result[0][i] + dadx[i] + dady[i];
+         }
+      }
    }
 }
 
-- 
cgit v1.2.3


From 7012dd9b76328b4b1f54404df1948e50f23c1fe3 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 18:03:45 -0700
Subject: Cell: compute min index referenced in draw command, use it to reduce
 size of vertex data payload

---
 src/mesa/pipe/cell/common.h        |  2 ++
 src/mesa/pipe/cell/ppu/cell_vbuf.c | 13 +++++++++++--
 src/mesa/pipe/cell/spu/spu_main.c  | 20 ++++++++++++++++----
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index f0d48ff403..90aa46a534 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -124,6 +124,8 @@ struct cell_command_render
    uint num_indexes;
    uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
    float xmin, dummy2, ymin, xmax, ymax;  /* XXX another dummy field */
+   uint dummy3;
+   uint min_index;
    boolean inline_verts;
 } ALIGN16_ATTRIB;
 
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index 9f737287ad..e63b34cf52 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -138,16 +138,23 @@ cell_vbuf_draw(struct vbuf_render *vbr,
    struct cell_context *cell = cvbr->cell;
    float xmin, ymin, xmax, ymax;
    uint i;
-   uint nr_vertices = 0;
+   uint nr_vertices = 0, min_index = ~0;
    const void *vertices = cvbr->vertex_buffer;
    const uint vertex_size = cvbr->vertex_size;
 
    for (i = 0; i < nr_indices; i++) {
       if (indices[i] > nr_vertices)
          nr_vertices = indices[i];
+      if (indices[i] < min_index)
+         min_index = indices[i];
    }
    nr_vertices++;
 
+#if 0
+   /*if (min_index > 0)*/
+      printf("%s min_index = %u\n", __FUNCTION__, min_index);
+#endif
+
 #if 0
    printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u\n",
           nr_indices, nr_vertices);
@@ -169,7 +176,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
    /* compute x/y bounding box */
    xmin = ymin = 1e50;
    xmax = ymax = -1e50;
-   for (i = 0; i < nr_vertices; i++) {
+   for (i = min_index; i < nr_vertices; i++) {
       const float *v = (float *) ((ubyte *) vertices + i * vertex_size);
       if (v[0] < xmin)
          xmin = v[0];
@@ -204,6 +211,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       render->prim_type = cvbr->prim;
 
       render->num_indexes = nr_indices;
+      render->min_index = min_index;
 
       /* append indices after render command */
       memcpy(render + 1, indices, nr_indices * 2);
@@ -214,6 +222,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       render->vertex_size = 4 * cell->vertex_info.size;
       render->num_verts = nr_vertices;
       if (ALLOW_INLINE_VERTS &&
+          min_index == 0 &&
           vertex_bytes <= cell_batch_free_space(cell)) {
          /* vertex data inlined, after indices */
          void *dst = cell_batch_alloc(cell, vertex_bytes);
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 5a5b17dd89..3c9efb4741 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -253,7 +253,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    /* we'll DMA into these buffers */
    ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
    const uint vertex_size = render->vertex_size; /* in bytes */
-   const uint total_vertex_bytes = render->num_verts * vertex_size;
+   /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
    const ubyte *vertices;
    const ushort *indexes;
    uint i, j;
@@ -289,9 +289,21 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    }
    else {
       /* Begin DMA fetch of vertex buffer */
-      void *src = spu.init.buffers[render->vertex_buf];
-      mfc_get(vertex_data,  /* dest */
-              (unsigned int) src,
+      ubyte *src = spu.init.buffers[render->vertex_buf];
+      ubyte *dest = vertex_data;
+
+      /* skip vertex data we won't use */
+#if 01
+      src += render->min_index * vertex_size;
+      dest += render->min_index * vertex_size;
+      total_vertex_bytes -= render->min_index * vertex_size;
+#endif
+      ASSERT(total_vertex_bytes % 16 == 0);
+      ASSERT_ALIGN16(dest);
+      ASSERT_ALIGN16(src);
+
+      mfc_get(dest,   /* in vertex_data[] array */
+              (unsigned int) src,  /* src in main memory */
               total_vertex_bytes,  /* size */
               TAG_VERTEX_BUFFER,
               0, /* tid */
-- 
cgit v1.2.3


From c474e0d6ed7b654ef750d088df2b26d8215f20ec Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 18:09:16 -0700
Subject: Cell: add a few null texture tests

---
 src/mesa/pipe/cell/ppu/cell_state_emit.c | 13 ++++++++++---
 src/mesa/pipe/cell/ppu/cell_texture.c    |  3 ++-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index 391ff454ac..702184416b 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -74,9 +74,16 @@ cell_emit_state(struct cell_context *cell)
 
    if (cell->dirty & CELL_NEW_TEXTURE) {
       struct cell_command_texture texture;
-      texture.start = cell->texture[0]->tiled_data;
-      texture.width = cell->texture[0]->base.width[0];
-      texture.height = cell->texture[0]->base.height[0];
+      if (cell->texture[0]) {
+         texture.start = cell->texture[0]->tiled_data;
+         texture.width = cell->texture[0]->base.width[0];
+         texture.height = cell->texture[0]->base.height[0];
+      }
+      else {
+         texture.start = NULL;
+         texture.width = 0;
+         texture.height = 0;
+      }
 
       emit_state_cmd(cell, CELL_CMD_STATE_TEXTURE,
                      &texture, sizeof(struct cell_command_texture));
diff --git a/src/mesa/pipe/cell/ppu/cell_texture.c b/src/mesa/pipe/cell/ppu/cell_texture.c
index acbe4c79f0..2cf6022939 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.c
+++ b/src/mesa/pipe/cell/ppu/cell_texture.c
@@ -234,7 +234,8 @@ cell_update_texture_mapping(struct cell_context *cell)
 {
    uint face = 0, level = 0, zslice = 0;
 
-   cell_tile_texture(cell, cell->texture[0]);
+   if (cell->texture[0])
+      cell_tile_texture(cell, cell->texture[0]);
 #if 0
    if (cell->tex_surf && cell->tex_map) {
       pipe_surface_unmap(cell->tex_surf);
-- 
cgit v1.2.3


From e308dc4465fb3869a8423ed9608da35b426ef993 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 18:17:30 -0700
Subject: Cell: move cmd_render() into new spu_render.c file

---
 src/mesa/pipe/cell/spu/Makefile     |   1 +
 src/mesa/pipe/cell/spu/spu_main.c   | 206 +------------------------------
 src/mesa/pipe/cell/spu/spu_main.h   |   1 +
 src/mesa/pipe/cell/spu/spu_render.c | 240 ++++++++++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_render.h |  38 ++++++
 5 files changed, 283 insertions(+), 203 deletions(-)
 create mode 100644 src/mesa/pipe/cell/spu/spu_render.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_render.h

diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index 011fdcefe3..d5b30e1f27 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -17,6 +17,7 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 SOURCES = \
 	spu_main.c \
+	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
 	spu_tri.c
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 3c9efb4741..6e02f2c964 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -34,8 +34,8 @@
 #include <spu_mfcio.h>
 
 #include "spu_main.h"
+#include "spu_render.h"
 #include "spu_texture.h"
-#include "spu_tri.h"
 #include "spu_tile.h"
 #include "pipe/cell/common.h"
 #include "pipe/p_defines.h"
@@ -47,7 +47,7 @@ helpful headers:
 /opt/ibm/cell-sdk/prototype/sysroot/usr/include/libmisc.h
 */
 
-static boolean Debug = FALSE;
+boolean Debug = FALSE;
 
 struct spu_global spu;
 
@@ -61,7 +61,7 @@ wait_on_mask(unsigned tagMask)
 }
 
 
-static void
+static INLINE void
 wait_on_mask_all(unsigned tagMask)
 {
    mfc_write_tag_mask( tagMask );
@@ -192,206 +192,6 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 }
 
 
-/**
- * Given a rendering command's bounding box (in pixels) compute the
- * location of the corresponding screen tile bounding box.
- */
-static INLINE void
-tile_bounding_box(const struct cell_command_render *render,
-                  uint *txmin, uint *tymin,
-                  uint *box_num_tiles, uint *box_width_tiles)
-{
-#if 0
-   /* Debug: full-window bounding box */
-   uint txmax = spu.fb.width_tiles - 1;
-   uint tymax = spu.fb.height_tiles - 1;
-   *txmin = 0;
-   *tymin = 0;
-   *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-   *box_width_tiles = spu.fb.width_tiles;
-   (void) render;
-   (void) txmax;
-   (void) tymax;
-#else
-   uint txmax, tymax, box_height_tiles;
-
-   *txmin = (uint) render->xmin / TILE_SIZE;
-   *tymin = (uint) render->ymin / TILE_SIZE;
-   txmax = (uint) render->xmax / TILE_SIZE;
-   tymax = (uint) render->ymax / TILE_SIZE;
-   *box_width_tiles = txmax - *txmin + 1;
-   box_height_tiles = tymax - *tymin + 1;
-   *box_num_tiles = *box_width_tiles * box_height_tiles;
-#endif
-#if 0
-   printf("SPU %u: bounds: %g, %g  ...  %g, %g\n", spu.init.id,
-          render->xmin, render->ymin, render->xmax, render->ymax);
-   printf("SPU %u: tiles:  %u, %u .. %u, %u\n",
-           spu.init.id, *txmin, *tymin, txmax, tymax);
-   ASSERT(render->xmin <= render->xmax);
-   ASSERT(render->ymin <= render->ymax);
-#endif
-}
-
-
-/** Check if the tile at (tx,ty) belongs to this SPU */
-static INLINE boolean
-my_tile(uint tx, uint ty)
-{
-   return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id;
-}
-
-
-/**
- * Render primitives
- * \param pos_incr  returns value indicating how may words to skip after
- *                  this command in the batch buffer
- */
-static void
-cmd_render(const struct cell_command_render *render, uint *pos_incr)
-{
-   /* we'll DMA into these buffers */
-   ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
-   const uint vertex_size = render->vertex_size; /* in bytes */
-   /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
-   const ubyte *vertices;
-   const ushort *indexes;
-   uint i, j;
-
-
-   if (Debug) {
-      printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
-             "inline_vert=%u\n",
-             spu.init.id,
-             render->prim_type,
-             render->num_verts,
-             render->num_indexes,
-             render->inline_verts);
-
-      /*
-      printf("       bound: %g, %g .. %g, %g\n",
-             render->xmin, render->ymin, render->xmax, render->ymax);
-      */
-   }
-
-   ASSERT(sizeof(*render) % 4 == 0);
-   ASSERT(total_vertex_bytes % 16 == 0);
-
-   /* indexes are right after the render command in the batch buffer */
-   indexes = (const ushort *) (render + 1);
-   *pos_incr = (render->num_indexes * 2 + 3) / 4;
-
-
-   if (render->inline_verts) {
-      /* Vertices are right after indexes in batch buffer */
-      vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
-      *pos_incr = *pos_incr + total_vertex_bytes / 4;
-   }
-   else {
-      /* Begin DMA fetch of vertex buffer */
-      ubyte *src = spu.init.buffers[render->vertex_buf];
-      ubyte *dest = vertex_data;
-
-      /* skip vertex data we won't use */
-#if 01
-      src += render->min_index * vertex_size;
-      dest += render->min_index * vertex_size;
-      total_vertex_bytes -= render->min_index * vertex_size;
-#endif
-      ASSERT(total_vertex_bytes % 16 == 0);
-      ASSERT_ALIGN16(dest);
-      ASSERT_ALIGN16(src);
-
-      mfc_get(dest,   /* in vertex_data[] array */
-              (unsigned int) src,  /* src in main memory */
-              total_vertex_bytes,  /* size */
-              TAG_VERTEX_BUFFER,
-              0, /* tid */
-              0  /* rid */);
-
-      vertices = vertex_data;
-
-      wait_on_mask(1 << TAG_VERTEX_BUFFER);
-   }
-
-
-   /**
-    ** find tiles which intersect the prim bounding box
-    **/
-   uint txmin, tymin, box_width_tiles, box_num_tiles;
-   tile_bounding_box(render, &txmin, &tymin,
-                     &box_num_tiles, &box_width_tiles);
-
-
-   /* make sure any pending clears have completed */
-   wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
-
-
-   /**
-    ** loop over tiles, rendering tris
-    **/
-   for (i = 0; i < box_num_tiles; i++) {
-      const uint tx = txmin + i % box_width_tiles;
-      const uint ty = tymin + i / box_width_tiles;
-
-      ASSERT(tx < spu.fb.width_tiles);
-      ASSERT(ty < spu.fb.height_tiles);
-
-      if (!my_tile(tx, ty))
-         continue;
-
-      /* Start fetching color/z tiles.  We'll wait for completion when
-       * we need read/write to them later in triangle rasterization.
-       */
-      if (spu.depth_stencil.depth.enabled) {
-         if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) {
-            get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
-         }
-      }
-
-      if (tile_status[ty][tx] != TILE_STATUS_CLEAR) {
-         get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
-      }
-
-      ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
-      ASSERT(render->num_indexes % 3 == 0);
-
-      /* loop over tris */
-      for (j = 0; j < render->num_indexes; j += 3) {
-         const float *v0, *v1, *v2;
-
-         v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
-         v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
-         v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
-
-         tri_draw(v0, v1, v2, tx, ty);
-      }
-
-      /* write color/z tiles back to main framebuffer, if dirtied */
-      if (tile_status[ty][tx] == TILE_STATUS_DIRTY) {
-         put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
-         tile_status[ty][tx] = TILE_STATUS_DEFINED;
-      }
-      if (spu.depth_stencil.depth.enabled) {
-         if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) {
-            put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
-            tile_status_z[ty][tx] = TILE_STATUS_DEFINED;
-         }
-      }
-
-      /* XXX move these... */
-      wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
-      if (spu.depth_stencil.depth.enabled) {
-         wait_on_mask(1 << TAG_WRITE_TILE_Z);
-      }
-   }
-
-   if (Debug)
-      printf("SPU %u: RENDER done\n",
-             spu.init.id);
-}
-
-
 static void
 cmd_release_verts(const struct cell_command_release_verts *release)
 {
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 480c54ebd0..009e046ba5 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -70,6 +70,7 @@ struct spu_global
 
 
 extern struct spu_global spu;
+extern boolean Debug;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
new file mode 100644
index 0000000000..21a286a23d
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -0,0 +1,240 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include <stdio.h>
+#include <libmisc.h>
+#include <spu_mfcio.h>
+
+#include "spu_main.h"
+#include "spu_render.h"
+#include "spu_tri.h"
+#include "spu_tile.h"
+#include "pipe/cell/common.h"
+
+
+
+/**
+ * Map a render command's pixel bounding box (xmin..xmax, ymin..ymax) to tiles:
+ * outputs the first tile column/row, the box width in tiles and its tile count.
+ */
+static INLINE void
+tile_bounding_box(const struct cell_command_render *render,
+                  uint *txmin, uint *tymin,
+                  uint *box_num_tiles, uint *box_width_tiles)
+{
+#if 0
+   /* Debug: full-window bounding box */
+   uint txmax = spu.fb.width_tiles - 1;
+   uint tymax = spu.fb.height_tiles - 1;
+   *txmin = 0;
+   *tymin = 0;
+   *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+   *box_width_tiles = spu.fb.width_tiles;
+   (void) render;
+   (void) txmax;
+   (void) tymax;
+#else
+   uint txmax, tymax, box_height_tiles;
+
+   *txmin = (uint) render->xmin / TILE_SIZE;  /* cast applies first: truncate to uint, then integer-divide */
+   *tymin = (uint) render->ymin / TILE_SIZE;  /* NOTE(review): assumes the bounds are non-negative */
+   txmax = (uint) render->xmax / TILE_SIZE;
+   tymax = (uint) render->ymax / TILE_SIZE;
+   *box_width_tiles = txmax - *txmin + 1;  /* +1: txmax is an inclusive tile index */
+   box_height_tiles = tymax - *tymin + 1;
+   *box_num_tiles = *box_width_tiles * box_height_tiles;
+#endif
+#if 0
+   printf("SPU %u: bounds: %g, %g  ...  %g, %g\n", spu.init.id,
+          render->xmin, render->ymin, render->xmax, render->ymax);
+   printf("SPU %u: tiles:  %u, %u .. %u, %u\n",
+           spu.init.id, *txmin, *tymin, txmax, tymax);
+   ASSERT(render->xmin <= render->xmax);
+   ASSERT(render->ymin <= render->ymax);
+#endif
+}
+
+
+/** Is tile (tx,ty) ours?  True when its row-major index % num_spus == our id. */
+static INLINE boolean
+my_tile(uint tx, uint ty)
+{
+   return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id;
+}
+
+
+/**
+ * Render the indexed triangles of one RENDER command into this SPU's tiles.
+ * \param pos_incr  returns value indicating how many words to skip after
+ *                  this command in the batch buffer
+ */
+void
+cmd_render(const struct cell_command_render *render, uint *pos_incr)
+{
+   /* destination of the vertex DMA below (not used for inline vertices) */
+   ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+   const uint vertex_size = render->vertex_size; /* in bytes */
+   /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
+   const ubyte *vertices;
+   const ushort *indexes;
+   uint i, j;
+
+
+   if (Debug) {
+      printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
+             "inline_vert=%u\n",
+             spu.init.id,
+             render->prim_type,
+             render->num_verts,
+             render->num_indexes,
+             render->inline_verts);
+
+      /*
+      printf("       bound: %g, %g .. %g, %g\n",
+             render->xmin, render->ymin, render->xmax, render->ymax);
+      */
+   }
+
+   ASSERT(sizeof(*render) % 4 == 0);
+   ASSERT(total_vertex_bytes % 16 == 0);
+
+   /* indexes are right after the render command in the batch buffer */
+   indexes = (const ushort *) (render + 1);
+   *pos_incr = (render->num_indexes * 2 + 3) / 4;  /* 2-byte indexes, rounded up to whole 4-byte words */
+
+
+   if (render->inline_verts) {
+      /* Vertices are right after indexes in batch buffer */
+      vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
+      *pos_incr = *pos_incr + total_vertex_bytes / 4;  /* also skip the inline vertex words */
+   }
+   else {
+      /* Begin DMA fetch of vertex buffer */
+      ubyte *src = spu.init.buffers[render->vertex_buf];
+      ubyte *dest = vertex_data;
+
+      /* skip vertex data below min_index; dest moves too so indexes[] stay valid */
+#if 01
+      src += render->min_index * vertex_size;
+      dest += render->min_index * vertex_size;
+      total_vertex_bytes -= render->min_index * vertex_size;
+#endif
+      ASSERT(total_vertex_bytes % 16 == 0);
+      ASSERT_ALIGN16(dest);
+      ASSERT_ALIGN16(src);
+
+      mfc_get(dest,   /* in vertex_data[] array */
+              (unsigned int) src,  /* src in main memory */
+              total_vertex_bytes,  /* size */
+              TAG_VERTEX_BUFFER,
+              0, /* tid */
+              0  /* rid */);
+
+      vertices = vertex_data;
+
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);  /* presumably blocks until the fetch above completes */
+   }
+
+
+   /**
+    ** find tiles which intersect the prim bounding box
+    **/
+   uint txmin, tymin, box_width_tiles, box_num_tiles;
+   tile_bounding_box(render, &txmin, &tymin,
+                     &box_num_tiles, &box_width_tiles);
+
+
+   /* make sure any pending clears have completed */
+   wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
+
+
+   /**
+    ** loop over tiles, rendering tris
+    **/
+   for (i = 0; i < box_num_tiles; i++) {
+      const uint tx = txmin + i % box_width_tiles;  /* row-major walk of the tile box */
+      const uint ty = tymin + i / box_width_tiles;
+
+      ASSERT(tx < spu.fb.width_tiles);
+      ASSERT(ty < spu.fb.height_tiles);
+
+      if (!my_tile(tx, ty))  /* tile belongs to another SPU */
+         continue;
+
+      /* Start fetching color/z tiles.  We'll wait for completion when
+       * we need read/write to them later in triangle rasterization.
+       */
+      if (spu.depth_stencil.depth.enabled) {
+         if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) {
+            get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
+         }
+      }
+
+      if (tile_status[ty][tx] != TILE_STATUS_CLEAR) {
+         get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
+      }
+
+      ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);  /* the loop below consumes indexes three at a time */
+      ASSERT(render->num_indexes % 3 == 0);
+
+      /* loop over tris: every triangle in the command is passed to tri_draw() for this tile */
+      for (j = 0; j < render->num_indexes; j += 3) {
+         const float *v0, *v1, *v2;
+
+         v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
+         v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
+         v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
+
+         tri_draw(v0, v1, v2, tx, ty);
+      }
+
+      /* write color/z tiles back to main framebuffer, if dirtied */
+      if (tile_status[ty][tx] == TILE_STATUS_DIRTY) {
+         put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
+         tile_status[ty][tx] = TILE_STATUS_DEFINED;
+      }
+      if (spu.depth_stencil.depth.enabled) {
+         if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) {
+            put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
+            tile_status_z[ty][tx] = TILE_STATUS_DEFINED;
+         }
+      }
+
+      /* XXX move these... */
+      wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
+      if (spu.depth_stencil.depth.enabled) {
+         wait_on_mask(1 << TAG_WRITE_TILE_Z);
+      }
+   }
+
+   if (Debug)
+      printf("SPU %u: RENDER done\n",
+             spu.init.id);
+}
+
+
diff --git a/src/mesa/pipe/cell/spu/spu_render.h b/src/mesa/pipe/cell/spu/spu_render.h
new file mode 100644
index 0000000000..fbcdc5ec31
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_render.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_RENDER_H
+#define SPU_RENDER_H
+
+#include "pipe/cell/common.h"
+
+extern void
+cmd_render(const struct cell_command_render *render, uint *pos_incr);
+
+#endif /* SPU_RENDER_H */
+
-- 
cgit v1.2.3


From 1c65928d8400a350993687d7039e5e47371ae8b8 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 18:17:55 -0700
Subject: Cell: add OPT_FLAGS var

---
 configs/linux-cell | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/configs/linux-cell b/configs/linux-cell
index 4f0086cc1f..3d874491e4 100644
--- a/configs/linux-cell
+++ b/configs/linux-cell
@@ -10,11 +10,13 @@ CC = ppu32-gcc
 CXX = ppu32-g++
 HOST_CC = gcc
 
+OPT_FLAGS = -g
+
 # Cell SDK location
 SDK = /opt/ibm/cell-sdk/prototype/sysroot/usr
 
 
-CFLAGS = -g -Wall -Winline -fPIC -m32 -mabi=altivec -maltivec -I. -I$(SDK)/include -DGALLIUM_CELL
+CFLAGS = $(OPT_FLAGS) -Wall -Winline -fPIC -m32 -mabi=altivec -maltivec -I. -I$(SDK)/include -DGALLIUM_CELL
 
 CXXFLAGS = $(CFLAGS)
 
@@ -34,7 +36,7 @@ GL_LIB_DEPS = $(EXTRA_LIB_PATH) -lX11 -lXext -lm -lpthread \
 
 SPU_CC = spu-gcc
 
-SPU_CFLAGS = -g -W -Wall -Winline -Wmissing-prototypes -Wno-main -I. -I $(SDK)/spu/include -include spu_intrinsics.h -I $(TOP)/src/mesa/
+SPU_CFLAGS = $(OPT_FLAGS) -W -Wall -Winline -Wmissing-prototypes -Wno-main -I. -I $(SDK)/spu/include -include spu_intrinsics.h -I $(TOP)/src/mesa/
 
 SPU_LFLAGS = -L$(SDK)/spu/lib -Wl,-N -lmisc
 
-- 
cgit v1.2.3


From 7710b36d28859222f9b0bf03ab3d0cdf79d39c64 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 29 Jan 2008 11:21:29 +0900
Subject: gallium: Add extern keyword to global.

---
 src/mesa/pipe/draw/draw_vf.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
index 7d90f35b0f..c0fa063c52 100644
--- a/src/mesa/pipe/draw/draw_vf.h
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -195,7 +195,8 @@ struct draw_vf_format_info {
    const unsigned attrsize;
 };
 
-const struct draw_vf_format_info draw_vf_format_info[DRAW_EMIT_MAX];
+extern const struct draw_vf_format_info 
+draw_vf_format_info[DRAW_EMIT_MAX];
 
 
 #endif
-- 
cgit v1.2.3


From 8a88f5e40f75ac52d02c1afbcc7dd612904b4f78 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 29 Jan 2008 18:01:17 +0900
Subject: gallium: Allow draw_vf usage to be controlled at runtime.

---
 src/mesa/pipe/draw/draw_vbuf.c | 149 ++++++++++++++++++++---------------------
 1 file changed, 71 insertions(+), 78 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c
index 8ca225c65a..2309ed9f12 100644
--- a/src/mesa/pipe/draw/draw_vbuf.c
+++ b/src/mesa/pipe/draw/draw_vbuf.c
@@ -127,15 +127,9 @@ emit_vertex( struct vbuf_stage *vbuf,
              struct vertex_header *vertex )
 {
 #if 0
-   const struct vertex_info *vinfo = vbuf->vinfo;
-
-   uint i;
-   uint count = 0;  /* for debug/sanity */
-   
-   assert(vinfo == vbuf->render->get_vertex_info(vbuf->render));
-
-//   fprintf(stderr, "emit vertex %d to %p\n", 
-//           vbuf->nr_vertices, vbuf->vertex_ptr);
+   fprintf(stderr, "emit vertex %d to %p\n", 
+           vbuf->nr_vertices, vbuf->vertex_ptr);
+#endif
 
    if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
       if(vertex->vertex_id < vbuf->nr_vertices)
@@ -148,75 +142,72 @@ emit_vertex( struct vbuf_stage *vbuf,
       
    vertex->vertex_id = vbuf->nr_vertices++;
 
-   for (i = 0; i < vinfo->num_attribs; i++) {
-      uint j = vinfo->src_index[i];
-      switch (vinfo->emit[i]) {
-      case EMIT_OMIT:
-         /* no-op */
-         break;
-      case EMIT_ALL:
-         /* just copy the whole vertex as-is to the vbuf */
-         assert(i == 0);
-         assert(j == 0);
-         memcpy(vbuf->vertex_ptr, vertex, vinfo->size * 4);
-         vbuf->vertex_ptr += vinfo->size;
-         count += vinfo->size;
-         break;
-      case EMIT_1F:
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-         count++;
-         break;
-      case EMIT_1F_PSIZE:
-         *vbuf->vertex_ptr++ = fui(vbuf->stage.draw->rasterizer->point_size);
-         count++;
-         break;
-      case EMIT_2F:
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
-         count += 2;
-         break;
-      case EMIT_3F:
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
-         count += 3;
-         break;
-      case EMIT_4F:
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][3]);
-         count += 4;
-         break;
-      case EMIT_4UB:
-	 *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
-                                        float_to_ubyte( vertex->data[j][1] ),
-                                        float_to_ubyte( vertex->data[j][0] ),
-                                        float_to_ubyte( vertex->data[j][3] ));
-         count += 1;
-         break;
-      default:
-         assert(0);
+   if(!vbuf->vf) {
+      const struct vertex_info *vinfo = vbuf->vinfo;
+      uint i;
+      uint count = 0;  /* for debug/sanity */
+      
+      assert(vinfo == vbuf->render->get_vertex_info(vbuf->render));
+
+      for (i = 0; i < vinfo->num_attribs; i++) {
+         uint j = vinfo->src_index[i];
+         switch (vinfo->emit[i]) {
+         case EMIT_OMIT:
+            /* no-op */
+            break;
+         case EMIT_ALL:
+            /* just copy the whole vertex as-is to the vbuf */
+            assert(i == 0);
+            assert(j == 0);
+            memcpy(vbuf->vertex_ptr, vertex, vinfo->size * 4);
+            vbuf->vertex_ptr += vinfo->size;
+            count += vinfo->size;
+            break;
+         case EMIT_1F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            count++;
+            break;
+         case EMIT_1F_PSIZE:
+            *vbuf->vertex_ptr++ = fui(vbuf->stage.draw->rasterizer->point_size);
+            count++;
+            break;
+         case EMIT_2F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            count += 2;
+            break;
+         case EMIT_3F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
+            count += 3;
+            break;
+         case EMIT_4F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][3]);
+            count += 4;
+            break;
+         case EMIT_4UB:
+   	 *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
+                                           float_to_ubyte( vertex->data[j][1] ),
+                                           float_to_ubyte( vertex->data[j][0] ),
+                                           float_to_ubyte( vertex->data[j][3] ));
+            count += 1;
+            break;
+         default:
+            assert(0);
+         }
       }
+      assert(count == vinfo->size);
    }
-   assert(count == vinfo->size);
-#else
-   if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
-      if(vertex->vertex_id < vbuf->nr_vertices)
-	 return;
-      else
-	 fprintf(stderr, "Bad vertex id 0x%04x (>= 0x%04x)\n", 
-	         vertex->vertex_id, vbuf->nr_vertices);
-      return;
+   else {
+      draw_vf_set_data(vbuf->vf, vertex->data);
+      draw_vf_emit_vertices(vbuf->vf, 1, vbuf->vertex_ptr);
+   
+      vbuf->vertex_ptr += vbuf->vertex_size/4;
    }
-      
-   vertex->vertex_id = vbuf->nr_vertices++;
-
-   draw_vf_set_data(vbuf->vf, vertex->data);
-   draw_vf_emit_vertices(vbuf->vf, 1, vbuf->vertex_ptr);
-
-   vbuf->vertex_ptr += vbuf->vertex_size/4;
-#endif
 }
 
 
@@ -229,6 +220,9 @@ vbuf_set_vf_attributes(struct vbuf_stage *vbuf )
    uint count = 0;  /* for debug/sanity */
    unsigned nr_attrs = 0;
    
+   if(!vbuf->vf)
+      return;
+   
 //   fprintf(stderr, "emit vertex %d to %p\n", 
 //           vbuf->nr_vertices, vbuf->vertex_ptr);
 
@@ -625,9 +619,8 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
 
    vbuf->prim = ~0;
    
-   vbuf->vf = draw_vf_create();
-   if(!vbuf->vf)
-      vbuf_destroy(&vbuf->stage);
+   if(!GETENV("GALLIUM_NOVF"))
+      vbuf->vf = draw_vf_create();
    
    return &vbuf->stage;
 }
-- 
cgit v1.2.3


From c74f4a10f91acc4eca109c1be39fd320639bfa59 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 29 Jan 2008 20:46:48 +0900
Subject: gallium: Emit constants.

---
 src/mesa/pipe/draw/draw_vf.c         | 10 +++++++--
 src/mesa/pipe/draw/draw_vf.h         | 22 ++++++++++++++++++-
 src/mesa/pipe/draw/draw_vf_generic.c | 42 +++++++++++++++++++++++++-----------
 src/mesa/pipe/draw/draw_vf_sse.c     |  4 ++++
 4 files changed, 62 insertions(+), 16 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index 4fc2312ad1..958d31933b 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -182,6 +182,9 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 	 vf->attr[j].insert = draw_vf_format_info[format].insert;
 	 vf->attr[j].vertattrsize = draw_vf_format_info[format].attrsize;
 	 vf->attr[j].vertoffset = offset;
+	 vf->attr[j].isconst = draw_vf_format_info[format].isconst;
+	 if(vf->attr[j].isconst)
+	    memcpy(vf->attr[j].data, &map[i].data, vf->attr[j].vertattrsize);
 	 
 	 if (DBG)
 	    _mesa_printf("%d: %s, offset %d\n", i,  
@@ -240,8 +243,11 @@ void draw_vf_set_data( struct draw_vertex_fetch *vf,
    for (j = 0; j < vf->attr_count; j++) {
       a[j].inputstride = 0; /* XXX: one-vertex-max ATM */ 
       a[j].inputsize = 4;
-      a[j].do_insert = a[j].insert[4 - 1]; 
-      a[j].inputptr = (uint8_t *)&data[a[j].attrib][0];
+      a[j].do_insert = a[j].insert[4 - 1];
+      if(a[j].isconst)
+	 a[j].inputptr = a[j].data;
+      else
+	 a[j].inputptr = (uint8_t *)&data[a[j].attrib][0];
    }
 }
 
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
index c0fa063c52..911ea07bdf 100644
--- a/src/mesa/pipe/draw/draw_vf.h
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -48,14 +48,30 @@ enum draw_vf_attr_format {
    DRAW_EMIT_4UB_4F_BGRA,		/**< for color */
    DRAW_EMIT_4UB_4F_ARGB,		/**< for color */
    DRAW_EMIT_4UB_4F_ABGR,		/**< for color */
+   DRAW_EMIT_1F_CONST,
+   DRAW_EMIT_2F_CONST,
+   DRAW_EMIT_3F_CONST,
+   DRAW_EMIT_4F_CONST,
    DRAW_EMIT_PAD,			/**< leave a hole of 'offset' bytes */
    DRAW_EMIT_MAX
 };
 
-struct draw_vf_attr_map {
+struct draw_vf_attr_map 
+{
+   /** Input attribute number */
    unsigned attrib;
+   
    enum draw_vf_attr_format format;
+   
    unsigned offset;
+   
+   /** 
+    * Constant data for DRAW_EMIT_*_CONST 
+    */
+   union {
+      uint8_t ub[4];
+      float f[4];
+   } data;
 };
 
 struct draw_vertex_fetch;
@@ -124,6 +140,9 @@ struct draw_vf_attr
    unsigned inputsize;
    unsigned inputstride;
    unsigned vertoffset;      /**< position of the attrib in the vertex struct */
+   
+   boolean isconst;              /**< read from const data below */
+   uint8_t data[16];
 
    unsigned attrib;          /**< which vertex attrib (0=position, etc) */
    unsigned vertattrsize;    /**< size of the attribute in bytes */
@@ -193,6 +212,7 @@ struct draw_vf_format_info {
    const char *name;
    draw_vf_insert_func insert[4];
    const unsigned attrsize;
+   const boolean isconst;
 };
 
 extern const struct draw_vf_format_info 
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
index a16eb456b7..0caa798396 100644
--- a/src/mesa/pipe/draw/draw_vf_generic.c
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -387,62 +387,78 @@ const struct draw_vf_format_info draw_vf_format_info[DRAW_EMIT_MAX] =
 {
    { "1f",
      { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
-     sizeof(float) },
+     sizeof(float), FALSE },
 
    { "2f",
      { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
-     2 * sizeof(float) },
+     2 * sizeof(float), FALSE },
 
    { "3f",
      { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
-     3 * sizeof(float) },
+     3 * sizeof(float), FALSE },
 
    { "4f",
      { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
-     4 * sizeof(float) },
+     4 * sizeof(float), FALSE },
 
    { "3f_xyw",
      { insert_3f_xyw_err, insert_3f_xyw_err, insert_3f_xyw_err, 
        insert_3f_xyw_4 },
-     3 * sizeof(float) },
+     3 * sizeof(float), FALSE },
 
    { "1ub_1f",
      { insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1 },
-     sizeof(uint8_t) },
+     sizeof(uint8_t), FALSE },
 
    { "3ub_3f_rgb",
      { insert_3ub_3f_rgb_1, insert_3ub_3f_rgb_2, insert_3ub_3f_rgb_3,
        insert_3ub_3f_rgb_3 },
-     3 * sizeof(uint8_t) },
+     3 * sizeof(uint8_t), FALSE },
 
    { "3ub_3f_bgr",
      { insert_3ub_3f_bgr_1, insert_3ub_3f_bgr_2, insert_3ub_3f_bgr_3,
        insert_3ub_3f_bgr_3 },
-     3 * sizeof(uint8_t) },
+     3 * sizeof(uint8_t), FALSE },
 
    { "4ub_4f_rgba",
      { insert_4ub_4f_rgba_1, insert_4ub_4f_rgba_2, insert_4ub_4f_rgba_3, 
        insert_4ub_4f_rgba_4 },
-     4 * sizeof(uint8_t) },
+     4 * sizeof(uint8_t), FALSE },
 
    { "4ub_4f_bgra",
      { insert_4ub_4f_bgra_1, insert_4ub_4f_bgra_2, insert_4ub_4f_bgra_3,
        insert_4ub_4f_bgra_4 },
-     4 * sizeof(uint8_t) },
+     4 * sizeof(uint8_t), FALSE },
 
    { "4ub_4f_argb",
      { insert_4ub_4f_argb_1, insert_4ub_4f_argb_2, insert_4ub_4f_argb_3,
        insert_4ub_4f_argb_4 },
-     4 * sizeof(uint8_t) },
+     4 * sizeof(uint8_t), FALSE },
 
    { "4ub_4f_abgr",
      { insert_4ub_4f_abgr_1, insert_4ub_4f_abgr_2, insert_4ub_4f_abgr_3,
        insert_4ub_4f_abgr_4 },
-     4 * sizeof(uint8_t) },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "1f_const",
+     { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
+     sizeof(float), TRUE },
+   
+   { "2f_const",
+     { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
+     2 * sizeof(float), TRUE },
+   
+   { "3f_const",
+     { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
+     3 * sizeof(float), TRUE },
+   
+   { "4f_const",
+     { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
+     4 * sizeof(float), TRUE },
 
    { "pad",
      { NULL, NULL, NULL, NULL },
-     0 }
+     0, FALSE },
 
 };
 
diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
index 4036ded1d8..1389e6cfb9 100644
--- a/src/mesa/pipe/draw/draw_vf_sse.c
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -388,18 +388,21 @@ static boolean build_vertex_emit( struct x86_program *p )
        */
       switch (a->format) {
       case DRAW_EMIT_1F:
+      case DRAW_EMIT_1F_CONST:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
 	 emit_store(p, dest, 1, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
       case DRAW_EMIT_2F:
+      case DRAW_EMIT_2F_CONST:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
 	 emit_store(p, dest, 2, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
       case DRAW_EMIT_3F:
+      case DRAW_EMIT_3F_CONST:
 	 /* Potentially the worst case - hardcode 2+1 copying:
 	  */
 	 if (0) {
@@ -423,6 +426,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 }
 	 break;
       case DRAW_EMIT_4F:
+      case DRAW_EMIT_4F_CONST:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 	 emit_store(p, dest, 4, temp);
-- 
cgit v1.2.3


From 2da0724e99785c2bf854fc8a7ba40765b0563088 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 29 Jan 2008 20:47:30 +0900
Subject: gallium: Emit point size as a constant.

---
 src/mesa/pipe/draw/draw_vbuf.c | 85 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 81 insertions(+), 4 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c
index 2309ed9f12..92a8b9fbcf 100644
--- a/src/mesa/pipe/draw/draw_vbuf.c
+++ b/src/mesa/pipe/draw/draw_vbuf.c
@@ -115,6 +115,70 @@ check_space( struct vbuf_stage *vbuf, unsigned nr )
 }
 
 
+#if 0
+static INLINE void
+dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
+{
+   assert(vinfo == vbuf->render->get_vertex_info(vbuf->render));
+   unsigned i, j, k;
+
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      j = vinfo->src_index[i];
+      switch (vinfo->emit[i]) {
+      case EMIT_OMIT:
+         fprintf(stderr, "EMIT_OMIT:");
+         break;
+      case EMIT_ALL:
+         assert(i == 0);
+         assert(j == 0);
+         fprintf(stderr, "EMIT_ALL:\t");
+         for(k = 0; k < vinfo->size*4; ++k)
+            fprintf(stderr, "%02x ", *data++);
+         break;
+      case EMIT_1F:
+         fprintf(stderr, "EMIT_1F:\t");
+         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         break;
+      case EMIT_1F_PSIZE:
+         fprintf(stderr, "EMIT_1F_PSIZE:\t");
+         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         break;
+      case EMIT_2F:
+         fprintf(stderr, "EMIT_2F:\t");
+         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         break;
+      case EMIT_3F:
+         fprintf(stderr, "EMIT_3F:\t");
+         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         data += sizeof(float);
+         break;
+      case EMIT_4F:
+         fprintf(stderr, "EMIT_4F:\t");
+         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         break;
+      case EMIT_4UB:
+         fprintf(stderr, "EMIT_4UB:\t");
+         fprintf(stderr, "%u ", *data++);
+         fprintf(stderr, "%u ", *data++);
+         fprintf(stderr, "%u ", *data++);
+         fprintf(stderr, "%u ", *data++);
+         break;
+      default:
+         assert(0);
+      }
+      fprintf(stderr, "\n");
+   }
+   fprintf(stderr, "\n");
+}
+#endif
+
+
 /**
  * Extract the needed fields from post-transformed vertex and emit
  * a hardware(driver) vertex.
@@ -190,7 +254,7 @@ emit_vertex( struct vbuf_stage *vbuf,
             count += 4;
             break;
          case EMIT_4UB:
-   	 *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
+            *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
                                            float_to_ubyte( vertex->data[j][1] ),
                                            float_to_ubyte( vertex->data[j][0] ),
                                            float_to_ubyte( vertex->data[j][3] ));
@@ -201,6 +265,20 @@ emit_vertex( struct vbuf_stage *vbuf,
          }
       }
       assert(count == vinfo->size);
+#if 0
+      {
+	 static float data[256]; 
+	 draw_vf_set_data(vbuf->vf, vertex->data);
+	 draw_vf_emit_vertices(vbuf->vf, 1, data);
+	 if(memcmp((uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size, data, vbuf->vertex_size)) {
+            fprintf(stderr, "With VF:\n");
+            dump_emitted_vertex(vbuf->vinfo, (uint8_t *)data);
+	    fprintf(stderr, "Without VF:\n");
+	    dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size);
+	    assert(0);
+	 }
+      }
+#endif
    }
    else {
       draw_vf_set_data(vbuf->vf, vertex->data);
@@ -297,11 +375,10 @@ vbuf_set_vf_attributes(struct vbuf_stage *vbuf )
          count++;
          break;
       case EMIT_1F_PSIZE:
-	 /* FIXME */
-	 assert(0);
 	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_PAD;
+	 attrs[nr_attrs].format = DRAW_EMIT_1F_CONST;
 	 attrs[nr_attrs].offset = 0;
+	 attrs[nr_attrs].data.f[0] = vbuf->stage.draw->rasterizer->point_size;
 	 nr_attrs++;
          count++;
          break;
-- 
cgit v1.2.3


From 88469bf544b62fb0786d88383901914140afa56c Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Tue, 29 Jan 2008 12:37:07 +0000
Subject: gallium: don't rely on assert(0) for error handling - may be disabled

---
 src/mesa/state_tracker/st_draw.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index 8ef50ee768..c9b8e78485 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -298,6 +298,7 @@ st_draw_vbo(GLcontext *ctx,
          break;
       default:
          assert(0);
+	 return;
       }
 
       /* get/create the index buffer object */
@@ -570,6 +571,7 @@ st_feedback_draw_vbo(GLcontext *ctx,
          break;
       default:
          assert(0);
+	 return;
       }
 
       map = pipe->winsys->buffer_map(pipe->winsys,
-- 
cgit v1.2.3


From d7d3c752368a236dd4755b00175d0e13212fac47 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Tue, 29 Jan 2008 12:37:47 +0000
Subject: gallium: streamline various unfilled & stippled paths

---
 src/mesa/pipe/draw/draw_prim.c | 158 +++++++++++++++++++++++++++++------------
 1 file changed, 113 insertions(+), 45 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_prim.c b/src/mesa/pipe/draw/draw_prim.c
index 2a612a1673..41b3fddcc1 100644
--- a/src/mesa/pipe/draw/draw_prim.c
+++ b/src/mesa/pipe/draw/draw_prim.c
@@ -69,28 +69,46 @@ static void draw_prim_queue_flush( struct draw_context *draw )
     * draw->pipeline->first is often changed by the first call to tri(),
     * line(), etc.
     */
-   switch (draw->reduced_prim) {
-   case RP_TRI:
-      for (i = 0; i < draw->pq.queue_nr; i++) {
-	 if (draw->pq.queue[i].reset_line_stipple)
-	    draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-
-	 draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+   if (draw->rasterizer->line_stipple_enable) {
+      switch (draw->reduced_prim) {
+      case RP_TRI:
+	 for (i = 0; i < draw->pq.queue_nr; i++) {
+	    if (draw->pq.queue[i].reset_line_stipple)
+	       draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	    
+	    draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+	 }
+	 break;
+      case RP_LINE:
+	 for (i = 0; i < draw->pq.queue_nr; i++) {
+	    if (draw->pq.queue[i].reset_line_stipple)
+	       draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	    
+	    draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+	 }
+	 break;
+      case RP_POINT:
+	 draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	 for (i = 0; i < draw->pq.queue_nr; i++)
+	    draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
       }
-      break;
-   case RP_LINE:
-      for (i = 0; i < draw->pq.queue_nr; i++) {
-	 if (draw->pq.queue[i].reset_line_stipple)
-	    draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-
-	 draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+   }
+   else {
+      switch (draw->reduced_prim) {
+      case RP_TRI:
+	 for (i = 0; i < draw->pq.queue_nr; i++) 
+	    draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
+      case RP_LINE:
+	 for (i = 0; i < draw->pq.queue_nr; i++) 
+	    draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
+      case RP_POINT:
+	 for (i = 0; i < draw->pq.queue_nr; i++)
+	    draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
       }
-      break;
-   case RP_POINT:
-      draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-      for (i = 0; i < draw->pq.queue_nr; i++)
-	 draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
-      break;
    }
 
    draw->pq.queue_nr = 0;   
@@ -231,7 +249,7 @@ static void do_ef_triangle( struct draw_context *draw,
 }
 
 
-static void do_quad( struct draw_context *draw,
+static void do_ef_quad( struct draw_context *draw,
 		     unsigned v0,
 		     unsigned v1,
 		     unsigned v2,
@@ -243,6 +261,16 @@ static void do_quad( struct draw_context *draw,
    do_ef_triangle( draw, 0, omitEdge3, v1, v2, v3 );
 }
 
+static void do_quad( struct draw_context *draw,
+		     unsigned v0,
+		     unsigned v1,
+		     unsigned v2,
+		     unsigned v3 )
+{
+   do_triangle( draw, v0, v1, v3 );
+   do_triangle( draw, v1, v2, v3 );
+}
+
 
 /**
  * Main entrypoint to draw some number of points/lines/triangles
@@ -252,6 +280,8 @@ draw_prim( struct draw_context *draw,
 	   unsigned prim, unsigned start, unsigned count )
 {
    unsigned i;
+   boolean unfilled = (draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
+		       draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL);
 
 //   _mesa_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count );
 
@@ -289,24 +319,32 @@ draw_prim( struct draw_context *draw,
       break;
 
    case PIPE_PRIM_LINE_STRIP:
-      if (count >= 2) {
-	 for (i = 1; i < count; i++) {
-	    do_line( draw,
-		     i == 1,
-		     start + i - 1,
-		     start + i );
-	 }
+      for (i = 1; i < count; i++) {
+	 do_line( draw,
+		  i == 1,
+		  start + i - 1,
+		  start + i );
       }
       break;
 
    case PIPE_PRIM_TRIANGLES:
-      for (i = 0; i+2 < count; i += 3) {
-	 do_ef_triangle( draw,
-			 1, 
-			 ~0,
+      if (unfilled) {
+	 for (i = 0; i+2 < count; i += 3) {
+	    do_ef_triangle( draw,
+			    1, 
+			    ~0,
+			    start + i + 0,
+			    start + i + 1,
+			    start + i + 2 );
+	 }
+      } 
+      else {
+	 for (i = 0; i+2 < count; i += 3) {
+	    do_triangle( draw,
 			 start + i + 0,
 			 start + i + 1,
 			 start + i + 2 );
+	 }
       }
       break;
 
@@ -340,27 +378,49 @@ draw_prim( struct draw_context *draw,
 
 
    case PIPE_PRIM_QUADS:
-      for (i = 0; i+3 < count; i += 4) {
-	 do_quad( draw,
-		  start + i + 0,
-		  start + i + 1,
-		  start + i + 2,
-		  start + i + 3);
+      if (unfilled) {
+	 for (i = 0; i+3 < count; i += 4) {
+	    do_ef_quad( draw,
+			start + i + 0,
+			start + i + 1,
+			start + i + 2,
+			start + i + 3);
+	 }
+      }
+      else {
+	 for (i = 0; i+3 < count; i += 4) {
+	    do_quad( draw,
+		     start + i + 0,
+		     start + i + 1,
+		     start + i + 2,
+		     start + i + 3);
+	 }
       }
       break;
 
    case PIPE_PRIM_QUAD_STRIP:
-      for (i = 0; i+3 < count; i += 2) {
-	 do_quad( draw,
-		  start + i + 2,
-		  start + i + 0,
-		  start + i + 1,
-		  start + i + 3);
+      if (unfilled) {
+	 for (i = 0; i+3 < count; i += 2) {
+	    do_ef_quad( draw,
+			start + i + 2,
+			start + i + 0,
+			start + i + 1,
+			start + i + 3);
+	 }
+      }
+      else {
+	 for (i = 0; i+3 < count; i += 2) {
+	    do_quad( draw,
+		     start + i + 2,
+		     start + i + 0,
+		     start + i + 1,
+		     start + i + 3);
+	 }
       }
       break;
 
    case PIPE_PRIM_POLYGON:
-      if (count >= 3) {
+      if (unfilled) {
 	 unsigned ef_mask = (1<<2) | (1<<0);
 
 	 for (i = 0; i+2 < count; i++) {
@@ -378,6 +438,14 @@ draw_prim( struct draw_context *draw,
 	    ef_mask &= ~(1<<2);
 	 }
       }
+      else {
+	 for (i = 0; i+2 < count; i++) {
+	    do_triangle( draw,
+			 start + i + 1,
+			 start + i + 2,
+			 start + 0);
+	 }
+      }
       break;
 
    default:
-- 
cgit v1.2.3


From b63f994ec7742da53b4c32ff7ee8219bbd72c2ef Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Tue, 29 Jan 2008 15:17:56 +0000
Subject: gallium: weaken assert slightly

---
 src/mesa/pipe/draw/draw_vf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index 958d31933b..06b84b93cc 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -174,7 +174,7 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 
       }
       else {
-	 assert(vf->lookup[map[i].attrib] == 0);
+	 assert(vf->lookup[map[i].attrib] == 0 || format == DRAW_EMIT_1F_CONST);
 	 vf->lookup[map[i].attrib] = &vf->attr[j];
 
 	 vf->attr[j].attrib = map[i].attrib;
-- 
cgit v1.2.3


From 2b47b5b413a6511ed45ce1e44a88822c35b084ee Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 29 Jan 2008 11:22:57 -0700
Subject: Cell: use _pack_rgba8() from pack_rgba8.h to do float[4]->uint color
 conversion

texcyl.c is twice as fast now in non-texture mode
---
 src/mesa/pipe/cell/spu/spu_tri.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 19a231d9c4..7c6a54134f 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -29,6 +29,8 @@
  * Triangle rendering within a tile.
  */
 
+#include <pack_rgba8.h>
+
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
@@ -38,7 +40,6 @@
 #include "spu_tri.h"
 
 
-
 /**
  * Simplified types taken from other parts of Gallium
  */
@@ -252,19 +253,11 @@ eval_z( struct setup_stage *setup,
 static INLINE uint
 pack_color(const float color[4])
 {
-   uint r = (uint) (color[0] * 255.0);
-   uint g = (uint) (color[1] * 255.0);
-   uint b = (uint) (color[2] * 255.0);
-   uint a = (uint) (color[3] * 255.0);
-   r = MIN2(r, 255);
-   g = MIN2(g, 255);
-   b = MIN2(b, 255);
-   a = MIN2(a, 255);
    switch (spu.fb.color_format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return (a << 24) | (r << 16) | (g << 8) | b;
+      return _pack_rgba8(color[3], color[0], color[1], color[2]);
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      return (b << 24) | (g << 16) | (r << 8) | a;
+      return _pack_rgba8(color[2], color[1], color[0], color[3]);
    default:
       ASSERT(0);
       return 0;
-- 
cgit v1.2.3


From a5273f0fac01f5864a1cfcb82d9302dd755375e9 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Wed, 30 Jan 2008 15:34:02 +1100
Subject: nouveau: 0xdeadc0de

---
 src/mesa/drivers/dri/nouveau_winsys/nouveau_bo.c   | 23 ----------------------
 .../drivers/dri/nouveau_winsys/nouveau_drmif.h     |  3 ---
 2 files changed, 26 deletions(-)

diff --git a/src/mesa/drivers/dri/nouveau_winsys/nouveau_bo.c b/src/mesa/drivers/dri/nouveau_winsys/nouveau_bo.c
index 288674f231..4c235845b7 100644
--- a/src/mesa/drivers/dri/nouveau_winsys/nouveau_bo.c
+++ b/src/mesa/drivers/dri/nouveau_winsys/nouveau_bo.c
@@ -246,29 +246,6 @@ nouveau_bo_ref(struct nouveau_device *dev, uint64_t handle,
 	return 0;
 }
 
-int
-nouveau_bo_resize(struct nouveau_bo *bo, int size)
-{
-	struct nouveau_bo_priv *nvbo = nouveau_bo(bo);
-	int ret;
-
-	if (!nvbo || nvbo->user)
-		return -EINVAL;
-
-	if (nvbo->sysmem) {
-		nvbo->sysmem = realloc(nvbo->sysmem, size);
-		if (!nvbo->sysmem)
-			return -ENOMEM;
-	} else {
-		ret = nouveau_bo_realloc_gpu(nvbo, 0, size);
-		if (ret)
-			return ret;
-	}
-
-	nvbo->base.size = size;
-	return 0;
-}
-
 void
 nouveau_bo_del(struct nouveau_bo **bo)
 {
diff --git a/src/mesa/drivers/dri/nouveau_winsys/nouveau_drmif.h b/src/mesa/drivers/dri/nouveau_winsys/nouveau_drmif.h
index 3e886869d8..7ea4c65465 100644
--- a/src/mesa/drivers/dri/nouveau_winsys/nouveau_drmif.h
+++ b/src/mesa/drivers/dri/nouveau_winsys/nouveau_drmif.h
@@ -274,9 +274,6 @@ nouveau_bo_user(struct nouveau_device *, void *ptr, int size,
 extern int
 nouveau_bo_ref(struct nouveau_device *, uint64_t handle, struct nouveau_bo **);
 
-extern int
-nouveau_bo_resize(struct nouveau_bo *, int size);
-
 extern void
 nouveau_bo_del(struct nouveau_bo **);
 
-- 
cgit v1.2.3


From fc36399f232942b3ff3975aac9e685d5f1363816 Mon Sep 17 00:00:00 2001
From: Michal Krol <michal@tungstengraphics.com>
Date: Tue, 29 Jan 2008 16:41:10 +0100
Subject: gallium: Fix build on WinXP.

---
 src/mesa/pipe/draw/draw_clip.c              |  2 +-
 src/mesa/pipe/draw/draw_vertex_fetch.c      | 16 ++++++++--------
 src/mesa/pipe/draw/draw_vertex_shader.c     |  4 ++--
 src/mesa/pipe/draw/draw_vf.c                | 10 ++++++----
 src/mesa/pipe/draw/draw_vf_generic.c        |  2 --
 src/mesa/pipe/pipebuffer/pb_buffer_fenced.c |  4 ++--
 6 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_clip.c b/src/mesa/pipe/draw/draw_clip.c
index da20028904..61130c5600 100644
--- a/src/mesa/pipe/draw/draw_clip.c
+++ b/src/mesa/pipe/draw/draw_clip.c
@@ -406,7 +406,7 @@ clip_init_state( struct draw_stage *stage )
 {
    struct clipper *clipper = clipper_stage( stage );
 
-   clipper->flat = stage->draw->rasterizer->flatshade;
+   clipper->flat = stage->draw->rasterizer->flatshade ? TRUE : FALSE;
 
    if (clipper->flat) {
       const struct pipe_shader_state *vs = stage->draw->vertex_shader->state;
diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index 89e4c256a7..b23f487e74 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -326,6 +326,10 @@ static void fetch_xyz_rgb( struct draw_context *draw,
 			   const unsigned *elts,
 			   unsigned count )
 {
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
+
    assert(count <= 4);
 
 //   _mesa_printf("%s\n", __FUNCTION__);
@@ -333,10 +337,6 @@ static void fetch_xyz_rgb( struct draw_context *draw,
    /* loop over vertex attributes (vertex shader inputs)
     */
 
-   const unsigned *pitch   = draw->vertex_fetch.pitch;
-   const ubyte **src       = draw->vertex_fetch.src_ptr;
-   int i;
-
    for (i = 0; i < 4; i++) {
       {
 	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
@@ -366,15 +366,15 @@ static void fetch_xyz_rgb_st( struct draw_context *draw,
 			      const unsigned *elts,
 			      unsigned count )
 {
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
+
    assert(count <= 4);
 
    /* loop over vertex attributes (vertex shader inputs)
     */
 
-   const unsigned *pitch   = draw->vertex_fetch.pitch;
-   const ubyte **src       = draw->vertex_fetch.src_ptr;
-   int i;
-
    for (i = 0; i < 4; i++) {
       {
 	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index 0806e23d6c..b851da845f 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -199,7 +199,7 @@ run_vertex_program(struct draw_context *draw,
 void
 draw_vertex_shader_queue_flush(struct draw_context *draw)
 {
-   unsigned i, j;
+   unsigned i;
 
    assert(draw->vs.queue_nr != 0);
 
@@ -219,7 +219,7 @@ draw_vertex_shader_queue_flush(struct draw_context *draw)
    for (i = 0; i < draw->vs.queue_nr; i += 4) {
       struct vertex_header *dests[4];
       unsigned elts[4];
-      int n = MIN2(4, draw->vs.queue_nr - i);
+      int j, n = MIN2(4, draw->vs.queue_nr - i);
 
       for (j = 0; j < n; j++) {
          elts[j] = draw->vs.queue[i + j].elt;
diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index 06b84b93cc..0debea1f12 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -32,7 +32,7 @@
 #include "draw_vf.h"
 
 
-#define DBG 0
+#define DRAW_VF_DBG 0
 
 
 /* TODO: remove this */
@@ -166,9 +166,10 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
    for (j = 0, i = 0; i < nr; i++) {
       const unsigned format = map[i].format;
       if (format == DRAW_EMIT_PAD) {
-	 if (DBG)
+#if (DRAW_VF_DBG)
 	    _mesa_printf("%d: pad %d, offset %d\n", i,  
 			 map[i].offset, offset);  
+#endif
 
 	 offset += map[i].offset;
 
@@ -186,10 +187,11 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 	 if(vf->attr[j].isconst)
 	    memcpy(vf->attr[j].data, &map[i].data, vf->attr[j].vertattrsize);
 	 
-	 if (DBG)
+#if (DRAW_VF_DBG)
 	    _mesa_printf("%d: %s, offset %d\n", i,  
 			 draw_vf_format_info[format].name,
 			 vf->attr[j].vertoffset);   
+#endif
 
 	 offset += draw_vf_format_info[format].attrsize;
 	 j++;
@@ -303,7 +305,7 @@ void draw_vf_destroy( struct draw_vertex_fetch *vf )
        * to unify them, but this probably won't change until this
        * module gets another overhaul.
        */
-      _mesa_exec_free((void *) fp->func);
+      //_mesa_exec_free((void *) fp->func);
       FREE(fp);
    }
    
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
index 0caa798396..7f5f56ef9c 100644
--- a/src/mesa/pipe/draw/draw_vf_generic.c
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -29,8 +29,6 @@
 
 #include <assert.h>
 
-#include "simple_list.h"
-
 #include "pipe/p_compiler.h"
 #include "pipe/p_util.h"
 
diff --git a/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c b/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c
index 349647fe6e..4cf4222db9 100644
--- a/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c
+++ b/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c
@@ -145,7 +145,7 @@ _fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list,
       /* Do the delayed destroy:
        */
       pb_reference(&fenced_buf->buffer, NULL);
-      free(fenced_buf);
+      FREE(fenced_buf);
    }
 }
 
@@ -162,7 +162,7 @@ fenced_buffer_destroy(struct pb_buffer *buf)
    }
    else {
       pb_reference(&fenced_buf->buffer, NULL);
-      free(fenced_buf);
+      FREE(fenced_buf);
    }
    
    if ((fenced_list->numDelayed % fenced_list->checkDelayed) == 0)
-- 
cgit v1.2.3


From cdb48e20d64b8dedcda2ee7f0636db223efef0fa Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Wed, 30 Jan 2008 15:24:56 +0900
Subject: gallium: Remove draw_vertex_fetch::lookup.

It is not being used, and would be dangerous to use given the possibility of constants.
---
 src/mesa/pipe/draw/draw_vf.c | 5 -----
 src/mesa/pipe/draw/draw_vf.h | 2 --
 2 files changed, 7 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index 0debea1f12..64d9ed02a9 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -161,8 +161,6 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 
    assert(nr < PIPE_ATTRIB_MAX);
 
-   memset(vf->lookup, 0, sizeof(vf->lookup));
-
    for (j = 0, i = 0; i < nr; i++) {
       const unsigned format = map[i].format;
       if (format == DRAW_EMIT_PAD) {
@@ -175,9 +173,6 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 
       }
       else {
-	 assert(vf->lookup[map[i].attrib] == 0 || format == DRAW_EMIT_1F_CONST);
-	 vf->lookup[map[i].attrib] = &vf->attr[j];
-
 	 vf->attr[j].attrib = map[i].attrib;
 	 vf->attr[j].format = format;
 	 vf->attr[j].insert = draw_vf_format_info[format].insert;
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
index 911ea07bdf..09cf4d3a6a 100644
--- a/src/mesa/pipe/draw/draw_vf.h
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -159,8 +159,6 @@ struct draw_vertex_fetch
    unsigned attr_count;
    unsigned vertex_stride;
 
-   struct draw_vf_attr *lookup[PIPE_ATTRIB_MAX];
-   
    draw_vf_emit_func emit;
 
    /* Parameters and constants for codegen:
-- 
cgit v1.2.3


From ee41d7afc922083de46cbdc491ee6052f3c4d45b Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Wed, 30 Jan 2008 16:46:41 +0900
Subject: gallium: Teach draw_vf about draw vertices.

This reduces the emit overhead, which is significant since we're
emitting one vertex at a time.
---
 src/mesa/pipe/draw/draw_vbuf.c | 147 ++---------------------------------
 src/mesa/pipe/draw/draw_vf.c   | 171 ++++++++++++++++++++++++++++++++++-------
 src/mesa/pipe/draw/draw_vf.h   |  17 ++--
 3 files changed, 161 insertions(+), 174 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c
index 92a8b9fbcf..ac03001d8f 100644
--- a/src/mesa/pipe/draw/draw_vbuf.c
+++ b/src/mesa/pipe/draw/draw_vbuf.c
@@ -35,7 +35,6 @@
 
 
 #include <assert.h>
-#include <stddef.h>
 
 #include "pipe/p_util.h"
 
@@ -268,8 +267,7 @@ emit_vertex( struct vbuf_stage *vbuf,
 #if 0
       {
 	 static float data[256]; 
-	 draw_vf_set_data(vbuf->vf, vertex->data);
-	 draw_vf_emit_vertices(vbuf->vf, 1, data);
+	 draw_vf_emit_vertex(vbuf->vf, vertex, data);
 	 if(memcmp((uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size, data, vbuf->vertex_size)) {
             fprintf(stderr, "With VF:\n");
             dump_emitted_vertex(vbuf->vinfo, (uint8_t *)data);
@@ -281,149 +279,13 @@ emit_vertex( struct vbuf_stage *vbuf,
 #endif
    }
    else {
-      draw_vf_set_data(vbuf->vf, vertex->data);
-      draw_vf_emit_vertices(vbuf->vf, 1, vbuf->vertex_ptr);
+      draw_vf_emit_vertex(vbuf->vf, vertex, vbuf->vertex_ptr);
    
       vbuf->vertex_ptr += vbuf->vertex_size/4;
    }
 }
 
 
-static void
-vbuf_set_vf_attributes(struct vbuf_stage *vbuf ) 
-{
-   const struct vertex_info *vinfo = vbuf->vinfo;
-   struct draw_vf_attr_map attrs[PIPE_MAX_SHADER_INPUTS];
-   uint i;
-   uint count = 0;  /* for debug/sanity */
-   unsigned nr_attrs = 0;
-   
-   if(!vbuf->vf)
-      return;
-   
-//   fprintf(stderr, "emit vertex %d to %p\n", 
-//           vbuf->nr_vertices, vbuf->vertex_ptr);
-
-#if 0
-   if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
-      if(vertex->vertex_id < vbuf->nr_vertices)
-	 return;
-      else
-	 fprintf(stderr, "Bad vertex id 0x%04x (>= 0x%04x)\n", 
-	         vertex->vertex_id, vbuf->nr_vertices);
-      return;
-   }
-#endif
-   
-   for (i = 0; i < vinfo->num_attribs; i++) {
-      uint j = vinfo->src_index[i];
-      switch (vinfo->emit[i]) {
-      case EMIT_OMIT:
-         /* no-op */
-         break;
-      case EMIT_ALL: {
-         /* just copy the whole vertex as-is to the vbuf */
-	 unsigned k, s = vinfo->size;
-         assert(i == 0);
-         assert(j == 0);
-         /* copy the vertex header */
-         /* XXX: we actually don't copy the header, just pad it */
-	 attrs[nr_attrs].attrib = 0;
-	 attrs[nr_attrs].format = DRAW_EMIT_PAD;
-	 attrs[nr_attrs].offset = offsetof(struct vertex_header, data);
-	 s -= offsetof(struct vertex_header, data)/4;
-         count += offsetof(struct vertex_header, data)/4;
-	 nr_attrs++;
-	 /* copy the vertex data */
-         for(k = 0; k < (s & ~0x3); k += 4) {
-      	    attrs[nr_attrs].attrib = k/4;
-      	    attrs[nr_attrs].format = DRAW_EMIT_4F;
-      	    attrs[nr_attrs].offset = 0;
-      	    nr_attrs++;
-            count += 4;
-         }
-         /* tail */
-         /* XXX: actually, this shouldn't be needed */
- 	 attrs[nr_attrs].attrib = k/4;
-  	 attrs[nr_attrs].offset = 0;
-         switch(s & 0x3) {
-         case 0:
-            break;
-         case 1:
-      	    attrs[nr_attrs].format = DRAW_EMIT_1F;
-      	    nr_attrs++;
-            count += 1;
-            break;
-         case 2:
-      	    attrs[nr_attrs].format = DRAW_EMIT_2F;
-      	    nr_attrs++;
-            count += 2;
-            break;
-         case 3:
-      	    attrs[nr_attrs].format = DRAW_EMIT_3F;
-      	    nr_attrs++;
-            count += 3;
-            break;
-         }
-         break;
-      }
-      case EMIT_1F:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_1F;
-	 attrs[nr_attrs].offset = 0;
-	 nr_attrs++;
-         count++;
-         break;
-      case EMIT_1F_PSIZE:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_1F_CONST;
-	 attrs[nr_attrs].offset = 0;
-	 attrs[nr_attrs].data.f[0] = vbuf->stage.draw->rasterizer->point_size;
-	 nr_attrs++;
-         count++;
-         break;
-      case EMIT_2F:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_2F;
-	 attrs[nr_attrs].offset = 0;
-	 nr_attrs++;
-         count += 2;
-         break;
-      case EMIT_3F:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_3F;
-	 attrs[nr_attrs].offset = 0;
-	 nr_attrs++;
-         count += 3;
-         break;
-      case EMIT_4F:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_4F;
-	 attrs[nr_attrs].offset = 0;
-	 nr_attrs++;
-         count += 4;
-         break;
-      case EMIT_4UB:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_4UB_4F_BGRA;
-	 attrs[nr_attrs].offset = 0;
-	 nr_attrs++;
-         count += 1;
-         break;
-      default:
-         assert(0);
-      }
-   }
-   
-   assert(count == vinfo->size);  
-   
-   draw_vf_set_vertex_attributes(vbuf->vf, 
-                                 attrs, 
-                                 nr_attrs, 
-                                 vbuf->vertex_size);
-}
-
-
 static void 
 vbuf_tri( struct draw_stage *stage,
           struct prim_header *prim )
@@ -498,7 +360,10 @@ vbuf_set_prim( struct vbuf_stage *vbuf, uint newprim )
 
    vbuf->vinfo = vinfo;
    vbuf->vertex_size = vertex_size;
-   vbuf_set_vf_attributes(vbuf);
+   if(vbuf->vf)
+      draw_vf_set_vertex_info(vbuf->vf, 
+                              vbuf->vinfo,
+                              vbuf->stage.draw->rasterizer->point_size);
    
    if (!vbuf->vertices)
       vbuf_alloc_vertices(vbuf);
diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index 64d9ed02a9..0da8e59ad6 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -26,6 +26,8 @@
  */
 
 
+#include <stddef.h>
+
 #include "pipe/p_compiler.h"
 #include "pipe/p_util.h"
 
@@ -151,10 +153,11 @@ static void choose_emit_func( struct draw_vertex_fetch *vf,
 
 
-unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf, 
-				 const struct draw_vf_attr_map *map,
-				 unsigned nr, 
-				 unsigned vertex_stride )
+static unsigned 
+draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf, 
+                               const struct draw_vf_attr_map *map,
+                               unsigned nr, 
+                               unsigned vertex_stride )
 {
    unsigned offset = 0;
    unsigned i, j;
@@ -202,6 +205,133 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 }
 
 
+void draw_vf_set_vertex_info( struct draw_vertex_fetch *vf, 
+                              const struct vertex_info *vinfo,
+                              float point_size )
+{
+   unsigned i, j, k;
+   struct draw_vf_attr *a = vf->attr;
+   struct draw_vf_attr_map attrs[PIPE_MAX_SHADER_INPUTS];
+   unsigned count = 0;  /* for debug/sanity */
+   unsigned nr_attrs = 0;
+   
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      j = vinfo->src_index[i];
+      switch (vinfo->emit[i]) {
+      case EMIT_OMIT:
+         /* no-op */
+         break;
+      case EMIT_ALL: {
+         /* just copy the whole vertex as-is to the vbuf */
+	 unsigned s = vinfo->size;
+         assert(i == 0);
+         assert(j == 0);
+         /* copy the vertex header */
+         /* XXX: we actually don't copy the header, just pad it */
+	 attrs[nr_attrs].attrib = 0;
+	 attrs[nr_attrs].format = DRAW_EMIT_PAD;
+	 attrs[nr_attrs].offset = offsetof(struct vertex_header, data);
+	 s -= offsetof(struct vertex_header, data)/4;
+         count += offsetof(struct vertex_header, data)/4;
+	 nr_attrs++;
+	 /* copy the vertex data */
+         for(k = 0; k < (s & ~0x3); k += 4) {
+      	    attrs[nr_attrs].attrib = k/4;
+      	    attrs[nr_attrs].format = DRAW_EMIT_4F;
+      	    attrs[nr_attrs].offset = 0;
+      	    nr_attrs++;
+            count += 4;
+         }
+         /* tail */
+         /* XXX: actually, this shouldn't be needed */
+ 	 attrs[nr_attrs].attrib = k/4;
+  	 attrs[nr_attrs].offset = 0;
+         switch(s & 0x3) {
+         case 0:
+            break;
+         case 1:
+      	    attrs[nr_attrs].format = DRAW_EMIT_1F;
+      	    nr_attrs++;
+            count += 1;
+            break;
+         case 2:
+      	    attrs[nr_attrs].format = DRAW_EMIT_2F;
+      	    nr_attrs++;
+            count += 2;
+            break;
+         case 3:
+      	    attrs[nr_attrs].format = DRAW_EMIT_3F;
+      	    nr_attrs++;
+            count += 3;
+            break;
+         }
+         break;
+      }
+      case EMIT_1F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_1F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count++;
+         break;
+      case EMIT_1F_PSIZE:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_1F_CONST;
+	 attrs[nr_attrs].offset = 0;
+	 attrs[nr_attrs].data.f[0] = point_size;
+	 nr_attrs++;
+         count++;
+         break;
+      case EMIT_2F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_2F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 2;
+         break;
+      case EMIT_3F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_3F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 3;
+         break;
+      case EMIT_4F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_4F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 4;
+         break;
+      case EMIT_4UB:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_4UB_4F_BGRA;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 1;
+         break;
+      default:
+         assert(0);
+      }
+   }
+   
+   assert(count == vinfo->size);  
+   
+   draw_vf_set_vertex_attributes(vf, 
+                                 attrs, 
+                                 nr_attrs, 
+                                 vinfo->size * sizeof(float) );
+
+   for (j = 0; j < vf->attr_count; j++) {
+      a[j].inputsize = 4;
+      a[j].do_insert = a[j].insert[4 - 1];
+      if(a[j].isconst) {
+	 a[j].inputptr = a[j].data;
+	 a[j].inputstride = 0;
+      }
+   }
+}
+
 
 #if 0
 /* Set attribute pointers, adjusted for start position:
@@ -229,38 +359,27 @@ void draw_vf_set_sources( struct draw_vertex_fetch *vf,
 #endif
 
 
-/* Set attribute pointers, adjusted for start position:
+/**
+ * Emit a vertex to dest.  
  */
-void draw_vf_set_data( struct draw_vertex_fetch *vf,
-                       float data[][4])
+void draw_vf_emit_vertex( struct draw_vertex_fetch *vf,
+                          struct vertex_header *vertex,
+                          void *dest )
 {
    struct draw_vf_attr *a = vf->attr;
    unsigned j;
    
    for (j = 0; j < vf->attr_count; j++) {
-      a[j].inputstride = 0; /* XXX: one-vertex-max ATM */ 
-      a[j].inputsize = 4;
-      a[j].do_insert = a[j].insert[4 - 1];
-      if(a[j].isconst)
-	 a[j].inputptr = a[j].data;
-      else
-	 a[j].inputptr = (uint8_t *)&data[a[j].attrib][0];
+      if(!a[j].isconst) {
+	 a[j].inputptr = (uint8_t *)&vertex->data[a[j].attrib][0];
+	 a[j].inputstride = 0; /* XXX: one-vertex-max ATM */
+      }
    }
+   
+   vf->emit( vf, 1, (uint8_t*) dest );
 }
 
 
-/* Emit count VB vertices to dest.  
- */
-void draw_vf_emit_vertices( struct draw_vertex_fetch *vf,
-		       unsigned count,
-		       void *dest )
-{
-   vf->emit( vf, count, (uint8_t*) dest );	
-}
-
-
-
-
 
 struct draw_vertex_fetch *draw_vf_create( void )
 {
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
index 09cf4d3a6a..e694b98675 100644
--- a/src/mesa/pipe/draw/draw_vf.h
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -33,6 +33,7 @@
 #include "pipe/p_state.h"
 
 #include "draw_vertex.h"
+#include "draw_private.h" // for vertex_header
 
 
 enum draw_vf_attr_format {
@@ -78,11 +79,17 @@ struct draw_vertex_fetch;
 
 
+#if 0
 unsigned 
 draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
                                const struct draw_vf_attr_map *map,
                                unsigned nr, 
                                unsigned vertex_stride );
+#endif
+
+void draw_vf_set_vertex_info( struct draw_vertex_fetch *vf, 
+                              const struct vertex_info *vinfo,
+                              float point_size );
 
 #if 0
 void 
@@ -92,13 +99,9 @@ draw_vf_set_sources( struct draw_vertex_fetch *vf,
 #endif
 
 void 
-draw_vf_set_data( struct draw_vertex_fetch *vf,
-                  float data[][4]);
-
-void 
-draw_vf_emit_vertices( struct draw_vertex_fetch *vf,
-		       unsigned count,
-		       void *dest );
+draw_vf_emit_vertex( struct draw_vertex_fetch *vf,
+                     struct vertex_header *vertex,
+                     void *dest );
 
 struct draw_vertex_fetch *
 draw_vf_create( void );
-- 
cgit v1.2.3


From 92d71f073006e05ef68e02dff92ae7ff40cfc470 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 11:49:26 -0700
Subject: Cell: move CELL_MAX_SPUS

---
 src/mesa/pipe/cell/common.h           | 2 ++
 src/mesa/pipe/cell/ppu/cell_context.h | 3 ---
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 90aa46a534..d5e86863d4 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -61,6 +61,8 @@
 #define ROUNDUP16(k)  (((k) + 0xf) & ~0xf)
 
 
+#define CELL_MAX_SPUS 6
+
 #define TILE_SIZE 32
 
 
diff --git a/src/mesa/pipe/cell/ppu/cell_context.h b/src/mesa/pipe/cell/ppu/cell_context.h
index 7d234f3e45..65b89518ad 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.h
+++ b/src/mesa/pipe/cell/ppu/cell_context.h
@@ -38,9 +38,6 @@
 #include "pipe/cell/common.h"
 
 
-#define CELL_MAX_SPUS 6
-
-
 struct cell_vbuf_render;
 
 struct cell_vertex_shader_state
-- 
cgit v1.2.3


From ae6949659693385be2ccd4290338b58038ed8125 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 11:49:51 -0700
Subject: Cell: make wait_on_mask() static/inlined

---
 src/mesa/pipe/cell/spu/spu_main.c | 19 -------------------
 src/mesa/pipe/cell/spu/spu_main.h | 23 +++++++++++++++++++++--
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 6e02f2c964..6886f283be 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -31,7 +31,6 @@
 
 #include <stdio.h>
 #include <libmisc.h>
-#include <spu_mfcio.h>
 
 #include "spu_main.h"
 #include "spu_render.h"
@@ -52,24 +51,6 @@ boolean Debug = FALSE;
 struct spu_global spu;
 
 
-void
-wait_on_mask(unsigned tagMask)
-{
-   mfc_write_tag_mask( tagMask );
-   /* wait for completion of _any_ DMAs specified by tagMask */
-   mfc_read_tag_status_any();
-}
-
-
-static INLINE void
-wait_on_mask_all(unsigned tagMask)
-{
-   mfc_write_tag_mask( tagMask );
-   /* wait for completion of _any_ DMAs specified by tagMask */
-   mfc_read_tag_status_all();
-}
-
-
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 009e046ba5..8908bf8bc0 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -29,6 +29,8 @@
 #define SPU_MAIN_H
 
 
+#include <spu_mfcio.h>
+
 #include "pipe/cell/common.h"
 #include "pipe/draw/draw_vertex.h"
 #include "pipe/p_state.h"
@@ -90,8 +92,25 @@ extern boolean Debug;
 
 
-extern void
-wait_on_mask(unsigned tag);
+static INLINE void
+wait_on_mask(unsigned tagMask)
+{
+   mfc_write_tag_mask( tagMask );
+   /* wait for completion of _any_ DMAs specified by tagMask */
+   mfc_read_tag_status_any();
+}
+
+
+static INLINE void
+wait_on_mask_all(unsigned tagMask)
+{
+   mfc_write_tag_mask( tagMask );
+   /* wait for completion of _any_ DMAs specified by tagMask */
+   mfc_read_tag_status_all();
+}
+
+
+
 
 
 static INLINE void
-- 
cgit v1.2.3


From 69099004e62b8710bc0b360fd2938439b34c0079 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 11:56:14 -0700
Subject: Cell: check tile status before wait_on_mask()

---
 src/mesa/pipe/cell/spu/spu_tri.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 7c6a54134f..01a47a4851 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -278,7 +278,7 @@ do_depth_test(struct setup_stage *setup, int x, int y, unsigned mask)
       /* now, _really_ clear the tile */
       clear_z_tile(&ztile);
    }
-   else {
+   else if (tile_status_z[setup->ty][setup->tx] != TILE_STATUS_DIRTY) {
       /* make sure we've got the tile from main mem */
       wait_on_mask(1 << TAG_READ_TILE_Z);
    }
@@ -403,7 +403,7 @@ emit_quad( struct setup_stage *setup, int x, int y, unsigned mask )
          /* now, _really_ clear the tile */
          clear_c_tile(&ctile);
       }
-      else {
+      else if (tile_status[setup->ty][setup->tx] != TILE_STATUS_DIRTY) {
          /* make sure we've got the tile from main mem */
          wait_on_mask(1 << TAG_READ_TILE_COLOR);
       }
-- 
cgit v1.2.3


From ab5e8b33cb615b9267ec4d08173c3c83cfd6df3f Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 11:56:41 -0700
Subject: Cell: minor code refactoring, movement

---
 src/mesa/pipe/cell/spu/spu_render.c | 85 ++++++++++++++++++++++++-------------
 1 file changed, 55 insertions(+), 30 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
index 21a286a23d..f506095116 100644
--- a/src/mesa/pipe/cell/spu/spu_render.c
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -88,6 +88,55 @@ my_tile(uint tx, uint ty)
 }
 
 
+/**
+ * Start fetching non-clear color/Z tiles from main memory
+ */
+static INLINE void
+get_cz_tiles(uint tx, uint ty)
+{
+   if (spu.depth_stencil.depth.enabled) {
+      if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) {
+         get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
+      }
+   }
+
+   if (tile_status[ty][tx] != TILE_STATUS_CLEAR) {
+      get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
+   }
+}
+
+
+/**
+ * Start putting dirty color/Z tiles back to main memory
+ */
+static INLINE void
+put_cz_tiles(uint tx, uint ty)
+{
+   if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) {
+      put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
+      tile_status_z[ty][tx] = TILE_STATUS_DEFINED;
+   }
+
+   if (tile_status[ty][tx] == TILE_STATUS_DIRTY) {
+      put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
+      tile_status[ty][tx] = TILE_STATUS_DEFINED;
+   }
+}
+
+
+/**
+ * Wait for 'put' of color/z tiles to complete.
+ */
+static INLINE void
+wait_put_cz_tiles(void)
+{
+   wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
+   if (spu.depth_stencil.depth.enabled) {
+      wait_on_mask(1 << TAG_WRITE_TILE_Z);
+   }
+}
+
+
 /**
  * Render primitives
  * \param pos_incr  returns value indicating how may words to skip after
@@ -122,6 +171,9 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 
    ASSERT(sizeof(*render) % 4 == 0);
    ASSERT(total_vertex_bytes % 16 == 0);
+   ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
+   ASSERT(render->num_indexes % 3 == 0);
+
 
    /* indexes are right after the render command in the batch buffer */
    indexes = (const ushort *) (render + 1);
@@ -186,21 +238,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       if (!my_tile(tx, ty))
          continue;
 
-      /* Start fetching color/z tiles.  We'll wait for completion when
-       * we need read/write to them later in triangle rasterization.
-       */
-      if (spu.depth_stencil.depth.enabled) {
-         if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) {
-            get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
-         }
-      }
-
-      if (tile_status[ty][tx] != TILE_STATUS_CLEAR) {
-         get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
-      }
-
-      ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
-      ASSERT(render->num_indexes % 3 == 0);
+      get_cz_tiles(tx, ty);
 
       /* loop over tris */
       for (j = 0; j < render->num_indexes; j += 3) {
@@ -214,22 +252,9 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       }
 
       /* write color/z tiles back to main framebuffer, if dirtied */
-      if (tile_status[ty][tx] == TILE_STATUS_DIRTY) {
-         put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
-         tile_status[ty][tx] = TILE_STATUS_DEFINED;
-      }
-      if (spu.depth_stencil.depth.enabled) {
-         if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) {
-            put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
-            tile_status_z[ty][tx] = TILE_STATUS_DEFINED;
-         }
-      }
+      put_cz_tiles(tx, ty);
 
-      /* XXX move these... */
-      wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
-      if (spu.depth_stencil.depth.enabled) {
-         wait_on_mask(1 << TAG_WRITE_TILE_Z);
-      }
+      wait_put_cz_tiles(); /* XXX seems unnecessary... */
    }
 
    if (Debug)
-- 
cgit v1.2.3


From ecb0013e2f4157caeb1e60c01ba06d6c8957e609 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 12:08:23 -0700
Subject: Cell: make 'setup' a regular var instead of passing around a pointer
 everywhere

We'll never have more than one of these objects.
Avoiding pointer deref improves performance a bit.
---
 src/mesa/pipe/cell/spu/spu_tri.c | 419 +++++++++++++++++++--------------------
 1 file changed, 209 insertions(+), 210 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 01a47a4851..5bb2cb12e3 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -135,6 +135,12 @@ struct setup_stage {
 };
 
 
+
+static struct setup_stage setup;
+
+
+
+
 #if 0
 /**
  * Basically a cast wrapper.
@@ -147,33 +153,33 @@ static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
 
 #if 0
 /**
- * Clip setup->quad against the scissor/surface bounds.
+ * Clip setup.quad against the scissor/surface bounds.
  */
 static INLINE void
 quad_clip(struct setup_stage *setup)
 {
-   const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect;
+   const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect;
    const int minx = (int) cliprect->minx;
    const int maxx = (int) cliprect->maxx;
    const int miny = (int) cliprect->miny;
    const int maxy = (int) cliprect->maxy;
 
-   if (setup->quad.x0 >= maxx ||
-       setup->quad.y0 >= maxy ||
-       setup->quad.x0 + 1 < minx ||
-       setup->quad.y0 + 1 < miny) {
+   if (setup.quad.x0 >= maxx ||
+       setup.quad.y0 >= maxy ||
+       setup.quad.x0 + 1 < minx ||
+       setup.quad.y0 + 1 < miny) {
       /* totally clipped */
-      setup->quad.mask = 0x0;
+      setup.quad.mask = 0x0;
       return;
    }
-   if (setup->quad.x0 < minx)
-      setup->quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
-   if (setup->quad.y0 < miny)
-      setup->quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
-   if (setup->quad.x0 == maxx - 1)
-      setup->quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
-   if (setup->quad.y0 == maxy - 1)
-      setup->quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
+   if (setup.quad.x0 < minx)
+      setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
+   if (setup.quad.y0 < miny)
+      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
+   if (setup.quad.x0 == maxx - 1)
+      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
+   if (setup.quad.y0 == maxy - 1)
+      setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
 }
 #endif
 
@@ -185,9 +191,9 @@ static INLINE void
 clip_emit_quad(struct setup_stage *setup)
 {
    quad_clip(setup);
-   if (setup->quad.mask) {
-      struct softpipe_context *sp = setup->softpipe;
-      sp->quad.first->run(sp->quad.first, &setup->quad);
+   if (setup.quad.mask) {
+      struct softpipe_context *sp = setup.softpipe;
+      sp->quad.first->run(sp->quad.first, &setup.quad);
    }
 }
 #endif
@@ -198,8 +204,7 @@ clip_emit_quad(struct setup_stage *setup)
  * Eg: four colors will be compute.
  */
 static INLINE void
-eval_coeff( struct setup_stage *setup, uint slot,
-            float x, float y, float result[4][4])
+eval_coeff(uint slot, float x, float y, float result[4][4])
 {
    switch (spu.vertex_info.interp_mode[slot]) {
    case INTERP_CONSTANT:
@@ -209,7 +214,7 @@ eval_coeff( struct setup_stage *setup, uint slot,
             result[QUAD_TOP_LEFT][i] =
             result[QUAD_TOP_RIGHT][i] =
             result[QUAD_BOTTOM_LEFT][i] =
-            result[QUAD_BOTTOM_RIGHT][i] = setup->coef[slot].a0[i];
+            result[QUAD_BOTTOM_RIGHT][i] = setup.coef[slot].a0[i];
          }
       }
       break;
@@ -219,12 +224,12 @@ eval_coeff( struct setup_stage *setup, uint slot,
    default:
       {
          uint i;
-         const float *dadx = setup->coef[slot].dadx;
-         const float *dady = setup->coef[slot].dady;
+         const float *dadx = setup.coef[slot].dadx;
+         const float *dady = setup.coef[slot].dady;
 
          /* loop over XYZW comps */
          for (i = 0; i < 4; i++) {
-            result[QUAD_TOP_LEFT][i] = setup->coef[slot].a0[i] + x * dadx[i] + y * dady[i];
+            result[QUAD_TOP_LEFT][i] = setup.coef[slot].a0[i] + x * dadx[i] + y * dady[i];
             result[QUAD_TOP_RIGHT][i] = result[0][i] + dadx[i];
             result[QUAD_BOTTOM_LEFT][i] = result[0][i] + dady[i];
             result[QUAD_BOTTOM_RIGHT][i] = result[0][i] + dadx[i] + dady[i];
@@ -235,15 +240,14 @@ eval_coeff( struct setup_stage *setup, uint slot,
 
 
 static INLINE void
-eval_z( struct setup_stage *setup,
-        float x, float y, float result[4])
+eval_z(float x, float y, float result[4])
 {
    const uint slot = 0;
    const uint i = 2;
-   const float *dadx = setup->coef[slot].dadx;
-   const float *dady = setup->coef[slot].dady;
+   const float *dadx = setup.coef[slot].dadx;
+   const float *dady = setup.coef[slot].dady;
 
-   result[QUAD_TOP_LEFT] = setup->coef[slot].a0[i] + x * dadx[i] + y * dady[i];
+   result[QUAD_TOP_LEFT] = setup.coef[slot].a0[i] + x * dadx[i] + y * dady[i];
    result[QUAD_TOP_RIGHT] = result[0] + dadx[i];
    result[QUAD_BOTTOM_LEFT] = result[0] + dady[i];
    result[QUAD_BOTTOM_RIGHT] = result[0] + dadx[i] + dady[i];
@@ -266,23 +270,23 @@ pack_color(const float color[4])
 
 
 static uint
-do_depth_test(struct setup_stage *setup, int x, int y, unsigned mask)
+do_depth_test(int x, int y, unsigned mask)
 {
-   int ix = x - setup->cliprect_minx;
-   int iy = y - setup->cliprect_miny;
+   int ix = x - setup.cliprect_minx;
+   int iy = y - setup.cliprect_miny;
    float zvals[4];
 
-   eval_z(setup, (float) x, (float) y, zvals);
+   eval_z((float) x, (float) y, zvals);
 
-   if (tile_status_z[setup->ty][setup->tx] == TILE_STATUS_CLEAR) {
+   if (tile_status_z[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
       /* now, _really_ clear the tile */
       clear_z_tile(&ztile);
    }
-   else if (tile_status_z[setup->ty][setup->tx] != TILE_STATUS_DIRTY) {
+   else if (tile_status_z[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
       /* make sure we've got the tile from main mem */
       wait_on_mask(1 << TAG_READ_TILE_Z);
    }
-   tile_status_z[setup->ty][setup->tx] = TILE_STATUS_DIRTY;
+   tile_status_z[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
 
 
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
@@ -363,31 +367,31 @@ do_depth_test(struct setup_stage *setup, int x, int y, unsigned mask)
  * Emit a quad (pass to next stage).  No clipping is done.
  */
 static INLINE void
-emit_quad( struct setup_stage *setup, int x, int y, unsigned mask )
+emit_quad( int x, int y, unsigned mask )
 {
 #if 0
-   struct softpipe_context *sp = setup->softpipe;
-   setup->quad.x0 = x;
-   setup->quad.y0 = y;
-   setup->quad.mask = mask;
-   sp->quad.first->run(sp->quad.first, &setup->quad);
+   struct softpipe_context *sp = setup.softpipe;
+   setup.quad.x0 = x;
+   setup.quad.y0 = y;
+   setup.quad.mask = mask;
+   sp->quad.first->run(sp->quad.first, &setup.quad);
 #else
    /* Cell: "write" quad fragments to the tile by setting prim color */
-   const int ix = x - setup->cliprect_minx;
-   const int iy = y - setup->cliprect_miny;
+   const int ix = x - setup.cliprect_minx;
+   const int iy = y - setup.cliprect_miny;
    uint colors[4];  /* indexed by QUAD_x */
 
    if (spu.texture.start) {
       float texcoords[4][4];
       uint i;
-      eval_coeff(setup, 2, (float) x, (float) y, texcoords);
+      eval_coeff(2, (float) x, (float) y, texcoords);
       for (i = 0; i < 4; i++) {
          colors[i] = sample_texture(texcoords[i]);
       }
    }
    else {
       float fcolors[4][4];
-      eval_coeff(setup, 1, (float) x, (float) y, fcolors);
+      eval_coeff(1, (float) x, (float) y, fcolors);
       colors[QUAD_TOP_LEFT] = pack_color(fcolors[QUAD_TOP_LEFT]);
       colors[QUAD_TOP_RIGHT] = pack_color(fcolors[QUAD_TOP_RIGHT]);
       colors[QUAD_BOTTOM_LEFT] = pack_color(fcolors[QUAD_BOTTOM_LEFT]);
@@ -395,19 +399,19 @@ emit_quad( struct setup_stage *setup, int x, int y, unsigned mask )
    }
 
    if (spu.depth_stencil.depth.enabled) {
-      mask &= do_depth_test(setup, x, y, mask);
+      mask &= do_depth_test(x, y, mask);
    }
 
    if (mask) {
-      if (tile_status[setup->ty][setup->tx] == TILE_STATUS_CLEAR) {
+      if (tile_status[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
          clear_c_tile(&ctile);
       }
-      else if (tile_status[setup->ty][setup->tx] != TILE_STATUS_DIRTY) {
+      else if (tile_status[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
          /* make sure we've got the tile from main mem */
          wait_on_mask(1 << TAG_READ_TILE_COLOR);
       }
-      tile_status[setup->ty][setup->tx] = TILE_STATUS_DIRTY;
+      tile_status[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
 
       if (mask & MASK_TOP_LEFT)
          ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
@@ -439,20 +443,20 @@ static INLINE int block( int x )
  * this is pretty nasty...  may need to rework flush_spans again to
  * fix it, if possible.
  */
-static unsigned calculate_mask( struct setup_stage *setup, int x )
+static unsigned calculate_mask( int x )
 {
    unsigned mask = 0x0;
 
-   if (x >= setup->span.left[0] && x < setup->span.right[0]) 
+   if (x >= setup.span.left[0] && x < setup.span.right[0]) 
       mask |= MASK_TOP_LEFT;
 
-   if (x >= setup->span.left[1] && x < setup->span.right[1]) 
+   if (x >= setup.span.left[1] && x < setup.span.right[1]) 
       mask |= MASK_BOTTOM_LEFT;
       
-   if (x+1 >= setup->span.left[0] && x+1 < setup->span.right[0]) 
+   if (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) 
       mask |= MASK_TOP_RIGHT;
 
-   if (x+1 >= setup->span.left[1] && x+1 < setup->span.right[1]) 
+   if (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) 
       mask |= MASK_BOTTOM_RIGHT;
 
    return mask;
@@ -462,28 +466,28 @@ static unsigned calculate_mask( struct setup_stage *setup, int x )
 /**
  * Render a horizontal span of quads
  */
-static void flush_spans( struct setup_stage *setup )
+static void flush_spans( void )
 {
    int minleft, maxright;
    int x;
 
-   switch (setup->span.y_flags) {
+   switch (setup.span.y_flags) {
    case 0x3:
       /* both odd and even lines written (both quad rows) */
-      minleft = MIN2(setup->span.left[0], setup->span.left[1]);
-      maxright = MAX2(setup->span.right[0], setup->span.right[1]);
+      minleft = MIN2(setup.span.left[0], setup.span.left[1]);
+      maxright = MAX2(setup.span.right[0], setup.span.right[1]);
       break;
 
    case 0x1:
       /* only even line written (quad top row) */
-      minleft = setup->span.left[0];
-      maxright = setup->span.right[0];
+      minleft = setup.span.left[0];
+      maxright = setup.span.right[0];
       break;
 
    case 0x2:
       /* only odd line written (quad bottom row) */
-      minleft = setup->span.left[1];
-      maxright = setup->span.right[1];
+      minleft = setup.span.left[1];
+      maxright = setup.span.right[1];
       break;
 
    default:
@@ -494,31 +498,29 @@ static void flush_spans( struct setup_stage *setup )
     * calculate_mask() could be simplified a bit...
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
-      emit_quad( setup, x, setup->span.y, 
-                 calculate_mask( setup, x ) );
+      emit_quad( x, setup.span.y, 
+                 calculate_mask( x ) );
    }
 
-   setup->span.y = 0;
-   setup->span.y_flags = 0;
-   setup->span.right[0] = 0;
-   setup->span.right[1] = 0;
+   setup.span.y = 0;
+   setup.span.y_flags = 0;
+   setup.span.right[0] = 0;
+   setup.span.right[1] = 0;
 }
 
 #if DEBUG_VERTS
-static void print_vertex(const struct setup_stage *setup,
-                         const struct vertex_header *v)
+static void print_vertex(const struct vertex_header *v)
 {
    int i;
    fprintf(stderr, "Vertex: (%p)\n", v);
-   for (i = 0; i < setup->quad.nr_attrs; i++) {
+   for (i = 0; i < setup.quad.nr_attrs; i++) {
       fprintf(stderr, "  %d: %f %f %f %f\n",  i, 
               v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]);
    }
 }
 #endif
 
-static boolean setup_sort_vertices( struct setup_stage *setup,
-				      const struct prim_header *prim )
+static boolean setup_sort_vertices(const struct prim_header *prim )
 {
    const struct vertex_header *v0 = prim->v[0];
    const struct vertex_header *v1 = prim->v[1];
@@ -526,12 +528,12 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
 
 #if DEBUG_VERTS
    fprintf(stderr, "Triangle:\n");
-   print_vertex(setup, v0);
-   print_vertex(setup, v1);
-   print_vertex(setup, v2);
+   print_vertex(v0);
+   print_vertex(v1);
+   print_vertex(v2);
 #endif
 
-   setup->vprovoke = v2;
+   setup.vprovoke = v2;
 
    /* determine bottom to top order of vertices */
    {
@@ -541,65 +543,65 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
       if (y0 <= y1) {
 	 if (y1 <= y2) {
 	    /* y0<=y1<=y2 */
-	    setup->vmin = v0;   
-	    setup->vmid = v1;   
-	    setup->vmax = v2;
+	    setup.vmin = v0;   
+	    setup.vmid = v1;   
+	    setup.vmax = v2;
 	 }
 	 else if (y2 <= y0) {
 	    /* y2<=y0<=y1 */
-	    setup->vmin = v2;   
-	    setup->vmid = v0;   
-	    setup->vmax = v1;   
+	    setup.vmin = v2;   
+	    setup.vmid = v0;   
+	    setup.vmax = v1;   
 	 }
 	 else {
 	    /* y0<=y2<=y1 */
-	    setup->vmin = v0;   
-	    setup->vmid = v2;   
-	    setup->vmax = v1;  
+	    setup.vmin = v0;   
+	    setup.vmid = v2;   
+	    setup.vmax = v1;  
 	 }
       }
       else {
 	 if (y0 <= y2) {
 	    /* y1<=y0<=y2 */
-	    setup->vmin = v1;   
-	    setup->vmid = v0;   
-	    setup->vmax = v2;  
+	    setup.vmin = v1;   
+	    setup.vmid = v0;   
+	    setup.vmax = v2;  
 	 }
 	 else if (y2 <= y1) {
 	    /* y2<=y1<=y0 */
-	    setup->vmin = v2;   
-	    setup->vmid = v1;   
-	    setup->vmax = v0;  
+	    setup.vmin = v2;   
+	    setup.vmid = v1;   
+	    setup.vmax = v0;  
 	 }
 	 else {
 	    /* y1<=y2<=y0 */
-	    setup->vmin = v1;   
-	    setup->vmid = v2;   
-	    setup->vmax = v0;
+	    setup.vmin = v1;   
+	    setup.vmid = v2;   
+	    setup.vmax = v0;
 	 }
       }
    }
 
    /* Check if triangle is completely outside the tile bounds */
-   if (setup->vmin->data[0][1] > setup->cliprect_maxy)
+   if (setup.vmin->data[0][1] > setup.cliprect_maxy)
       return FALSE;
-   if (setup->vmax->data[0][1] < setup->cliprect_miny)
+   if (setup.vmax->data[0][1] < setup.cliprect_miny)
       return FALSE;
-   if (setup->vmin->data[0][0] < setup->cliprect_minx &&
-       setup->vmid->data[0][0] < setup->cliprect_minx &&
-       setup->vmax->data[0][0] < setup->cliprect_minx)
+   if (setup.vmin->data[0][0] < setup.cliprect_minx &&
+       setup.vmid->data[0][0] < setup.cliprect_minx &&
+       setup.vmax->data[0][0] < setup.cliprect_minx)
       return FALSE;
-   if (setup->vmin->data[0][0] > setup->cliprect_maxx &&
-       setup->vmid->data[0][0] > setup->cliprect_maxx &&
-       setup->vmax->data[0][0] > setup->cliprect_maxx)
+   if (setup.vmin->data[0][0] > setup.cliprect_maxx &&
+       setup.vmid->data[0][0] > setup.cliprect_maxx &&
+       setup.vmax->data[0][0] > setup.cliprect_maxx)
       return FALSE;
 
-   setup->ebot.dx = setup->vmid->data[0][0] - setup->vmin->data[0][0];
-   setup->ebot.dy = setup->vmid->data[0][1] - setup->vmin->data[0][1];
-   setup->emaj.dx = setup->vmax->data[0][0] - setup->vmin->data[0][0];
-   setup->emaj.dy = setup->vmax->data[0][1] - setup->vmin->data[0][1];
-   setup->etop.dx = setup->vmax->data[0][0] - setup->vmid->data[0][0];
-   setup->etop.dy = setup->vmax->data[0][1] - setup->vmid->data[0][1];
+   setup.ebot.dx = setup.vmid->data[0][0] - setup.vmin->data[0][0];
+   setup.ebot.dy = setup.vmid->data[0][1] - setup.vmin->data[0][1];
+   setup.emaj.dx = setup.vmax->data[0][0] - setup.vmin->data[0][0];
+   setup.emaj.dy = setup.vmax->data[0][1] - setup.vmin->data[0][1];
+   setup.etop.dx = setup.vmax->data[0][0] - setup.vmid->data[0][0];
+   setup.etop.dy = setup.vmax->data[0][1] - setup.vmid->data[0][1];
 
    /*
     * Compute triangle's area.  Use 1/area to compute partial
@@ -612,13 +614,13 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
     * use the prim->det value because its sign is correct.
     */
    {
-      const float area = (setup->emaj.dx * setup->ebot.dy - 
-			    setup->ebot.dx * setup->emaj.dy);
+      const float area = (setup.emaj.dx * setup.ebot.dy - 
+			    setup.ebot.dx * setup.emaj.dy);
 
-      setup->oneoverarea = 1.0f / area;
+      setup.oneoverarea = 1.0f / area;
       /*
       _mesa_printf("%s one-over-area %f  area %f  det %f\n",
-                   __FUNCTION__, setup->oneoverarea, area, prim->det );
+                   __FUNCTION__, setup.oneoverarea, area, prim->det );
       */
    }
 
@@ -627,7 +629,7 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
     *  - the GLSL gl_FrontFacing fragment attribute (bool)
     *  - two-sided stencil test
     */
-   setup->quad.facing = (prim->det > 0.0) ^ (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
+   setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
 #endif
 
    return TRUE;
@@ -637,22 +639,22 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
 /**
  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
  * The value value comes from vertex->data[slot][i].
- * The result will be put into setup->coef[slot].a0[i].
+ * The result will be put into setup.coef[slot].a0[i].
  * \param slot  which attribute slot 
  * \param i  which component of the slot (0..3)
  */
-static void const_coeff(struct setup_stage *setup, uint slot)
+static void const_coeff(uint slot)
 {
    uint i;
    ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 
    for (i = 0; i < 4; i++) {
-      setup->coef[slot].dadx[i] = 0;
-      setup->coef[slot].dady[i] = 0;
+      setup.coef[slot].dadx[i] = 0;
+      setup.coef[slot].dady[i] = 0;
 
       /* need provoking vertex info!
        */
-      setup->coef[slot].a0[i] = setup->vprovoke->data[slot][i];
+      setup.coef[slot].a0[i] = setup.vprovoke->data[slot][i];
    }
 }
 
@@ -661,20 +663,19 @@ static void const_coeff(struct setup_stage *setup, uint slot)
  * Compute a0, dadx and dady for a linearly interpolated coefficient,
  * for a triangle.
  */
-static void tri_linear_coeff( struct setup_stage *setup,
-                              uint slot, uint firstComp, uint lastComp )
+static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
 {
    uint i;
    for (i = firstComp; i < lastComp; i++) {
-      float botda = setup->vmid->data[slot][i] - setup->vmin->data[slot][i];
-      float majda = setup->vmax->data[slot][i] - setup->vmin->data[slot][i];
-      float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-      float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
+      float botda = setup.vmid->data[slot][i] - setup.vmin->data[slot][i];
+      float majda = setup.vmax->data[slot][i] - setup.vmin->data[slot][i];
+      float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
+      float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
    
       ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 
-      setup->coef[slot].dadx[i] = a * setup->oneoverarea;
-      setup->coef[slot].dady[i] = b * setup->oneoverarea;
+      setup.coef[slot].dadx[i] = a * setup.oneoverarea;
+      setup.coef[slot].dady[i] = b * setup.oneoverarea;
 
       /* calculate a0 as the value which would be sampled for the
        * fragment at (0,0), taking into account that we want to sample at
@@ -688,17 +689,17 @@ static void tri_linear_coeff( struct setup_stage *setup,
        * to define a0 as the sample at a pixel center somewhere near vmin
        * instead - i'll switch to this later.
        */
-      setup->coef[slot].a0[i] = (setup->vmin->data[slot][i] - 
-                                 (setup->coef[slot].dadx[i] * (setup->vmin->data[0][0] - 0.5f) + 
-                                  setup->coef[slot].dady[i] * (setup->vmin->data[0][1] - 0.5f)));
+      setup.coef[slot].a0[i] = (setup.vmin->data[slot][i] - 
+                                 (setup.coef[slot].dadx[i] * (setup.vmin->data[0][0] - 0.5f) + 
+                                  setup.coef[slot].dady[i] * (setup.vmin->data[0][1] - 0.5f)));
    }
 
    /*
    _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
 		slot, "xyzw"[i], 
-		setup->coef[slot].a0[i],
-		setup->coef[slot].dadx[i],
-		setup->coef[slot].dady[i]);
+		setup.coef[slot].a0[i],
+		setup.coef[slot].dadx[i],
+		setup.coef[slot].dady[i]);
    */
 }
 
@@ -712,46 +713,45 @@ static void tri_linear_coeff( struct setup_stage *setup,
  * Later, when we compute the value at a particular fragment position we'll
  * divide the interpolated value by the interpolated W at that fragment.
  */
-static void tri_persp_coeff( struct setup_stage *setup,
-                             unsigned slot,
+static void tri_persp_coeff( unsigned slot,
                              unsigned i )
 {
    /* premultiply by 1/w:
     */
-   float mina = setup->vmin->data[slot][i] * setup->vmin->data[0][3];
-   float mida = setup->vmid->data[slot][i] * setup->vmid->data[0][3];
-   float maxa = setup->vmax->data[slot][i] * setup->vmax->data[0][3];
+   float mina = setup.vmin->data[slot][i] * setup.vmin->data[0][3];
+   float mida = setup.vmid->data[slot][i] * setup.vmid->data[0][3];
+   float maxa = setup.vmax->data[slot][i] * setup.vmax->data[0][3];
 
    float botda = mida - mina;
    float majda = maxa - mina;
-   float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-   float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
+   float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
+   float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
       
    /*
    printf("tri persp %d,%d: %f %f %f\n", slot, i,
-          setup->vmin->data[slot][i],
-          setup->vmid->data[slot][i],
-          setup->vmax->data[slot][i]
+          setup.vmin->data[slot][i],
+          setup.vmid->data[slot][i],
+          setup.vmax->data[slot][i]
           );
    */
 
    assert(slot < PIPE_MAX_SHADER_INPUTS);
    assert(i <= 3);
 
-   setup->coef[slot].dadx[i] = a * setup->oneoverarea;
-   setup->coef[slot].dady[i] = b * setup->oneoverarea;
-   setup->coef[slot].a0[i] = (mina - 
-			    (setup->coef[slot].dadx[i] * (setup->vmin->data[0][0] - 0.5f) + 
-			     setup->coef[slot].dady[i] * (setup->vmin->data[0][1] - 0.5f)));
+   setup.coef[slot].dadx[i] = a * setup.oneoverarea;
+   setup.coef[slot].dady[i] = b * setup.oneoverarea;
+   setup.coef[slot].a0[i] = (mina - 
+			    (setup.coef[slot].dadx[i] * (setup.vmin->data[0][0] - 0.5f) + 
+			     setup.coef[slot].dady[i] * (setup.vmin->data[0][1] - 0.5f)));
 }
 #endif
 
 
 /**
- * Compute the setup->coef[] array dadx, dady, a0 values.
- * Must be called after setup->vmin,vmid,vmax,vprovoke are initialized.
+ * Compute the setup.coef[] array dadx, dady, a0 values.
+ * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
  */
-static void setup_tri_coefficients( struct setup_stage *setup )
+static void setup_tri_coefficients(void)
 {
 #if 1
    uint i;
@@ -761,17 +761,17 @@ static void setup_tri_coefficients( struct setup_stage *setup )
       case INTERP_NONE:
          break;
       case INTERP_POS:
-         tri_linear_coeff(setup, i, 2, 3);
+         tri_linear_coeff(i, 2, 3);
          /* XXX interp W if PERSPECTIVE... */
          break;
       case INTERP_CONSTANT:
-         const_coeff(setup, i);
+         const_coeff(i);
          break;
       case INTERP_LINEAR:
-         tri_linear_coeff(setup, i, 0, 4);
+         tri_linear_coeff(i, 0, 4);
          break;
       case INTERP_PERSPECTIVE:
-         tri_linear_coeff(setup, i, 0, 4); /* XXX temporary */
+         tri_linear_coeff(i, 0, 4); /* XXX temporary */
          break;
       default:
          ASSERT(0);
@@ -781,35 +781,35 @@ static void setup_tri_coefficients( struct setup_stage *setup )
    ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS);
    ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR ||
           spu.vertex_info.interp_mode[1] == INTERP_CONSTANT);
-   tri_linear_coeff(setup, 0, 2, 3);  /* slot 0, z */
-   tri_linear_coeff(setup, 1, 0, 4);  /* slot 1, color */
+   tri_linear_coeff(0, 2, 3);  /* slot 0, z */
+   tri_linear_coeff(1, 0, 4);  /* slot 1, color */
 #endif
 }
 
 
-static void setup_tri_edges( struct setup_stage *setup )
+static void setup_tri_edges(void)
 {
-   float vmin_x = setup->vmin->data[0][0] + 0.5f;
-   float vmid_x = setup->vmid->data[0][0] + 0.5f;
-
-   float vmin_y = setup->vmin->data[0][1] - 0.5f;
-   float vmid_y = setup->vmid->data[0][1] - 0.5f;
-   float vmax_y = setup->vmax->data[0][1] - 0.5f;
-
-   setup->emaj.sy = CEILF(vmin_y);
-   setup->emaj.lines = (int) CEILF(vmax_y - setup->emaj.sy);
-   setup->emaj.dxdy = setup->emaj.dx / setup->emaj.dy;
-   setup->emaj.sx = vmin_x + (setup->emaj.sy - vmin_y) * setup->emaj.dxdy;
-
-   setup->etop.sy = CEILF(vmid_y);
-   setup->etop.lines = (int) CEILF(vmax_y - setup->etop.sy);
-   setup->etop.dxdy = setup->etop.dx / setup->etop.dy;
-   setup->etop.sx = vmid_x + (setup->etop.sy - vmid_y) * setup->etop.dxdy;
-
-   setup->ebot.sy = CEILF(vmin_y);
-   setup->ebot.lines = (int) CEILF(vmid_y - setup->ebot.sy);
-   setup->ebot.dxdy = setup->ebot.dx / setup->ebot.dy;
-   setup->ebot.sx = vmin_x + (setup->ebot.sy - vmin_y) * setup->ebot.dxdy;
+   float vmin_x = setup.vmin->data[0][0] + 0.5f;
+   float vmid_x = setup.vmid->data[0][0] + 0.5f;
+
+   float vmin_y = setup.vmin->data[0][1] - 0.5f;
+   float vmid_y = setup.vmid->data[0][1] - 0.5f;
+   float vmax_y = setup.vmax->data[0][1] - 0.5f;
+
+   setup.emaj.sy = CEILF(vmin_y);
+   setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
+   setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy;
+   setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy;
+
+   setup.etop.sy = CEILF(vmid_y);
+   setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy);
+   setup.etop.dxdy = setup.etop.dx / setup.etop.dy;
+   setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy;
+
+   setup.ebot.sy = CEILF(vmin_y);
+   setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy);
+   setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy;
+   setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy;
 }
 
 
@@ -817,15 +817,14 @@ static void setup_tri_edges( struct setup_stage *setup )
  * Render the upper or lower half of a triangle.
  * Scissoring/cliprect is applied here too.
  */
-static void subtriangle( struct setup_stage *setup,
-			 struct edge *eleft,
+static void subtriangle( struct edge *eleft,
 			 struct edge *eright,
 			 unsigned lines )
 {
-   const int minx = setup->cliprect_minx;
-   const int maxx = setup->cliprect_maxx;
-   const int miny = setup->cliprect_miny;
-   const int maxy = setup->cliprect_maxy;
+   const int minx = setup.cliprect_minx;
+   const int maxx = setup.cliprect_maxx;
+   const int miny = setup.cliprect_miny;
+   const int maxy = setup.cliprect_maxy;
    int y, start_y, finish_y;
    int sy = (int)eleft->sy;
 
@@ -867,14 +866,14 @@ static void subtriangle( struct setup_stage *setup,
 
       if (left < right) {
          int _y = sy + y;
-         if (block(_y) != setup->span.y) {
-            flush_spans(setup);
-            setup->span.y = block(_y);
+         if (block(_y) != setup.span.y) {
+            flush_spans();
+            setup.span.y = block(_y);
          }
 
-         setup->span.left[_y&1] = left;
-         setup->span.right[_y&1] = right;
-         setup->span.y_flags |= 1<<(_y&1);
+         setup.span.left[_y&1] = left;
+         setup.span.right[_y&1] = right;
+         setup.span.y_flags |= 1<<(_y&1);
       }
    }
 
@@ -892,41 +891,41 @@ static void subtriangle( struct setup_stage *setup,
  * Do setup for triangle rasterization, then render the triangle.
  */
 static void
-setup_tri(struct setup_stage *setup, struct prim_header *prim)
+setup_tri(struct prim_header *prim)
 {
-   if (!setup_sort_vertices( setup, prim )) {
+   if (!setup_sort_vertices( prim )) {
       return; /* totally clipped */
    }
 
-   setup_tri_coefficients( setup );
-   setup_tri_edges( setup );
+   setup_tri_coefficients();
+   setup_tri_edges();
 
 #if 0
-   setup->quad.prim = PRIM_TRI;
+   setup.quad.prim = PRIM_TRI;
 #endif
 
-   setup->span.y = 0;
-   setup->span.y_flags = 0;
-   setup->span.right[0] = 0;
-   setup->span.right[1] = 0;
-   /*   setup->span.z_mode = tri_z_mode( setup->ctx ); */
+   setup.span.y = 0;
+   setup.span.y_flags = 0;
+   setup.span.right[0] = 0;
+   setup.span.right[1] = 0;
+   /*   setup.span.z_mode = tri_z_mode( setup.ctx ); */
 
    /*   init_constant_attribs( setup ); */
       
-   if (setup->oneoverarea < 0.0) {
+   if (setup.oneoverarea < 0.0) {
       /* emaj on left:
        */
-      subtriangle( setup, &setup->emaj, &setup->ebot, setup->ebot.lines );
-      subtriangle( setup, &setup->emaj, &setup->etop, setup->etop.lines );
+      subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
+      subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
    }
    else {
       /* emaj on right:
        */
-      subtriangle( setup, &setup->ebot, &setup->emaj, setup->ebot.lines );
-      subtriangle( setup, &setup->etop, &setup->emaj, setup->etop.lines );
+      subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
+      subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
    }
 
-   flush_spans( setup );
+   flush_spans();
 }
 
 
@@ -939,7 +938,7 @@ void
 tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 {
    struct prim_header tri;
-   struct setup_stage setup;
+   /*struct setup_stage setup;*/
 
    tri.v[0] = (struct vertex_header *) v0;
    tri.v[1] = (struct vertex_header *) v1;
@@ -954,5 +953,5 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 
-   setup_tri(&setup, &tri);
+   setup_tri(&tri);
 }
-- 
cgit v1.2.3


From 0b762d65433445282267c2e6d1dc9ba4eb64af7d Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 12:13:04 -0700
Subject: Cell: fold setup_tri() into tri_draw()

---
 src/mesa/pipe/cell/spu/spu_tri.c | 62 ++++++++++++----------------------------
 1 file changed, 19 insertions(+), 43 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 5bb2cb12e3..1c615a6e6a 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -47,9 +47,6 @@ struct vertex_header {
    float data[0][4];
 };
 
-struct prim_header {
-   struct vertex_header *v[3];
-};
 
 
 /* XXX fix this */
@@ -520,11 +517,10 @@ static void print_vertex(const struct vertex_header *v)
 }
 #endif
 
-static boolean setup_sort_vertices(const struct prim_header *prim )
+static boolean setup_sort_vertices(const struct vertex_header *v0,
+                                   const struct vertex_header *v1,
+                                   const struct vertex_header *v2)
 {
-   const struct vertex_header *v0 = prim->v[0];
-   const struct vertex_header *v1 = prim->v[1];
-   const struct vertex_header *v2 = prim->v[2];
 
 #if DEBUG_VERTS
    fprintf(stderr, "Triangle:\n");
@@ -888,22 +884,30 @@ static void subtriangle( struct edge *eleft,
 
 
 /**
- * Do setup for triangle rasterization, then render the triangle.
+ * Draw triangle into tile at (tx, ty) (tile coords)
+ * The tile data should have already been fetched.
  */
-static void
-setup_tri(struct prim_header *prim)
+void
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 {
-   if (!setup_sort_vertices( prim )) {
+   setup.tx = tx;
+   setup.ty = ty;
+
+   /* set clipping bounds to tile bounds */
+   setup.cliprect_minx = tx * TILE_SIZE;
+   setup.cliprect_miny = ty * TILE_SIZE;
+   setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
+   setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
+
+   if (!setup_sort_vertices((struct vertex_header *) v0,
+                            (struct vertex_header *) v1,
+                            (struct vertex_header *) v2)) {
       return; /* totally clipped */
    }
 
    setup_tri_coefficients();
    setup_tri_edges();
 
-#if 0
-   setup.quad.prim = PRIM_TRI;
-#endif
-
    setup.span.y = 0;
    setup.span.y_flags = 0;
    setup.span.right[0] = 0;
@@ -927,31 +931,3 @@ setup_tri(struct prim_header *prim)
 
    flush_spans();
 }
-
-
-
-/**
- * Draw triangle into tile at (tx, ty) (tile coords)
- * The tile data should have already been fetched.
- */
-void
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
-{
-   struct prim_header tri;
-   /*struct setup_stage setup;*/
-
-   tri.v[0] = (struct vertex_header *) v0;
-   tri.v[1] = (struct vertex_header *) v1;
-   tri.v[2] = (struct vertex_header *) v2;
-
-   setup.tx = tx;
-   setup.ty = ty;
-
-   /* set clipping bounds to tile bounds */
-   setup.cliprect_minx = tx * TILE_SIZE;
-   setup.cliprect_miny = ty * TILE_SIZE;
-   setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
-   setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
-
-   setup_tri(&tri);
-}
-- 
cgit v1.2.3


From 1af8e381af83ba1344f063a52e9fbf932a77e5f5 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 15:26:51 -0700
Subject: Cell: start to SIMD-ize triangle attribute interpolation

Using the spu_add(), etc intrinsics.
About a 15% speed-up with some tests.
---
 src/mesa/pipe/cell/spu/spu_main.h    |   7 ++
 src/mesa/pipe/cell/spu/spu_texture.c |   6 +-
 src/mesa/pipe/cell/spu/spu_texture.h |   2 +-
 src/mesa/pipe/cell/spu/spu_tri.c     | 126 +++++++++++++++++++----------------
 4 files changed, 79 insertions(+), 62 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 8908bf8bc0..73f9ed29d6 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -36,6 +36,13 @@
 #include "pipe/p_state.h"
 
 
+typedef union
+{
+   vector float v;   /**< the four floats as one vector (operand for spu_* intrinsics) */
+   float f[4];       /**< the same four floats, individually addressable */
+} float4;
+
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 6d566a5006..7a1ca097c0 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -128,11 +128,11 @@ get_tex_tile(uint i, uint j)
  * XXX this is extremely primitive for now.
  */
 uint
-sample_texture(const float *texcoord)
+sample_texture(float4 texcoord)
 {
    /* wrap/repeat */
-   uint i = (uint) (texcoord[0] * spu.texture.width) % spu.texture.width;
-   uint j = (uint) (texcoord[1] * spu.texture.height) % spu.texture.height;
+   uint i = (uint) (texcoord.f[0] * spu.texture.width) % spu.texture.width;
+   uint j = (uint) (texcoord.f[1] * spu.texture.height) % spu.texture.height;
    uint pos = get_tex_tile(i, j);
    uint texel = tex_tiles[pos].t32[j % TILE_SIZE][i % TILE_SIZE];
    return texel;
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
index b75b7ac44f..938a42b549 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.h
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -37,7 +37,7 @@ invalidate_tex_cache(void);
 
 
 extern uint
-sample_texture(const float *texcoord);
+sample_texture(float4 texcoord);
 
 
 #endif /* SPU_TEXTURE_H */
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 1c615a6e6a..4fc6d90895 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -81,9 +81,9 @@ struct edge {
 
 struct interp_coef
 {
-   float a0[4];
-   float dadx[4];
-   float dady[4];
+   float4 a0;
+   float4 dadx;
+   float4 dady;
 };
 
 
@@ -201,36 +201,31 @@ clip_emit_quad(struct setup_stage *setup)
  * Eg: four colors will be compute.
  */
 static INLINE void
-eval_coeff(uint slot, float x, float y, float result[4][4])
+eval_coeff(uint slot, float x, float y, float4 result[4])
 {
    switch (spu.vertex_info.interp_mode[slot]) {
    case INTERP_CONSTANT:
-      {
-         uint i;
-         for (i = 0; i < 4; i++) {
-            result[QUAD_TOP_LEFT][i] =
-            result[QUAD_TOP_RIGHT][i] =
-            result[QUAD_BOTTOM_LEFT][i] =
-            result[QUAD_BOTTOM_RIGHT][i] = setup.coef[slot].a0[i];
-         }
-      }
+      result[QUAD_TOP_LEFT] =
+      result[QUAD_TOP_RIGHT] =
+      result[QUAD_BOTTOM_LEFT] =
+      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0;
       break;
 
    case INTERP_LINEAR:
       /* fall-through, for now */
    default:
       {
-         uint i;
-         const float *dadx = setup.coef[slot].dadx;
-         const float *dady = setup.coef[slot].dady;
-
-         /* loop over XYZW comps */
-         for (i = 0; i < 4; i++) {
-            result[QUAD_TOP_LEFT][i] = setup.coef[slot].a0[i] + x * dadx[i] + y * dady[i];
-            result[QUAD_TOP_RIGHT][i] = result[0][i] + dadx[i];
-            result[QUAD_BOTTOM_LEFT][i] = result[0][i] + dady[i];
-            result[QUAD_BOTTOM_RIGHT][i] = result[0][i] + dadx[i] + dady[i];
-         }
+         register vector float dadx = setup.coef[slot].dadx.v;
+         register vector float dady = setup.coef[slot].dady.v;
+         register vector float topLeft
+            = spu_add(setup.coef[slot].a0.v,
+                      spu_add(spu_mul(spu_splats(x), dadx),
+                              spu_mul(spu_splats(y), dady)));
+
+         result[QUAD_TOP_LEFT].v = topLeft;
+         result[QUAD_TOP_RIGHT].v = spu_add(topLeft, dadx);
+         result[QUAD_BOTTOM_LEFT].v = spu_add(topLeft, dady);
+         result[QUAD_BOTTOM_RIGHT].v = spu_add(spu_add(topLeft, dadx), dady);
       }
    }
 }
@@ -240,28 +235,46 @@ static INLINE void
 eval_z(float x, float y, float result[4])
 {
    const uint slot = 0;
-   const uint i = 2;
-   const float *dadx = setup.coef[slot].dadx;
-   const float *dady = setup.coef[slot].dady;
-
-   result[QUAD_TOP_LEFT] = setup.coef[slot].a0[i] + x * dadx[i] + y * dady[i];
-   result[QUAD_TOP_RIGHT] = result[0] + dadx[i];
-   result[QUAD_BOTTOM_LEFT] = result[0] + dady[i];
-   result[QUAD_BOTTOM_RIGHT] = result[0] + dadx[i] + dady[i];
+   const float dzdx = setup.coef[slot].dadx.f[2];
+   const float dzdy = setup.coef[slot].dady.f[2];
+   const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
+#if 1
+   result[QUAD_TOP_LEFT] = topLeft;
+   result[QUAD_TOP_RIGHT] = topLeft + dzdx;
+   result[QUAD_BOTTOM_LEFT] = topLeft + dzdy;
+   result[QUAD_BOTTOM_RIGHT] = topLeft + dzdx + dzdy;
+#else
+   /* XXX vectorize */
+   const vector float topLeftv = spu_splats(topLeft);
+   const vector float derivs
+      = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
+   vector float *res = (vector float *) result;
+   *res = spu_add(topLeftv, derivs);
+#endif
 }
 
 
-static INLINE uint
-pack_color(const float color[4])
+static INLINE void
+pack_colors(uint uicolors[4], const float4 fcolors[4])
 {
+   /* Pack the four float colors of a quad into uints per spu.fb.color_format.
+    * XXX inline _pack_rgba8() and do this swizzling with the shuffle command.
+    */
    switch (spu.fb.color_format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return _pack_rgba8(color[3], color[0], color[1], color[2]);
+      uicolors[0] = _pack_rgba8(fcolors[0].f[3], fcolors[0].f[0], fcolors[0].f[1], fcolors[0].f[2]);
+      uicolors[1] = _pack_rgba8(fcolors[1].f[3], fcolors[1].f[0], fcolors[1].f[1], fcolors[1].f[2]);
+      uicolors[2] = _pack_rgba8(fcolors[2].f[3], fcolors[2].f[0], fcolors[2].f[1], fcolors[2].f[2]);
+      uicolors[3] = _pack_rgba8(fcolors[3].f[3], fcolors[3].f[0], fcolors[3].f[1], fcolors[3].f[2]);
+      break;
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      return _pack_rgba8(color[2], color[1], color[0], color[3]);
+      uicolors[0] = _pack_rgba8(fcolors[0].f[2], fcolors[0].f[1], fcolors[0].f[0], fcolors[0].f[3]);
+      uicolors[1] = _pack_rgba8(fcolors[1].f[2], fcolors[1].f[1], fcolors[1].f[0], fcolors[1].f[3]);
+      uicolors[2] = _pack_rgba8(fcolors[2].f[2], fcolors[2].f[1], fcolors[2].f[0], fcolors[2].f[3]);
+      uicolors[3] = _pack_rgba8(fcolors[3].f[2], fcolors[3].f[1], fcolors[3].f[0], fcolors[3].f[3]);
+      break;
    default:
       ASSERT(0);
-      return 0;
    }
 }
 
@@ -379,7 +392,7 @@ emit_quad( int x, int y, unsigned mask )
    uint colors[4];  /* indexed by QUAD_x */
 
    if (spu.texture.start) {
-      float texcoords[4][4];
+      float4 texcoords[4];
       uint i;
       eval_coeff(2, (float) x, (float) y, texcoords);
       for (i = 0; i < 4; i++) {
@@ -387,12 +400,9 @@ emit_quad( int x, int y, unsigned mask )
       }
    }
    else {
-      float fcolors[4][4];
+      float4 fcolors[4];
       eval_coeff(1, (float) x, (float) y, fcolors);
-      colors[QUAD_TOP_LEFT] = pack_color(fcolors[QUAD_TOP_LEFT]);
-      colors[QUAD_TOP_RIGHT] = pack_color(fcolors[QUAD_TOP_RIGHT]);
-      colors[QUAD_BOTTOM_LEFT] = pack_color(fcolors[QUAD_BOTTOM_LEFT]);
-      colors[QUAD_BOTTOM_RIGHT] = pack_color(fcolors[QUAD_BOTTOM_RIGHT]);
+      pack_colors(colors, fcolors);
    }
 
    if (spu.depth_stencil.depth.enabled) {
@@ -645,12 +655,12 @@ static void const_coeff(uint slot)
    ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 
    for (i = 0; i < 4; i++) {
-      setup.coef[slot].dadx[i] = 0;
-      setup.coef[slot].dady[i] = 0;
+      setup.coef[slot].dadx.f[i] = 0;
+      setup.coef[slot].dady.f[i] = 0;
 
       /* need provoking vertex info!
        */
-      setup.coef[slot].a0[i] = setup.vprovoke->data[slot][i];
+      setup.coef[slot].a0.f[i] = setup.vprovoke->data[slot][i];
    }
 }
 
@@ -670,8 +680,8 @@ static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
    
       ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 
-      setup.coef[slot].dadx[i] = a * setup.oneoverarea;
-      setup.coef[slot].dady[i] = b * setup.oneoverarea;
+      setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
+      setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
 
       /* calculate a0 as the value which would be sampled for the
        * fragment at (0,0), taking into account that we want to sample at
@@ -685,17 +695,17 @@ static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
        * to define a0 as the sample at a pixel center somewhere near vmin
        * instead - i'll switch to this later.
        */
-      setup.coef[slot].a0[i] = (setup.vmin->data[slot][i] - 
-                                 (setup.coef[slot].dadx[i] * (setup.vmin->data[0][0] - 0.5f) + 
-                                  setup.coef[slot].dady[i] * (setup.vmin->data[0][1] - 0.5f)));
+      setup.coef[slot].a0.f[i] = (setup.vmin->data[slot][i] - 
+                                 (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
+                                  setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
    }
 
    /*
    _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
 		slot, "xyzw"[i], 
 		setup.coef[slot].a0[i],
-		setup.coef[slot].dadx[i],
-		setup.coef[slot].dady[i]);
+		setup.coef[slot].dadx.f[i],
+		setup.coef[slot].dady.f[i]);
    */
 }
 
@@ -734,11 +744,11 @@ static void tri_persp_coeff( unsigned slot,
    assert(slot < PIPE_MAX_SHADER_INPUTS);
    assert(i <= 3);
 
-   setup.coef[slot].dadx[i] = a * setup.oneoverarea;
-   setup.coef[slot].dady[i] = b * setup.oneoverarea;
-   setup.coef[slot].a0[i] = (mina - 
-			    (setup.coef[slot].dadx[i] * (setup.vmin->data[0][0] - 0.5f) + 
-			     setup.coef[slot].dady[i] * (setup.vmin->data[0][1] - 0.5f)));
+   setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
+   setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
+   setup.coef[slot].a0.f[i] = (mina - 
+			    (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
+			     setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
 }
 #endif
 
-- 
cgit v1.2.3


From 3e4306c594e0aa42b2dbf31d7437564466fadfcc Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 20:40:26 -0700
Subject: Cell: prototype SIMD code for z testing

---
 src/mesa/pipe/cell/spu/spu_tile.h |  10 +++
 src/mesa/pipe/cell/spu/spu_tri.c  | 147 +++++++++++++++++++++++++++++---------
 2 files changed, 123 insertions(+), 34 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index f83dc009c2..18d1b3c117 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -42,6 +42,7 @@
 typedef union {
    ushort t16[TILE_SIZE][TILE_SIZE];
    uint   t32[TILE_SIZE][TILE_SIZE];
+   float4 f4[TILE_SIZE/2][TILE_SIZE/2];
 } tile_t;
 
 
@@ -83,9 +84,18 @@ clear_z_tile(tile_t *ztile)
                TILE_SIZE * TILE_SIZE);
    }
    else {
+      ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
+#if SIMD_Z
+      union fi z;
+      z.f = 1.0;
+      memset32((uint*) ztile->t32,
+               z.i,/*spu.fb.depth_clear_value,*/
+               TILE_SIZE * TILE_SIZE);
+#else
       memset32((uint*) ztile->t32,
                spu.fb.depth_clear_value,
                TILE_SIZE * TILE_SIZE);
+#endif
    }
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 4fc6d90895..e436e153ec 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -40,6 +40,19 @@
 #include "spu_tri.h"
 
 
+/*
+ * If SIMD_Z=1 the Z buffer is floating point and we use vector instructions
+ * to do Z testing/updating.
+ */
+#define SIMD_Z 0
+
+#if SIMD_Z
+typedef vector unsigned int mask_t;
+#else
+typedef uint mask_t;
+#endif
+
+
 /**
  * Simplified types taken from other parts of Gallium
  */
@@ -231,26 +244,16 @@ eval_coeff(uint slot, float x, float y, float4 result[4])
 }
 
 
-static INLINE void
-eval_z(float x, float y, float result[4])
+static INLINE vector float
+eval_z(float x, float y)
 {
    const uint slot = 0;
    const float dzdx = setup.coef[slot].dadx.f[2];
    const float dzdy = setup.coef[slot].dady.f[2];
    const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
-#if 1
-   result[QUAD_TOP_LEFT] = topLeft;
-   result[QUAD_TOP_RIGHT] = topLeft + dzdx;
-   result[QUAD_BOTTOM_LEFT] = topLeft + dzdy;
-   result[QUAD_BOTTOM_RIGHT] = topLeft + dzdx + dzdy;
-#else
-   /* XXX vectorize */
    const vector float topLeftv = spu_splats(topLeft);
-   const vector float derivs
-      = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
-   vector float *res = (vector float *) result;
-   *res = spu_add(topLeftv, derivs);
-#endif
+   const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
+   return spu_add(topLeftv, derivs);
 }
 
 
@@ -279,14 +282,22 @@ pack_colors(uint uicolors[4], const float4 fcolors[4])
 }
 
 
-static uint
-do_depth_test(int x, int y, unsigned mask)
+
+static unsigned int
+do_depth_test(int x, int y, unsigned int mask)
 {
+   static const float4 zscale16
+      = {.f={65535.0, 65535.0, 65535.0, 65535.0}};
+   static const float4 zscale32
+      = {.f={(float)0xffffffff,
+             (float)0xffffffff,
+             (float)0xffffffff,
+             (float)0xffffffff}};
    int ix = x - setup.cliprect_minx;
    int iy = y - setup.cliprect_miny;
-   float zvals[4];
+   float4 zvals;
 
-   eval_z((float) x, (float) y, zvals);
+   zvals.v = eval_z((float) x, (float) y);
 
    if (tile_status_z[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
       /* now, _really_ clear the tile */
@@ -300,9 +311,9 @@ do_depth_test(int x, int y, unsigned mask)
 
 
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
-      const float zscale = 65535.0;
+      zvals.v = spu_mul(zvals.v, zscale16.v);
       if (mask & MASK_TOP_LEFT) {
-         uint z = (uint) (zvals[0] * zscale);
+         uint z = (uint) zvals.f[0];
          if (z < ztile.t16[iy][ix])
             ztile.t16[iy][ix] = z;
          else
@@ -310,7 +321,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_TOP_RIGHT) {
-         uint z = (uint) (zvals[1] * zscale);
+         uint z = (uint) zvals.f[1];
          if (z < ztile.t16[iy][ix+1])
             ztile.t16[iy][ix+1] = z;
          else
@@ -318,7 +329,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_BOTTOM_LEFT) {
-         uint z = (uint) (zvals[2] * zscale);
+         uint z = (uint) zvals.f[2];
          if (z < ztile.t16[iy+1][ix])
             ztile.t16[iy+1][ix] = z;
          else
@@ -326,7 +337,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_BOTTOM_RIGHT) {
-         uint z = (uint) (zvals[3] * zscale);
+         uint z = (uint) zvals.f[3];
          if (z < ztile.t16[iy+1][ix+1])
             ztile.t16[iy+1][ix+1] = z;
          else
@@ -334,10 +345,10 @@ do_depth_test(int x, int y, unsigned mask)
       }
    }
    else {
-      const float zscale = (float) 0xffffffff;
+      zvals.v = spu_mul(zvals.v, zscale32.v);
       ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
       if (mask & MASK_TOP_LEFT) {
-         uint z = (uint) (zvals[0] * zscale);
+         uint z = (uint) zvals.f[0];
          if (z < ztile.t32[iy][ix])
             ztile.t32[iy][ix] = z;
          else
@@ -345,7 +356,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_TOP_RIGHT) {
-         uint z = (uint) (zvals[1] * zscale);
+         uint z = (uint) zvals.f[1];
          if (z < ztile.t32[iy][ix+1])
             ztile.t32[iy][ix+1] = z;
          else
@@ -353,7 +364,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_BOTTOM_LEFT) {
-         uint z = (uint) (zvals[2] * zscale);
+         uint z = (uint) zvals.f[2];
          if (z < ztile.t32[iy+1][ix])
             ztile.t32[iy+1][ix] = z;
          else
@@ -361,7 +372,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_BOTTOM_RIGHT) {
-         uint z = (uint) (zvals[3] * zscale);
+         uint z = (uint) zvals.f[3];
          if (z < ztile.t32[iy+1][ix+1])
             ztile.t32[iy+1][ix+1] = z;
          else
@@ -373,11 +384,45 @@ do_depth_test(int x, int y, unsigned mask)
 }
 
 
+/** SIMD Z test of one quad: a fragment passes if enabled in quadmask and its
+ *  new Z is below the stored Z; passing Z values are stored.  Returns pass mask. */
+static vector unsigned int
+do_depth_test_simd(int x, int y, vector unsigned int quadmask)
+{
+   int ix = (x - setup.cliprect_minx) / 2;   /* tile-relative x, in quad units */
+   int iy = (y - setup.cliprect_miny) / 2;   /* tile-relative y, in quad units */
+   float4 zvals;
+
+   vector unsigned int zmask;
+
+   zvals.v = eval_z((float) x, (float) y);
+
+   if (tile_status_z[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
+      /* now, _really_ clear the tile */
+      clear_z_tile(&ztile);
+   }
+   else if (tile_status_z[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
+      /* make sure we've got the tile from main mem */
+      wait_on_mask(1 << TAG_READ_TILE_Z);
+   }
+   tile_status_z[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
+   /* NOTE(review): f4 is indexed [ix][iy]; the t16/t32 paths use [iy][ix] -- confirm */
+   /* XXX fetch Z value sooner to hide latency here */
+   zmask = spu_cmpgt(ztile.f4[ix][iy].v, zvals.v);
+   zmask = spu_and(zmask, quadmask);
+
+   ztile.f4[ix][iy].v = spu_sel(ztile.f4[ix][iy].v, zvals.v, zmask);
+   //ztile.f4[ix][iy].v = spu_sel(zvals.v, ztile.f4[ix][iy].v, mask4);
+
+   return zmask;
+}
+
+
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  */
 static INLINE void
-emit_quad( int x, int y, unsigned mask )
+emit_quad( int x, int y, mask_t mask )
 {
 #if 0
    struct softpipe_context *sp = setup.softpipe;
@@ -406,10 +451,17 @@ emit_quad( int x, int y, unsigned mask )
    }
 
    if (spu.depth_stencil.depth.enabled) {
-      mask &= do_depth_test(x, y, mask);
+#if SIMD_Z
+      mask = do_depth_test_simd(x, y, mask);
+#else
+      mask = do_depth_test(x, y, mask);
+#endif
    }
 
-   if (mask) {
+#if !SIMD_Z
+   if (mask)
+#endif
+   {
       if (tile_status[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
          clear_c_tile(&ctile);
@@ -420,6 +472,21 @@ emit_quad( int x, int y, unsigned mask )
       }
       tile_status[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
 
+#if SIMD_Z
+      if (spu_extract(mask, 0))
+         ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
+      if (spu_extract(mask, 1))
+         ctile.t32[iy][ix+1] = colors[QUAD_TOP_RIGHT];
+      if (spu_extract(mask, 2))
+         ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
+      if (spu_extract(mask, 3))
+         ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
+#elif 0
+      /* SIMD_Z with swizzled color buffer (someday) */
+      vector float icolors = *((vector float *) &colors);
+      ctile.f4[iy/2][ix/2].v = spu_sel(ctile.f4[iy/2][ix/2].v, icolors, mask);
+
+#else
       if (mask & MASK_TOP_LEFT)
          ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
       if (mask & MASK_TOP_RIGHT)
@@ -428,7 +495,9 @@ emit_quad( int x, int y, unsigned mask )
          ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (mask & MASK_BOTTOM_RIGHT)
          ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
+#endif
    }
+
 #endif
 }
 
@@ -450,8 +519,18 @@ static INLINE int block( int x )
  * this is pretty nasty...  may need to rework flush_spans again to
  * fix it, if possible.
  */
-static unsigned calculate_mask( int x )
+static mask_t calculate_mask( int x )
 {
+#if SIMD_Z
+   uint m0, m1, m2, m3;
+
+   m0 = (x >= setup.span.left[0] && x < setup.span.right[0]) * ~0;
+   m1 = (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) * ~0;
+   m2 = (x >= setup.span.left[1] && x < setup.span.right[1]) * ~0;
+   m3 = (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) * ~0;
+
+   return (vector unsigned int) {m0, m1, m2, m3};
+#else
    unsigned mask = 0x0;
 
    if (x >= setup.span.left[0] && x < setup.span.right[0]) 
@@ -467,6 +546,7 @@ static unsigned calculate_mask( int x )
       mask |= MASK_BOTTOM_RIGHT;
 
    return mask;
+#endif
 }
 
 
@@ -505,8 +585,7 @@ static void flush_spans( void )
     * calculate_mask() could be simplified a bit...
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
-      emit_quad( x, setup.span.y, 
-                 calculate_mask( x ) );
+      emit_quad( x, setup.span.y, calculate_mask( x ) );
    }
 
    setup.span.y = 0;
-- 
cgit v1.2.3


From 98eecdb4868c181476cbe2423adaa327eee4a02e Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 29 Jan 2008 10:37:18 -0800
Subject: Initial pass at vertex shader on SPU using TGSI VM

All of the code is wired in on the SPU side, but it is not called from
the PPU yet.  Instruction / declaration fetch still needs to be
implemented in spu_exec.c.
---
 src/mesa/pipe/cell/common.h                |   38 +
 src/mesa/pipe/cell/spu/Makefile            |    6 +-
 src/mesa/pipe/cell/spu/spu_exec.c          | 2355 ++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_exec.h          |  171 ++
 src/mesa/pipe/cell/spu/spu_main.c          |   28 +
 src/mesa/pipe/cell/spu/spu_util.c          |  165 ++
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c  |  493 ++++++
 src/mesa/pipe/cell/spu/spu_vertex_shader.c |  224 +++
 src/mesa/pipe/cell/spu/spu_vertex_shader.h |   61 +
 9 files changed, 3540 insertions(+), 1 deletion(-)
 create mode 100644 src/mesa/pipe/cell/spu/spu_exec.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_exec.h
 create mode 100644 src/mesa/pipe/cell/spu/spu_util.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_vertex_fetch.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_vertex_shader.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_vertex_shader.h

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index d5e86863d4..80a1425ec7 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -83,6 +83,9 @@
 #define CELL_CMD_STATE_SAMPLER       12
 #define CELL_CMD_STATE_TEXTURE       13
 #define CELL_CMD_STATE_VERTEX_INFO   14
+#define CELL_CMD_STATE_VIEWPORT      15
+#define CELL_CMD_STATE_VS_ARRAY_INFO 16
+#define CELL_CMD_VS_EXECUTE          17
 
 
 #define CELL_NUM_BUFFERS 4
@@ -116,6 +119,41 @@ struct cell_command_clear_surface
 } ALIGN16_ATTRIB;
 
 
+/**
+ * Array info used by the vertex shader's vertex puller.
+ */
+struct cell_array_info
+{
+    void *base;               /**< Base address of the 0th element. */
+    uint attr;                /**< Attribute that this state is for. */
+    uint pitch;               /**< Byte pitch from one entry to the next. */
+    enum pipe_format format;  /**< Pipe format of each entry. */
+} ALIGN16_ATTRIB;
+
+
+struct cell_shader_info
+{
+   unsigned processor;
+   unsigned num_outputs;
+
+   void *declarations;
+   unsigned num_declarations;
+   void *instructions;
+   unsigned num_instructions;
+   void *uniforms;
+} ALIGN16_ATTRIB;
+
+
+struct cell_command_vs
+{
+   struct cell_shader_info   shader;
+   void *elts;
+   unsigned num_elts;
+   unsigned bytes_per_elt;
+   void *vOut;
+} ALIGN16_ATTRIB;
+
+
 struct cell_command_render
 {
    uint opcode;       /**< CELL_CMD_RENDER */
diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index d5b30e1f27..2d031bfbc6 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -20,7 +20,11 @@ SOURCES = \
 	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
-	spu_tri.c
+	spu_tri.c \
+	spu_exec.c \
+	spu_util.c \
+	spu_vertex_fetch.c \
+	spu_vertex_shader.c
 
 SPU_OBJECTS = $(SOURCES:.c=.o) \
 
diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
new file mode 100644
index 0000000000..6888e97caf
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -0,0 +1,2355 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI interpreter/executor.
+ *
+ * Flow control information:
+ *
+ * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
+ * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
+ * care since a condition may be true for some quad components but false
+ * for other components.
+ *
+ * We basically execute all statements (even if they're in the part of
+ * an IF/ELSE clause that's "not taken") and use a special mask to
+ * control writing to destination registers.  This is the ExecMask.
+ * See store_dest().
+ *
+ * The ExecMask is the AND of four other masks (CondMask, LoopMask, ContMask
+ * and FuncMask); see UPDATE_EXEC_MASK.  The first three are controlled by
+ * the flow control instructions (namely IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
+ *
+ *
+ * Authors:
+ *   Michal Krol
+ *   Brian Paul
+ */
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/tgsi/util/tgsi_parse.h"
+#include "pipe/tgsi/util/tgsi_util.h"
+#include "spu_exec.h"
+
+#define TILE_TOP_LEFT     0
+#define TILE_TOP_RIGHT    1
+#define TILE_BOTTOM_LEFT  2
+#define TILE_BOTTOM_RIGHT 3
+
+/*
+ * Shorthand locations of various utility registers (_I = Index, _C = Channel)
+ */
+#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
+#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
+#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
+#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
+#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
+#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
+#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
+#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
+#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
+#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
+#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
+#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
+#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
+#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
+#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
+#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
+#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
+#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
+#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
+#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
+#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
+#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
+#define TEMP_R0            TGSI_EXEC_TEMP_R0
+
+#define FOR_EACH_CHANNEL(CHAN)\
+   for (CHAN = 0; CHAN < 4; CHAN++)
+
+#define IS_CHANNEL_ENABLED(INST, CHAN)\
+   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define IS_CHANNEL_ENABLED2(INST, CHAN)\
+   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
+
+
+/** The execution mask is the AND of the cond, loop, cont and func masks */
+#define UPDATE_EXEC_MASK(MACH) \
+      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
+
+
+#define CHAN_X  0
+#define CHAN_Y  1
+#define CHAN_Z  2
+#define CHAN_W  3
+
+
+
+/**
+ * Record samplers/processor, point Addrs at Temps[TGSI_EXEC_NUM_TEMPS] and
+ * fill the constant utility temps (numSamplers is currently unused).
+ * After this, we can call spu_exec_machine_run() many times.
+ */
+void
+spu_exec_machine_init(struct spu_exec_machine *mach,
+                      uint numSamplers,
+                      struct spu_sampler *samplers,
+                      unsigned processor)
+{
+   uint i;
+
+   mach->Samplers = samplers;
+   mach->Processor = processor;
+   mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS];
+
+   /* Setup constants: same value in all four lanes of each utility temp. */
+   for( i = 0; i < 4; i++ ) {
+      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
+      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
+      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
+      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
+      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
+      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
+      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
+      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
+   }
+}
+
+
+static void
+micro_abs(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->f[0] = (float) fabs( (double) src->f[0] );
+   dst->f[1] = (float) fabs( (double) src->f[1] );
+   dst->f[2] = (float) fabs( (double) src->f[2] );
+   dst->f[3] = (float) fabs( (double) src->f[3] );
+}
+
+static void
+micro_add(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] + src1->f[0];
+   dst->f[1] = src0->f[1] + src1->f[1];
+   dst->f[2] = src0->f[2] + src1->f[2];
+   dst->f[3] = src0->f[3] + src1->f[3];
+}
+
+static void
+micro_iadd(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] + src1->i[0];
+   dst->i[1] = src0->i[1] + src1->i[1];
+   dst->i[2] = src0->i[2] + src1->i[2];
+   dst->i[3] = src0->i[3] + src1->i[3];
+}
+
+static void
+micro_and(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] & src1->u[0];
+   dst->u[1] = src0->u[1] & src1->u[1];
+   dst->u[2] = src0->u[2] & src1->u[2];
+   dst->u[3] = src0->u[3] & src1->u[3];
+}
+
+static void
+micro_ceil(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) ceil( (double) src->f[0] );
+   dst->f[1] = (float) ceil( (double) src->f[1] );
+   dst->f[2] = (float) ceil( (double) src->f[2] );
+   dst->f[3] = (float) ceil( (double) src->f[3] );
+#endif
+}
+
+static void
+micro_cos(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) cos( (double) src->f[0] );
+   dst->f[1] = (float) cos( (double) src->f[1] );
+   dst->f[2] = (float) cos( (double) src->f[2] );
+   dst->f[3] = (float) cos( (double) src->f[3] );
+#endif
+}
+
+static void
+micro_ddx(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->f[0] =
+   dst->f[1] =
+   dst->f[2] =
+   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
+}
+
+static void
+micro_ddy(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->f[0] =
+   dst->f[1] =
+   dst->f[2] =
+   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
+}
+
+static void
+micro_div(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] / src1->f[0];
+   dst->f[1] = src0->f[1] / src1->f[1];
+   dst->f[2] = src0->f[2] / src1->f[2];
+   dst->f[3] = src0->f[3] / src1->f[3];
+}
+
+static void
+micro_udiv(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] / src1->u[0];
+   dst->u[1] = src0->u[1] / src1->u[1];
+   dst->u[2] = src0->u[2] / src1->u[2];
+   dst->u[3] = src0->u[3] / src1->u[3];
+}
+
+static void
+micro_eq(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2,
+   const union spu_exec_channel *src3 )
+{
+   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
+   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
+   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
+   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
+}
+
+static void
+micro_ieq(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2,
+   const union spu_exec_channel *src3 )
+{
+   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
+   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
+   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
+   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
+}
+
+static void
+micro_exp2(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src)
+{
+#if 0
+   dst->f[0] = (float) pow( 2.0, (double) src->f[0] );
+   dst->f[1] = (float) pow( 2.0, (double) src->f[1] );
+   dst->f[2] = (float) pow( 2.0, (double) src->f[2] );
+   dst->f[3] = (float) pow( 2.0, (double) src->f[3] );
+#endif
+}
+
+static void
+micro_f2it(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->i[0] = (int) src->f[0];
+   dst->i[1] = (int) src->f[1];
+   dst->i[2] = (int) src->f[2];
+   dst->i[3] = (int) src->f[3];
+}
+
+static void
+micro_f2ut(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->u[0] = (uint) src->f[0];
+   dst->u[1] = (uint) src->f[1];
+   dst->u[2] = (uint) src->f[2];
+   dst->u[3] = (uint) src->f[3];
+}
+
+static void
+micro_flr(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) floor( (double) src->f[0] );
+   dst->f[1] = (float) floor( (double) src->f[1] );
+   dst->f[2] = (float) floor( (double) src->f[2] );
+   dst->f[3] = (float) floor( (double) src->f[3] );
+#endif
+}
+
+static void
+micro_frc(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] );
+   dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] );
+   dst->f[2] = src->f[2] - (float) floor( (double) src->f[2] );
+   dst->f[3] = src->f[3] - (float) floor( (double) src->f[3] );
+#endif
+}
+
+static void
+micro_ge(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2,
+   const union spu_exec_channel *src3 )
+{
+   dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0];
+   dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1];
+   dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2];
+   dst->f[3] = src0->f[3] >= src1->f[3] ? src2->f[3] : src3->f[3];
+}
+
+static void
+micro_i2f(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->f[0] = (float) src->i[0];
+   dst->f[1] = (float) src->i[1];
+   dst->f[2] = (float) src->i[2];
+   dst->f[3] = (float) src->i[3];
+}
+
+static void
+micro_lg2(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f;
+   dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f;
+   dst->f[2] = (float) log( (double) src->f[2] ) * 1.442695f;
+   dst->f[3] = (float) log( (double) src->f[3] ) * 1.442695f;
+#endif
+}
+
+static void
+micro_lt(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2,
+   const union spu_exec_channel *src3 )
+{
+   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
+   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
+   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
+   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
+}
+
+static void
+micro_ilt(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2,
+   const union spu_exec_channel *src3 )
+{
+   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
+   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
+   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
+   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
+}
+
+static void
+micro_ult(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2,
+   const union spu_exec_channel *src3 )
+{
+   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
+   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
+   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
+   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
+}
+
+static void
+micro_max(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
+   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
+   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
+   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
+}
+
+static void
+micro_imax(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
+   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
+   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
+   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
+}
+
+static void
+micro_umax(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
+   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
+   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
+   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
+}
+
+static void
+micro_min(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
+   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
+   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
+   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
+}
+
+static void
+micro_imin(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
+   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
+   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
+   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
+}
+
+static void
+micro_umin(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
+   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
+   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
+   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
+}
+
+static void
+micro_umod(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] % src1->u[0];
+   dst->u[1] = src0->u[1] % src1->u[1];
+   dst->u[2] = src0->u[2] % src1->u[2];
+   dst->u[3] = src0->u[3] % src1->u[3];
+}
+
+static void
+micro_mul(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] * src1->f[0];
+   dst->f[1] = src0->f[1] * src1->f[1];
+   dst->f[2] = src0->f[2] * src1->f[2];
+   dst->f[3] = src0->f[3] * src1->f[3];
+}
+
+static void
+micro_imul(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] * src1->i[0];
+   dst->i[1] = src0->i[1] * src1->i[1];
+   dst->i[2] = src0->i[2] * src1->i[2];
+   dst->i[3] = src0->i[3] * src1->i[3];
+}
+
+static void
+micro_imul64(
+   union spu_exec_channel *dst0,
+   union spu_exec_channel *dst1,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst1->i[0] = src0->i[0] * src1->i[0];
+   dst1->i[1] = src0->i[1] * src1->i[1];
+   dst1->i[2] = src0->i[2] * src1->i[2];
+   dst1->i[3] = src0->i[3] * src1->i[3];
+   dst0->i[0] = 0;
+   dst0->i[1] = 0;
+   dst0->i[2] = 0;
+   dst0->i[3] = 0;
+}
+
+static void
+micro_umul64(
+   union spu_exec_channel *dst0,
+   union spu_exec_channel *dst1,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst1->u[0] = src0->u[0] * src1->u[0];
+   dst1->u[1] = src0->u[1] * src1->u[1];
+   dst1->u[2] = src0->u[2] * src1->u[2];
+   dst1->u[3] = src0->u[3] * src1->u[3];
+   dst0->u[0] = 0;
+   dst0->u[1] = 0;
+   dst0->u[2] = 0;
+   dst0->u[3] = 0;
+}
+
+static void
+micro_movc(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2 )
+{
+   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
+   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
+   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
+   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
+}
+
+static void
+micro_neg(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->f[0] = -src->f[0];
+   dst->f[1] = -src->f[1];
+   dst->f[2] = -src->f[2];
+   dst->f[3] = -src->f[3];
+}
+
+static void
+micro_ineg(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->i[0] = -src->i[0];
+   dst->i[1] = -src->i[1];
+   dst->i[2] = -src->i[2];
+   dst->i[3] = -src->i[3];
+}
+
+static void
+micro_not(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->u[0] = ~src->u[0];
+   dst->u[1] = ~src->u[1];
+   dst->u[2] = ~src->u[2];
+   dst->u[3] = ~src->u[3];
+}
+
+static void
+micro_or(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] | src1->u[0];
+   dst->u[1] = src0->u[1] | src1->u[1];
+   dst->u[2] = src0->u[2] | src1->u[2];
+   dst->u[3] = src0->u[3] | src1->u[3];
+}
+
+static void
+micro_pow(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+#if 0
+   dst->f[0] = (float) pow( (double) src0->f[0], (double) src1->f[0] );
+   dst->f[1] = (float) pow( (double) src0->f[1], (double) src1->f[1] );
+   dst->f[2] = (float) pow( (double) src0->f[2], (double) src1->f[2] );
+   dst->f[3] = (float) pow( (double) src0->f[3], (double) src1->f[3] );
+#endif
+}
+
+static void
+micro_rnd(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) );
+   dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) );
+   dst->f[2] = (float) floor( (double) (src->f[2] + 0.5f) );
+   dst->f[3] = (float) floor( (double) (src->f[3] + 0.5f) );
+#endif
+}
+
+static void
+micro_shl(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] << src1->i[0];
+   dst->i[1] = src0->i[1] << src1->i[1];
+   dst->i[2] = src0->i[2] << src1->i[2];
+   dst->i[3] = src0->i[3] << src1->i[3];
+}
+
+static void
+micro_ishr(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] >> src1->i[0];
+   dst->i[1] = src0->i[1] >> src1->i[1];
+   dst->i[2] = src0->i[2] >> src1->i[2];
+   dst->i[3] = src0->i[3] >> src1->i[3];
+}
+
+static void
+micro_trunc(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0 )
+{
+   dst->f[0] = (float) (int) src0->f[0];
+   dst->f[1] = (float) (int) src0->f[1];
+   dst->f[2] = (float) (int) src0->f[2];
+   dst->f[3] = (float) (int) src0->f[3];
+}
+
+static void
+micro_ushr(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] >> src1->u[0];
+   dst->u[1] = src0->u[1] >> src1->u[1];
+   dst->u[2] = src0->u[2] >> src1->u[2];
+   dst->u[3] = src0->u[3] >> src1->u[3];
+}
+
+static void
+micro_sin(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) sin( (double) src->f[0] );
+   dst->f[1] = (float) sin( (double) src->f[1] );
+   dst->f[2] = (float) sin( (double) src->f[2] );
+   dst->f[3] = (float) sin( (double) src->f[3] );
+#endif
+}
+
+static void
+micro_sqrt( union spu_exec_channel *dst,
+            const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) sqrt( (double) src->f[0] );
+   dst->f[1] = (float) sqrt( (double) src->f[1] );
+   dst->f[2] = (float) sqrt( (double) src->f[2] );
+   dst->f[3] = (float) sqrt( (double) src->f[3] );
+#endif
+}
+
+static void
+micro_sub(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] - src1->f[0];
+   dst->f[1] = src0->f[1] - src1->f[1];
+   dst->f[2] = src0->f[2] - src1->f[2];
+   dst->f[3] = src0->f[3] - src1->f[3];
+}
+
+static void
+micro_u2f(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->f[0] = (float) src->u[0];
+   dst->f[1] = (float) src->u[1];
+   dst->f[2] = (float) src->u[2];
+   dst->f[3] = (float) src->u[3];
+}
+
+static void
+micro_xor(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] ^ src1->u[0];
+   dst->u[1] = src0->u[1] ^ src1->u[1];
+   dst->u[2] = src0->u[2] ^ src1->u[2];
+   dst->u[3] = src0->u[3] ^ src1->u[3];
+}
+
+static void
+fetch_src_file_channel(
+   const struct spu_exec_machine *mach,
+   const uint file,
+   const uint swizzle,
+   const union spu_exec_channel *index,
+   union spu_exec_channel *chan )
+{
+   switch( swizzle ) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch( file ) {
+      case TGSI_FILE_CONSTANT:
+         chan->f[0] = mach->Consts[index->i[0]][swizzle];
+         chan->f[1] = mach->Consts[index->i[1]][swizzle];
+         chan->f[2] = mach->Consts[index->i[2]][swizzle];
+         chan->f[3] = mach->Consts[index->i[3]][swizzle];
+         break;
+
+      case TGSI_FILE_INPUT:
+         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_TEMPORARY:
+         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_IMMEDIATE:
+         assert( index->i[0] < (int) mach->ImmLimit );
+         assert( index->i[1] < (int) mach->ImmLimit );
+         assert( index->i[2] < (int) mach->ImmLimit );
+         assert( index->i[3] < (int) mach->ImmLimit );
+
+         chan->f[0] = mach->Imms[index->i[0]][swizzle];
+         chan->f[1] = mach->Imms[index->i[1]][swizzle];
+         chan->f[2] = mach->Imms[index->i[2]][swizzle];
+         chan->f[3] = mach->Imms[index->i[3]][swizzle];
+         break;
+
+      case TGSI_FILE_ADDRESS:
+         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_OUTPUT:
+         /* vertex/fragment output vars can be read too */
+         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      default:
+         assert( 0 );
+      }
+      break;
+
+   case TGSI_EXTSWIZZLE_ZERO:
+      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
+      break;
+
+   case TGSI_EXTSWIZZLE_ONE:
+      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+static void
+fetch_source(
+   const struct spu_exec_machine *mach,
+   union spu_exec_channel *chan,
+   const struct tgsi_full_src_register *reg,
+   const uint chan_index )
+{
+   union spu_exec_channel index;
+   uint swizzle;
+
+   index.i[0] =
+   index.i[1] =
+   index.i[2] =
+   index.i[3] = reg->SrcRegister.Index;
+
+   if (reg->SrcRegister.Indirect) {
+      union spu_exec_channel index2;
+      union spu_exec_channel indir_index;
+
+      index2.i[0] =
+      index2.i[1] =
+      index2.i[2] =
+      index2.i[3] = reg->SrcRegisterInd.Index;
+
+      swizzle = tgsi_util_get_src_register_swizzle(&reg->SrcRegisterInd,
+                                                   CHAN_X);
+      fetch_src_file_channel(
+         mach,
+         reg->SrcRegisterInd.File,
+         swizzle,
+         &index2,
+         &indir_index );
+
+      index.i[0] += indir_index.i[0];
+      index.i[1] += indir_index.i[1];
+      index.i[2] += indir_index.i[2];
+      index.i[3] += indir_index.i[3];
+   }
+
+   if( reg->SrcRegister.Dimension ) {
+      switch( reg->SrcRegister.File ) {
+      case TGSI_FILE_INPUT:
+         index.i[0] *= 17;
+         index.i[1] *= 17;
+         index.i[2] *= 17;
+         index.i[3] *= 17;
+         break;
+      case TGSI_FILE_CONSTANT:
+         index.i[0] *= 4096;
+         index.i[1] *= 4096;
+         index.i[2] *= 4096;
+         index.i[3] *= 4096;
+         break;
+      default:
+         assert( 0 );
+      }
+
+      index.i[0] += reg->SrcRegisterDim.Index;
+      index.i[1] += reg->SrcRegisterDim.Index;
+      index.i[2] += reg->SrcRegisterDim.Index;
+      index.i[3] += reg->SrcRegisterDim.Index;
+
+      if (reg->SrcRegisterDim.Indirect) {
+         union spu_exec_channel index2;
+         union spu_exec_channel indir_index;
+
+         index2.i[0] =
+         index2.i[1] =
+         index2.i[2] =
+         index2.i[3] = reg->SrcRegisterDimInd.Index;
+
+         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
+         fetch_src_file_channel(
+            mach,
+            reg->SrcRegisterDimInd.File,
+            swizzle,
+            &index2,
+            &indir_index );
+
+         index.i[0] += indir_index.i[0];
+         index.i[1] += indir_index.i[1];
+         index.i[2] += indir_index.i[2];
+         index.i[3] += indir_index.i[3];
+      }
+   }
+
+   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+   fetch_src_file_channel(
+      mach,
+      reg->SrcRegister.File,
+      swizzle,
+      &index,
+      chan );
+
+   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
+   case TGSI_UTIL_SIGN_CLEAR:
+      micro_abs( chan, chan );
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      micro_abs( chan, chan );
+      micro_neg( chan, chan );
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      micro_neg( chan, chan );
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+   }
+
+   if (reg->SrcRegisterExtMod.Complement) {
+      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
+   }
+}
+
+/**
+ * Store one channel (four lanes) into a destination register, writing only
+ * lanes enabled in mach->ExecMask.  TGSI_FILE_NULL destinations are ignored.
+ */
+static void
+store_dest(
+   struct spu_exec_machine *mach,
+   const union spu_exec_channel *chan,
+   const struct tgsi_full_dst_register *reg,
+   const struct tgsi_full_instruction *inst,
+   uint chan_index )
+{
+   union spu_exec_channel *dst;
+   union spu_exec_channel sat;   /* clamped copy of *chan when saturating */
+   uint i;
+
+   switch( reg->DstRegister.File ) {
+   case TGSI_FILE_NULL:
+      return;
+
+   case TGSI_FILE_OUTPUT:
+      dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
+                           + reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   default:
+      assert( 0 );
+      return;
+   }
+
+   switch (inst->Instruction.Saturate) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* Clamp into a local so lanes masked off in dst stay untouched. */
+      micro_max(&sat, chan, &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
+      micro_min(&sat, &sat, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
+      chan = &sat;
+      break;
+
+   default:
+      /* TGSI_SAT_MINUS_PLUS_ONE and unknown modes are not implemented. */
+      assert( 0 );
+      return;
+   }
+
+   for (i = 0; i < 4; i++)
+      if (mach->ExecMask & (1 << i))
+         dst->i[i] = chan->i[i];
+}
+
+#define FETCH(VAL,INDEX,CHAN)\
+    fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)
+
+#define STORE(VAL,INDEX,CHAN)\
+    store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
+
+
+/**
+ * Execute ARB-style KIL which is predicated by a src register: pixel i's bit
+ * is set in the KILMASK temp if any of its four source values is negative.
+ */
+static void
+exec_kilp(struct spu_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   uint uniquemask;
+   uint chan_index;
+   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
+   union spu_exec_channel r[1];
+
+   /* This mask stores component bits that were already tested. Note that
+    * we test if the value is less than zero, so the constant swizzles
+    * ZERO (0.0) and ONE (1.0) never need to be tested. */
+   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+
+   for (chan_index = 0; chan_index < 4; chan_index++)
+   {
+      uint swizzle;
+      uint i;
+
+      /* unswizzle channel */
+      swizzle = tgsi_util_get_full_src_register_extswizzle (
+                        &inst->FullSrcRegisters[0],
+                        chan_index);
+
+      /* skip source components that have already been tested */
+      if (uniquemask & (1 << swizzle))
+         continue;
+      uniquemask |= 1 << swizzle;
+
+      FETCH(&r[0], 0, chan_index);
+      for (i = 0; i < 4; i++)
+         if (r[0].f[i] < 0.0f)
+            kilmask |= 1 << i;
+   }
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
+}
+
+
+/*
+ * Sample a texture at STR coordinates; results go to the r/g/b/a channels.
+ */
+static void
+fetch_texel( struct spu_sampler *sampler,
+             const union spu_exec_channel *s,
+             const union spu_exec_channel *t,
+             const union spu_exec_channel *p,
+             float lodbias,  /* XXX should be float[4] */
+             union spu_exec_channel *r,
+             union spu_exec_channel *g,
+             union spu_exec_channel *b,
+             union spu_exec_channel *a )
+{
+   float texels[NUM_CHANNELS][QUAD_SIZE];  /* [channel][pixel] */
+   uint pixel = 0;
+
+   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, texels);
+   while (pixel < 4) {
+      r->f[pixel] = texels[0][pixel];
+      g->f[pixel] = texels[1][pixel];
+      b->f[pixel] = texels[2][pixel];
+      a->f[pixel] = texels[3][pixel];
+      pixel++;
+   }
+}
+
+
+/** Texture sample: src[0] = texcoord, src[1] = sampler unit; result -> dst[0]. */
+static void
+exec_tex(struct spu_exec_machine *mach,
+         const struct tgsi_full_instruction *inst, boolean biasLod)
+{
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   union spu_exec_channel r[8];
+   uint chan_index;
+   float lodBias;
+
+   /* when biasLod is set, src[0].w of pixel 0 is used as the LOD bias for the quad */
+
+   switch (inst->InstructionExtTexture.Texture) {
+   case TGSI_TEXTURE_1D:
+
+      FETCH(&r[0], 0, CHAN_X);
+
+      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+      case TGSI_EXTSWIZZLE_W:
+         FETCH(&r[1], 0, CHAN_W);
+         micro_div( &r[0], &r[0], &r[1] );   /* projective divide: s / w */
+         break;
+
+      case TGSI_EXTSWIZZLE_ONE:
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if (biasLod) {
+         FETCH(&r[1], 0, CHAN_W);
+         lodBias = r[1].f[0];   /* must read the register that was just fetched */
+      }
+      else
+         lodBias = 0.0f;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
+                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
+      break;
+
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+      case TGSI_EXTSWIZZLE_W:
+         FETCH(&r[3], 0, CHAN_W);
+         micro_div( &r[0], &r[0], &r[3] );
+         micro_div( &r[1], &r[1], &r[3] );
+         micro_div( &r[2], &r[2], &r[3] );
+         break;
+
+      case TGSI_EXTSWIZZLE_ONE:
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+      else
+         lodBias = 0.0f;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
+                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
+      break;
+
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+      case TGSI_EXTSWIZZLE_W:
+         FETCH(&r[3], 0, CHAN_W);
+         micro_div( &r[0], &r[0], &r[3] );
+         micro_div( &r[1], &r[1], &r[3] );
+         micro_div( &r[2], &r[2], &r[3] );
+         break;
+
+      case TGSI_EXTSWIZZLE_ONE:
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+      else
+         lodBias = 0.0f;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
+                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
+      break;
+
+   default:
+      assert (0);
+   }
+
+   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE( &r[chan_index], 0, chan_index );
+   }
+}
+
+
+
+/* Flat shading: every pixel of the quad receives the attribute's a0 value. */
+static void
+constant_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float flat = mach->InterpCoefs[attrib].a0[chan];
+   unsigned pixel;
+   for( pixel = 0; pixel < QUAD_SIZE; pixel++ )
+      mach->Inputs[attrib].xyzw[chan].f[pixel] = flat;
+}
+
+/* Linear (screen-space) interpolation of one attribute channel over the quad:
+ * evaluate the plane equation at pixel 0 and step by dadx/dady for the rest. */
+static void
+linear_interpolation( struct spu_exec_machine *mach,
+                      unsigned attrib, unsigned chan )
+{
+   float *out = mach->Inputs[attrib].xyzw[chan].f;
+   const float step_x = mach->InterpCoefs[attrib].dadx[chan];
+   const float step_y = mach->InterpCoefs[attrib].dady[chan];
+   const float origin = mach->InterpCoefs[attrib].a0[chan]
+      + step_x * mach->QuadPos.xyzw[0].f[0] + step_y * mach->QuadPos.xyzw[1].f[0];
+   out[0] = origin;
+   out[1] = origin + step_x;
+   out[2] = origin + step_y;
+   out[3] = origin + step_x + step_y;
+}
+
+/* Perspective interpolation of one attribute channel over the quad: evaluate
+ * the plane equation per pixel, then divide by that pixel's QuadPos W. */
+static void
+perspective_interpolation( struct spu_exec_machine *mach,
+                           unsigned attrib, unsigned chan )
+{
+   float *out = mach->Inputs[attrib].xyzw[chan].f;
+   const float *quad_w = mach->QuadPos.xyzw[3].f;
+   const float step_x = mach->InterpCoefs[attrib].dadx[chan];
+   const float step_y = mach->InterpCoefs[attrib].dady[chan];
+   const float origin = mach->InterpCoefs[attrib].a0[chan]
+      + step_x * mach->QuadPos.xyzw[0].f[0] + step_y * mach->QuadPos.xyzw[1].f[0];
+
+   out[0] = origin / quad_w[0];
+   out[1] = (origin + step_x) / quad_w[1];
+   out[2] = (origin + step_y) / quad_w[2];
+   out[3] = (origin + step_x + step_y) / quad_w[3];
+}
+
+
+typedef void (* interpolation_func)(   /* common signature of the *_interpolation helpers */
+   struct spu_exec_machine *mach,
+   unsigned attrib,      /* index into mach->Inputs / mach->InterpCoefs */
+   unsigned chan );      /* channel index within the attribute */
+
+/**
+ * Execute a declaration.  Only fragment-shader INPUT declarations do work:
+ * each register in [First, Last] gets its UsageMask channels interpolated.
+ */
+static void
+exec_declaration(struct spu_exec_machine *mach,
+                 const struct tgsi_full_declaration *decl)
+{
+   unsigned first, last, mask;
+   unsigned i, j;
+   interpolation_func interp = NULL;
+
+   if( mach->Processor != TGSI_PROCESSOR_FRAGMENT ||
+       decl->Declaration.File != TGSI_FILE_INPUT ) {
+      return;
+   }
+
+   assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );
+
+   first = decl->u.DeclarationRange.First;
+   last = decl->u.DeclarationRange.Last;
+   mask = decl->Declaration.UsageMask;
+
+   switch( decl->Interpolation.Interpolate ) {
+   case TGSI_INTERPOLATE_CONSTANT:
+      interp = constant_interpolation;
+      break;
+
+   case TGSI_INTERPOLATE_LINEAR:
+      interp = linear_interpolation;
+      break;
+
+   case TGSI_INTERPOLATE_PERSPECTIVE:
+      interp = perspective_interpolation;
+      break;
+
+   default:
+      assert( 0 );
+      break;
+   }
+
+   /* With asserts compiled out an unknown mode must not call through an
+    * uninitialized pointer; leave the inputs untouched instead. */
+   if( interp == NULL ) {
+      return;
+   }
+
+   for( j = 0; j < NUM_CHANNELS; j++ ) {
+      if( mask & (1 << j) ) {
+         for( i = first; i <= last; i++ ) {
+            interp( mach, i, j );
+         }
+      }
+   }
+}
+
+/* Execute one instruction.  *pc is advanced first; flow-control opcodes may
+ * redirect it, and END (or RET from main) sets it to -1 to halt. */
+static void
+exec_instruction(struct spu_exec_machine *mach,
+                 const struct tgsi_full_instruction *inst, int *pc)
+{
+   uint chan_index;
+   union spu_exec_channel r[8];
+
+   (*pc)++;
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_f2it( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOV:
+   /* TGSI_OPCODE_SWZ */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LIT:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         FETCH( &r[0], 0, CHAN_X );
+         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+            micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+            STORE( &r[0], 0, CHAN_Y );
+         }
+
+         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+            FETCH( &r[1], 0, CHAN_Y );
+            micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+
+            FETCH( &r[2], 0, CHAN_W );
+            micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
+            micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
+            micro_pow( &r[1], &r[1], &r[2] );
+            micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+            STORE( &r[0], 0, CHAN_Z );
+         }
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_RCP:
+   /* TGSI_OPCODE_RECIP */
+      FETCH( &r[0], 0, CHAN_X );
+      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RSQ: /* TGSI_OPCODE_RECIPSQRT */
+      FETCH( &r[0], 0, CHAN_X );
+      micro_abs( &r[0], &r[0] );   /* ARB RSQ is defined on |src.x| */
+      micro_sqrt( &r[0], &r[0] );
+      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_LOG:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
+      {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         micro_mul( &r[0], &r[0], &r[1] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_ADD:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_add( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP3:
+   /* TGSI_OPCODE_DOT3 */
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Z );
+      FETCH( &r[2], 1, CHAN_Z );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP4:
+   /* TGSI_OPCODE_DOT4 */
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 1, CHAN_Y);
+
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 0, CHAN_Z);
+      FETCH(&r[2], 1, CHAN_Z);
+
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 0, CHAN_W);
+      FETCH(&r[2], 1, CHAN_W);
+
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DST:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+         FETCH( &r[0], 0, CHAN_Y );
+         FETCH( &r[1], 1, CHAN_Y);
+         micro_mul( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, CHAN_Y );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         FETCH( &r[0], 0, CHAN_Z );
+         STORE( &r[0], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         FETCH( &r[0], 1, CHAN_W );
+         STORE( &r[0], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MIN:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         /* XXX use micro_min()?? */
+         micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_MAX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         /* XXX use micro_max()?? */
+         micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
+
+         STORE(&r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MAD:
+   /* TGSI_OPCODE_MADD */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_mul( &r[0], &r[0], &r[1] );
+         FETCH( &r[1], 2, chan_index );
+         micro_add( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         micro_sub( &r[0], &r[0], &r[1] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_LERP:
+   /* TGSI_OPCODE_LRP */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         micro_sub( &r[1], &r[1], &r[2] );
+         micro_mul( &r[0], &r[0], &r[1] );
+         micro_add( &r[0], &r[0], &r[2] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_CND:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CND0:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DOT2ADD:
+      /* TGSI_OPCODE_DP2A */
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_INDEX:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_NEGATE:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_FRAC:
+   /* TGSI_OPCODE_FRC */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_frc( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_FLOOR:
+   /* TGSI_OPCODE_FLR */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_flr( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_rnd( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXPBASE2:
+   /* TGSI_OPCODE_EX2 */
+      FETCH(&r[0], 0, CHAN_X);
+
+      micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LOGBASE2:
+   /* TGSI_OPCODE_LG2 */
+      FETCH( &r[0], 0, CHAN_X );
+      micro_lg2( &r[0], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_POWER:
+      /* TGSI_OPCODE_POW */
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      micro_pow( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CROSSPRODUCT:
+      /* TGSI_OPCODE_XPD */
+      FETCH(&r[0], 0, CHAN_Y);
+      FETCH(&r[1], 1, CHAN_Z);
+
+      micro_mul( &r[2], &r[0], &r[1] );
+
+      FETCH(&r[3], 0, CHAN_Z);
+      FETCH(&r[4], 1, CHAN_Y);
+
+      micro_mul( &r[5], &r[3], &r[4] );
+      micro_sub( &r[2], &r[2], &r[5] );
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+         STORE( &r[2], 0, CHAN_X );
+      }
+
+      FETCH(&r[2], 1, CHAN_X);
+
+      micro_mul( &r[3], &r[3], &r[2] );
+
+      FETCH(&r[5], 0, CHAN_X);
+
+      micro_mul( &r[1], &r[1], &r[5] );
+      micro_sub( &r[3], &r[3], &r[1] );
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+         STORE( &r[3], 0, CHAN_Y );
+      }
+
+      micro_mul( &r[5], &r[5], &r[4] );
+      micro_mul( &r[0], &r[0], &r[2] );
+      micro_sub( &r[5], &r[5], &r[0] );
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         STORE( &r[5], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MULTIPLYMATRIX:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ABS:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+
+         micro_abs( &r[0], &r[0] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_RCC:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DPH:
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 1, CHAN_Y);
+
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 0, CHAN_Z);
+      FETCH(&r[2], 1, CHAN_Z);
+
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 1, CHAN_W);
+
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      FETCH(&r[0], 0, CHAN_X);
+
+      micro_cos( &r[0], &r[0] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_ddx( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDY:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_ddy( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_KILP:
+      exec_kilp (mach, inst);
+      break;
+
+   case TGSI_OPCODE_KIL:
+      /* for enabled ExecMask bits, set the killed bit */
+      mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= mach->ExecMask;
+      break;
+
+   case TGSI_OPCODE_PK2H:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK2US:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK4B:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK4UB:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_RFL:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_eq( &r[0], &r[0], &r[1],
+                   &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
+                   &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SFL:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_SGT: /* dst = (src0 > src1) ? 1.0 : 0.0, computed as src1 < src0 */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_lt( &r[2], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[2], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SIN:
+      FETCH( &r[0], 0, CHAN_X );
+      micro_sin( &r[0], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLE: /* dst = (src0 <= src1) ? 1.0 : 0.0, computed as src1 >= src0 */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_ge( &r[2], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[2], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SNE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_STR:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TEX:
+      /* simple texture lookup */
+      /* src[0] = texcoord */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXB:
+      /* Texture lookup with lod bias */
+      /* src[0] = texcoord (src[0].w = LOD bias) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE);
+      break;
+
+   case TGSI_OPCODE_TXD:
+      /* Texture lookup with explicit partial derivatives */
+      /* src[0] = texcoord */
+      /* src[1] = d[strq]/dx */
+      /* src[2] = d[strq]/dy */
+      /* src[3] = sampler unit */
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXL:
+      /* Texture lookup with explicit LOD */
+      /* src[0] = texcoord (src[0].w = LOD; passed to exec_tex like a TXB bias) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE);
+      break;
+
+   case TGSI_OPCODE_UP2H:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP2US:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP4B:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP4UB:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_X2D:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ARA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ARR:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_BRA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CAL:
+      /* skip the call if no execution channels are enabled */
+      if (mach->ExecMask) {
+         /* do the call */
+
+         /* push the Cond, Loop, Cont stacks */
+         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+
+         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
+         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
+
+         /* note that PC was already incremented above */
+         mach->CallStack[mach->CallStackTop++] = *pc;
+         *pc = inst->InstructionExtLabel.Label;
+      }
+      break;
+
+   case TGSI_OPCODE_RET:
+      mach->FuncMask &= ~mach->ExecMask;
+      UPDATE_EXEC_MASK(mach);
+
+      if (mach->ExecMask == 0x0) {
+         /* really return now (otherwise, keep executing) */
+
+         if (mach->CallStackTop == 0) {
+            /* returning from main() */
+            *pc = -1;
+            return;
+         }
+         *pc = mach->CallStack[--mach->CallStackTop];
+
+         /* pop the Cond, Loop, Cont stacks */
+         assert(mach->CondStackTop > 0);
+         mach->CondMask = mach->CondStack[--mach->CondStackTop];
+         assert(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         assert(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+         assert(mach->FuncStackTop > 0);
+         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
+
+         UPDATE_EXEC_MASK(mach);
+      }
+      break;
+
+   case TGSI_OPCODE_SSG:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CMP:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_SCS:
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         FETCH( &r[0], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
+         micro_cos( &r[1], &r[0] );
+         STORE( &r[1], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         micro_sin( &r[1], &r[0] );
+         STORE( &r[1], 0, CHAN_Y );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_NRM:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DIV:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_DP2:
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_IF:
+      /* push CondMask */
+      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+      FETCH( &r[0], 0, CHAN_X );
+      /* update CondMask */
+      if( ! r[0].u[0] ) {
+         mach->CondMask &= ~0x1;
+      }
+      if( ! r[0].u[1] ) {
+         mach->CondMask &= ~0x2;
+      }
+      if( ! r[0].u[2] ) {
+         mach->CondMask &= ~0x4;
+      }
+      if( ! r[0].u[3] ) {
+         mach->CondMask &= ~0x8;
+      }
+      UPDATE_EXEC_MASK(mach);
+      /* Todo: If CondMask==0, jump to ELSE */
+      break;
+
+   case TGSI_OPCODE_ELSE:
+      /* invert CondMask wrt previous mask */
+      {
+         uint prevMask;
+         assert(mach->CondStackTop > 0);
+         prevMask = mach->CondStack[mach->CondStackTop - 1];
+         mach->CondMask = ~mach->CondMask & prevMask;
+         UPDATE_EXEC_MASK(mach);
+         /* Todo: If CondMask==0, jump to ENDIF */
+      }
+      break;
+
+   case TGSI_OPCODE_ENDIF:
+      /* pop CondMask */
+      assert(mach->CondStackTop > 0);
+      mach->CondMask = mach->CondStack[--mach->CondStackTop];
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_END:
+      /* halt execution */
+      *pc = -1;
+      break;
+
+   case TGSI_OPCODE_REP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ENDREP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PUSHA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_POPA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_ceil( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_I2F:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_i2f( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_NOT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_not( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_trunc( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_shl( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_ishr( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_AND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_and( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_OR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_or( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOD:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_XOR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_xor( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SAD:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXF:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXQ:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_EMIT:
+      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
+      break;
+
+   case TGSI_OPCODE_ENDPRIM:
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
+      break;
+
+   case TGSI_OPCODE_LOOP:
+      /* fall-through (for now) */
+   case TGSI_OPCODE_BGNLOOP2:
+      /* push LoopMask and ContMasks */
+      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+      break;
+
+   case TGSI_OPCODE_ENDLOOP:
+      /* fall-through (for now at least) */
+   case TGSI_OPCODE_ENDLOOP2:
+      /* Restore ContMask, but don't pop */
+      assert(mach->ContStackTop > 0);
+      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
+      if (mach->LoopMask) {
+         /* repeat loop: jump to instruction just past BGNLOOP */
+         *pc = inst->InstructionExtLabel.Label + 1;
+      }
+      else {
+         /* exit loop: pop LoopMask */
+         assert(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         /* pop ContMask */
+         assert(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+      }
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BRK:
+      /* turn off loop channels for each enabled exec channel */
+      mach->LoopMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_CONT:
+      /* turn off cont channels for each enabled exec channel */
+      mach->ContMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BGNSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_ENDSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_NOISE1:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE2:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE3:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE4:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOP:
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+
+/**
+ * Run TGSI interpreter.
+ * \return bitwise complement of the kill mask ("alive" quad components)
+ */
+uint
+spu_exec_machine_run( struct spu_exec_machine *mach )
+{
+   uint i, killed;
+   int pc;
+
+   /* every mask starts with the low four bits (0xf) set */
+   mach->ExecMask = 0xf;
+   mach->FuncMask = 0xf;
+   mach->ContMask = 0xf;
+   mach->LoopMask = 0xf;
+   mach->CondMask = 0xf;
+
+   mach->CondStackTop = 0; /* temporarily subvert this assertion */
+   assert(mach->CondStackTop == 0);
+   assert(mach->LoopStackTop == 0);
+   assert(mach->ContStackTop == 0);
+   assert(mach->CallStackTop == 0);
+
+   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
+   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
+      mach->Primitives[0] = 0;
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
+   }
+
+   /* execute declarations (interpolants) */
+   for (i = 0; i < mach->NumDeclarations; i++) {
+      exec_declaration( mach, &mach->Declarations[i] );
+   }
+
+   /* execute instructions, until pc is set to -1 */
+   for (pc = 0; pc != -1; ) {
+      assert(pc < mach->NumInstructions);
+      exec_instruction( mach, &mach->Instructions[pc], &pc );
+   }
+
+#if 0
+   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
+   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
+      /*
+       * Scale back depth component.
+       */
+      for (i = 0; i < 4; i++)
+         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
+   }
+#endif
+
+   killed = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+   return ~killed;
+}
+
+
diff --git a/src/mesa/pipe/cell/spu/spu_exec.h b/src/mesa/pipe/cell/spu/spu_exec.h
new file mode 100644
index 0000000000..89e422ba48
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_exec.h
@@ -0,0 +1,171 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#if !defined SPU_EXEC_H
+#define SPU_EXEC_H
+
+#include "pipe/p_compiler.h"
+#include "pipe/tgsi/exec/tgsi_exec.h"
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+/**
+  * Registers may be treated as float, signed int or unsigned int.
+  */
+union spu_exec_channel
+{
+   float    f[QUAD_SIZE];
+   int      i[QUAD_SIZE];
+   unsigned u[QUAD_SIZE];
+};
+
+/**
+  * A vector[RGBA] of channels[4 pixels]
+  */
+struct spu_exec_vector
+{
+   union spu_exec_channel xyzw[NUM_CHANNELS];
+};
+
+/**
+ * For fragment programs, information for computing fragment input
+ * values from plane equation of the triangle/line.
+ */
+struct spu_interp_coef
+{
+   float a0[NUM_CHANNELS];	/* in an xyzw layout */
+   float dadx[NUM_CHANNELS];
+   float dady[NUM_CHANNELS];
+};
+
+
+struct softpipe_tile_cache;  /**< Opaque to TGSI */
+
+/**
+ * Information for sampling textures, which must be implemented
+ * by code outside the TGSI executor.
+ */
+struct spu_sampler
+{
+   const struct pipe_sampler_state *state;
+   struct pipe_texture *texture;
+   /** Get samples for four fragments in a quad */
+   void (*get_samples)(struct spu_sampler *sampler,
+                       const float s[QUAD_SIZE],
+                       const float t[QUAD_SIZE],
+                       const float p[QUAD_SIZE],
+                       float lodbias,
+                       float rgba[NUM_CHANNELS][QUAD_SIZE]);
+   void *pipe; /*XXX temporary*/
+   struct softpipe_tile_cache *cache;
+};
+
+
+/**
+ * Run-time virtual machine state for executing TGSI shader.
+ */
+struct spu_exec_machine
+{
+   /*
+    * 32 program temporaries
+    * 4  internal temporaries
+    * 1  address
+    */
+   struct spu_exec_vector       Temps[TGSI_EXEC_NUM_TEMPS 
+				      + TGSI_EXEC_NUM_ADDRS + 1]
+       ALIGN16_ATTRIB;
+
+   struct spu_exec_vector       *Addrs;
+
+   struct spu_sampler           *Samplers;
+
+   float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
+   unsigned                      ImmLimit;
+   float                         (*Consts)[4];
+   struct spu_exec_vector       *Inputs;
+   struct spu_exec_vector       *Outputs;
+   unsigned                      Processor;
+
+   /* GEOMETRY processor only. */
+   unsigned                      *Primitives;
+
+   /* FRAGMENT processor only. */
+   const struct spu_interp_coef *InterpCoefs;
+   struct spu_exec_vector       QuadPos;
+
+   /* Conditional execution masks */
+   uint CondMask;  /**< For IF/ELSE/ENDIF */
+   uint LoopMask;  /**< For BGNLOOP/ENDLOOP */
+   uint ContMask;  /**< For loop CONT statements */
+   uint FuncMask;  /**< For function calls */
+   uint ExecMask;  /**< = CondMask & LoopMask */
+
+   /** Condition mask stack (for nested conditionals) */
+   uint CondStack[TGSI_EXEC_MAX_COND_NESTING];
+   int CondStackTop;
+
+   /** Loop mask stack (for nested loops) */
+   uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int LoopStackTop;
+
+   /** Loop continue mask stack (see comments in tgsi_exec.c) */
+   uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int ContStackTop;
+
+   /** Function execution mask stack (for executing subroutine code) */
+   uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int FuncStackTop;
+
+   /** Function call stack for saving/restoring the program counter */
+   uint CallStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int CallStackTop;
+
+   struct tgsi_full_instruction *Instructions;
+   uint NumInstructions;
+
+   struct tgsi_full_declaration *Declarations;
+   uint NumDeclarations;
+};
+
+
+extern void
+spu_exec_machine_init(struct spu_exec_machine *mach,
+                      uint numSamplers,
+                      struct spu_sampler *samplers,
+                      unsigned processor);
+
+extern uint
+spu_exec_machine_run( struct spu_exec_machine *mach );
+
+
+#if defined __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* SPU_EXEC_H */
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 6886f283be..9daa3ec735 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -36,6 +36,7 @@
 #include "spu_render.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
+#include "spu_vertex_shader.h"
 #include "pipe/cell/common.h"
 #include "pipe/p_defines.h"
 
@@ -50,6 +51,7 @@ boolean Debug = FALSE;
 
 struct spu_global spu;
 
+struct spu_vs_context draw;
 
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
@@ -264,6 +266,18 @@ cmd_state_vertex_info(const struct vertex_info *vinfo)
 }
 
 
+static void
+cmd_state_vs_array_info(const struct cell_array_info *vs_info)
+{
+   const unsigned attr = vs_info->attr;
+
+   ASSERT(attr < PIPE_ATTRIB_MAX);
+   draw.vertex_fetch.src_ptr[attr] = vs_info->base;
+   draw.vertex_fetch.pitch[attr] = vs_info->pitch;
+   draw.vertex_fetch.format[attr] = vs_info->format;
+   draw.vertex_fetch.dirty = 1;
+}
+
 
 static void
 cmd_finish(void)
@@ -374,6 +388,20 @@ cmd_batch(uint opcode)
          cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
          pos += (1 + sizeof(struct vertex_info) / 4);
          break;
+      case CELL_CMD_STATE_VIEWPORT:
+         (void) memcpy(& draw.viewport, &buffer[pos+1],
+                       sizeof(struct pipe_viewport_state));
+         pos += (1 + sizeof(struct pipe_viewport_state) / 4);
+         break;
+      case CELL_CMD_STATE_VS_ARRAY_INFO:
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+         pos += (1 + sizeof(struct cell_array_info) / 4);
+         break;
+      case CELL_CMD_VS_EXECUTE:
+         spu_execute_vertex_shader(&draw,
+                                   (struct cell_command_vs *) &buffer[pos+1]);
+         pos += (1 + sizeof(struct cell_command_vs) / 4);
+         break;
       default:
          printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, buffer[pos]);
          ASSERT(0);
diff --git a/src/mesa/pipe/cell/spu/spu_util.c b/src/mesa/pipe/cell/spu/spu_util.c
new file mode 100644
index 0000000000..ac373240c1
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_util.c
@@ -0,0 +1,165 @@
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/tgsi/util/tgsi_parse.h"
+//#include "tgsi_build.h"
+#include "pipe/tgsi/util/tgsi_util.h"
+
+unsigned
+tgsi_util_get_src_register_swizzle(
+   const struct tgsi_src_register *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->SwizzleX;
+   case 1:
+      return reg->SwizzleY;
+   case 2:
+      return reg->SwizzleZ;
+   case 3:
+      return reg->SwizzleW;
+   default:
+      assert( 0 );
+   }
+   return 0;
+}
+
+unsigned
+tgsi_util_get_src_register_extswizzle(
+   const struct tgsi_src_register_ext_swz *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->ExtSwizzleX;
+   case 1:
+      return reg->ExtSwizzleY;
+   case 2:
+      return reg->ExtSwizzleZ;
+   case 3:
+      return reg->ExtSwizzleW;
+   default:
+      assert( 0 );
+   }
+   return 0;
+}
+
+/* Resolve the effective source channel (or 0/1 constant) for `component`. */
+unsigned
+tgsi_util_get_full_src_register_extswizzle(
+   const struct tgsi_full_src_register  *reg,
+   unsigned component )
+{
+   unsigned swizzle;
+
+   /*
+    * First, calculate the extended swizzle for a given channel. This will give
+    * us either a channel index into the simple swizzle or a constant 1 or 0.
+    */
+   swizzle = tgsi_util_get_src_register_extswizzle(
+      &reg->SrcRegisterExtSwz,
+      component );
+
+   assert (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
+   assert (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
+   assert (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
+   assert (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
+   assert (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
+   assert (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
+
+   /*
+    * Second, look up the simple swizzle using the channel index from step one.
+    * Leave the constants intact, they are not affected by the simple swizzle.
+    */
+   if( swizzle <= TGSI_SWIZZLE_W ) {
+      swizzle = tgsi_util_get_src_register_swizzle(
+         &reg->SrcRegister,
+         swizzle );
+   }
+   return swizzle;
+}
+
+unsigned
+tgsi_util_get_src_register_extnegate(
+   const  struct tgsi_src_register_ext_swz *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->NegateX;
+   case 1:
+      return reg->NegateY;
+   case 2:
+      return reg->NegateZ;
+   case 3:
+      return reg->NegateW;
+   default:
+      assert( 0 );
+   }
+   return 0;
+}
+
+void
+tgsi_util_set_src_register_extnegate(
+   struct tgsi_src_register_ext_swz *reg,
+   unsigned negate,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      reg->NegateX = negate;
+      break;
+   case 1:
+      reg->NegateY = negate;
+      break;
+   case 2:
+      reg->NegateZ = negate;
+      break;
+   case 3:
+      reg->NegateW = negate;
+      break;
+   default:
+      assert( 0 );
+   }
+}
+
+unsigned
+tgsi_util_get_full_src_register_sign_mode(
+   const struct  tgsi_full_src_register *reg,
+   unsigned component )
+{
+   unsigned sign_mode;
+
+   if( reg->SrcRegisterExtMod.Absolute ) {
+      /* Consider only the post-abs negation. */
+
+      if( reg->SrcRegisterExtMod.Negate ) {
+         sign_mode = TGSI_UTIL_SIGN_SET;
+      }
+      else {
+         sign_mode = TGSI_UTIL_SIGN_CLEAR;
+      }
+   }
+   else {
+      /* Accumulate the three negations. */
+
+      unsigned negate;
+
+      negate = reg->SrcRegister.Negate;
+      if( tgsi_util_get_src_register_extnegate( &reg->SrcRegisterExtSwz, component ) ) {
+         negate = !negate;
+      }
+      if( reg->SrcRegisterExtMod.Negate ) {
+         negate = !negate;
+      }
+
+      if( negate ) {
+         sign_mode = TGSI_UTIL_SIGN_TOGGLE;
+      }
+      else {
+         sign_mode = TGSI_UTIL_SIGN_KEEP;
+      }
+   }
+
+   return sign_mode;
+}
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
new file mode 100644
index 0000000000..b8f8c52eed
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -0,0 +1,493 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "pipe/p_util.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "spu_exec.h"
+#include "spu_vertex_shader.h"
+
+
+#define DRAW_DBG 0
+
+
+/**
+ * Fetch a float[4] vertex attribute from memory, doing format/type
+ * conversion as needed.
+ *
+ * This is probably needed/duplicated elsewhere, eg format
+ * conversion, texture sampling etc.
+ */
+#define FETCH_ATTRIB( NAME, SZ, CVT )			\
+static void						\
+fetch_##NAME(const void *ptr, float *attrib)		\
+{							\
+   static const float defaults[4] = { 0,0,0,1 };	\
+   int i;						\
+							\
+   for (i = 0; i < SZ; i++) {				\
+      attrib[i] = CVT;					\
+   }							\
+							\
+   for (; i < 4; i++) {					\
+      attrib[i] = defaults[i];				\
+   }							\
+}
+
+#define CVT_64_FLOAT   (float) ((double *) ptr)[i]
+#define CVT_32_FLOAT   ((float *) ptr)[i]
+
+#define CVT_8_USCALED  (float) ((unsigned char *) ptr)[i]
+#define CVT_16_USCALED (float) ((unsigned short *) ptr)[i]
+#define CVT_32_USCALED (float) ((unsigned int *) ptr)[i]
+
+#define CVT_8_SSCALED  (float) ((signed char *) ptr)[i]  /* plain char's signedness is implementation-defined */
+#define CVT_16_SSCALED (float) ((short *) ptr)[i]
+#define CVT_32_SSCALED (float) ((int *) ptr)[i]
+
+#define CVT_8_UNORM    (float) ((unsigned char *) ptr)[i] / 255.0f
+#define CVT_16_UNORM   (float) ((unsigned short *) ptr)[i] / 65535.0f
+#define CVT_32_UNORM   (float) ((unsigned int *) ptr)[i] / 4294967295.0f
+
+#define CVT_8_SNORM    (float) ((signed char *) ptr)[i] / 127.0f  /* must read as signed, see CVT_8_SSCALED */
+#define CVT_16_SNORM   (float) ((short *) ptr)[i] / 32767.0f
+#define CVT_32_SNORM   (float) ((int *) ptr)[i] / 2147483647.0f
+
+FETCH_ATTRIB( R64G64B64A64_FLOAT,   4, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64B64_FLOAT,      3, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64_FLOAT,         2, CVT_64_FLOAT )
+FETCH_ATTRIB( R64_FLOAT,            1, CVT_64_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_FLOAT,   4, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32B32_FLOAT,      3, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32_FLOAT,         2, CVT_32_FLOAT )
+FETCH_ATTRIB( R32_FLOAT,            1, CVT_32_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_USCALED, 4, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32B32_USCALED,    3, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32_USCALED,       2, CVT_32_USCALED )
+FETCH_ATTRIB( R32_USCALED,          1, CVT_32_USCALED )
+
+FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32B32_SSCALED,    3, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32_SSCALED,       2, CVT_32_SSCALED )
+FETCH_ATTRIB( R32_SSCALED,          1, CVT_32_SSCALED )
+
+FETCH_ATTRIB( R32G32B32A32_UNORM, 4, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32B32_UNORM,    3, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32_UNORM,       2, CVT_32_UNORM )
+FETCH_ATTRIB( R32_UNORM,          1, CVT_32_UNORM )
+
+FETCH_ATTRIB( R32G32B32A32_SNORM, 4, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32B32_SNORM,    3, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32_SNORM,       2, CVT_32_SNORM )
+FETCH_ATTRIB( R32_SNORM,          1, CVT_32_SNORM )
+
+FETCH_ATTRIB( R16G16B16A16_USCALED, 4, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16B16_USCALED,    3, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16_USCALED,       2, CVT_16_USCALED )
+FETCH_ATTRIB( R16_USCALED,          1, CVT_16_USCALED )
+
+FETCH_ATTRIB( R16G16B16A16_SSCALED, 4, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16B16_SSCALED,    3, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16_SSCALED,       2, CVT_16_SSCALED )
+FETCH_ATTRIB( R16_SSCALED,          1, CVT_16_SSCALED )
+
+FETCH_ATTRIB( R16G16B16A16_UNORM, 4, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16B16_UNORM,    3, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16_UNORM,       2, CVT_16_UNORM )
+FETCH_ATTRIB( R16_UNORM,          1, CVT_16_UNORM )
+
+FETCH_ATTRIB( R16G16B16A16_SNORM, 4, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16B16_SNORM,    3, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16_SNORM,       2, CVT_16_SNORM )
+FETCH_ATTRIB( R16_SNORM,          1, CVT_16_SNORM )
+
+FETCH_ATTRIB( R8G8B8A8_USCALED,   4, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8B8_USCALED,     3, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8_USCALED,       2, CVT_8_USCALED )
+FETCH_ATTRIB( R8_USCALED,         1, CVT_8_USCALED )
+
+FETCH_ATTRIB( R8G8B8A8_SSCALED,  4, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8B8_SSCALED,    3, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8_SSCALED,      2, CVT_8_SSCALED )
+FETCH_ATTRIB( R8_SSCALED,        1, CVT_8_SSCALED )
+
+FETCH_ATTRIB( R8G8B8A8_UNORM,  4, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8B8_UNORM,    3, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8_UNORM,      2, CVT_8_UNORM )
+FETCH_ATTRIB( R8_UNORM,        1, CVT_8_UNORM )
+
+FETCH_ATTRIB( R8G8B8A8_SNORM,  4, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8B8_SNORM,    3, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8_SNORM,      2, CVT_8_SNORM )
+FETCH_ATTRIB( R8_SNORM,        1, CVT_8_SNORM )
+
+FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
+//FETCH_ATTRIB( R8G8B8A8_UNORM,       4, CVT_8_UNORM )
+
+
+
+static spu_fetch_func get_fetch_func( enum pipe_format format )
+{
+#if 0
+   {
+      char tmp[80];
+      pf_sprint_name(tmp, format);
+      _mesa_printf("%s: %s\n", __FUNCTION__, tmp);
+   }
+#endif
+
+   switch (format) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return fetch_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return fetch_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return fetch_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return fetch_R64G64B64A64_FLOAT;
+
+   case PIPE_FORMAT_R32_FLOAT:
+      return fetch_R32_FLOAT;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return fetch_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return fetch_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return fetch_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return fetch_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return fetch_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return fetch_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return fetch_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return fetch_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return fetch_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return fetch_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return fetch_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return fetch_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return fetch_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return fetch_R32G32B32A32_SNORM;
+
+   case PIPE_FORMAT_R32_SSCALED:
+      return fetch_R32_SSCALED;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return fetch_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return fetch_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return fetch_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return fetch_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return fetch_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return fetch_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return fetch_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return fetch_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return fetch_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return fetch_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return fetch_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return fetch_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return fetch_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return fetch_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return fetch_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return fetch_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return fetch_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return fetch_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return fetch_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return fetch_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return fetch_R8G8B8_UNORM;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return fetch_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return fetch_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return fetch_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return fetch_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return fetch_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return fetch_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return fetch_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return fetch_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return fetch_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return fetch_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return fetch_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return fetch_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return fetch_R8G8B8A8_SSCALED;
+
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return fetch_A8R8G8B8_UNORM;
+
+   case 0:
+      return NULL;		/* not sure why this is needed */
+
+   default:
+      assert(0);
+      return NULL;
+   }
+}
+
+
+static void 
+transpose_4x4( float *out, const float *in )
+{
+   /* This can be achieved in 12 sse instructions, plus the final
+    * stores I guess.  This is probably a bit more than that - maybe
+    * 32 or so?
+    */
+   out[0] = in[0];  out[1] = in[4];  out[2] = in[8];   out[3] = in[12];
+   out[4] = in[1];  out[5] = in[5];  out[6] = in[9];   out[7] = in[13];
+   out[8] = in[2];  out[9] = in[6];  out[10] = in[10]; out[11] = in[14];
+   out[12] = in[3]; out[13] = in[7]; out[14] = in[11]; out[15] = in[15];
+}
+
+
+
+/**
+ * Fast path: fetch float[3] xyz and float[3] rgb attributes for 'count'
+ * vertices into machine->Inputs[0] and Inputs[1], with w forced to 1.
+ */
+static void fetch_xyz_rgb( struct spu_vs_context *draw,
+			   struct spu_exec_machine *machine,
+			   const unsigned *elts,
+			   unsigned count )
+{
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   unsigned i;
+
+   assert(count <= 4);
+
+   /* Only elts[0..count-1] are valid, so never index past 'count'. */
+   for (i = 0; i < count; i++) {
+      {
+	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+	 float *out = &machine->Inputs[0].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+   }
+}
+
+
+
+
+/**
+ * Fast path: fetch xyz, rgb and st attributes for 'count' vertices into
+ * machine->Inputs[0..2]; only elts[0..count-1] are read.
+ */
+static void fetch_xyz_rgb_st( struct spu_vs_context *draw,
+			      struct spu_exec_machine *machine,
+			      const unsigned *elts,
+			      unsigned count )
+{
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   unsigned i;
+
+   assert(count <= 4);
+   for (i = 0; i < count; i++) {
+      {
+	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+	 float *out = &machine->Inputs[0].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[2] + elts[i] * pitch[2]);
+	 float *out = &machine->Inputs[2].xyzw[0].f[i];  /* third attribute -> input 2, not 1 */
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = 0.0f;
+ 	 out[12] = 1.0f;
+      }
+   }
+}
+
+
+
+
+/**
+ * Fetch vertex attributes for 'count' vertices.
+ */
+static void generic_vertex_fetch( struct spu_vs_context *draw,
+				  struct spu_exec_machine *machine,
+				  const unsigned *elts,
+				  unsigned count )
+{
+   unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
+   unsigned attr;
+
+   assert(count <= 4);
+
+//   _mesa_printf("%s %d\n", __FUNCTION__, count);
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+   for (attr = 0; attr < nr_attrs; attr++) {
+
+      const unsigned pitch   = draw->vertex_fetch.pitch[attr];
+      const ubyte *src = draw->vertex_fetch.src_ptr[attr];
+      const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
+      unsigned i;
+      float p[4][4];
+
+
+      /* Fetch four attributes for four vertices.  
+       * 
+       * Could fetch directly into AOS format, but this is meant to be
+       * a prototype for an sse implementation, which would have
+       * difficulties doing that.
+       */
+      for (i = 0; i < count; i++) 
+	 fetch( src + elts[i] * pitch, p[i] );
+
+      /* Be nice and zero out any missing vertices: 
+       */
+      for (/* empty */; i < 4; i++) 
+	 p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
+      
+      /* Transpose/swizzle into sse-friendly format.  Currently
+       * assuming that all vertex shader inputs are float[4], but this
+       * isn't true -- if the vertex shader only wants tex0.xy, we
+       * could optimize for that.
+       *
+       * To do so fully without codegen would probably require an
+       * excessive number of fetch functions, but we could at least
+       * minimize the transpose step:
+       */
+      transpose_4x4( (float *)&machine->Inputs[attr].xyzw[0].f[0], (float *)p );
+   }
+}
+
+
+void spu_update_vertex_fetch( struct spu_vs_context *draw )
+{
+   unsigned i;
+
+   
+   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
+      draw->vertex_fetch.fetch[i] =
+          get_fetch_func(draw->vertex_fetch.format[i]);
+   }
+
+   draw->vertex_fetch.fetch_func = generic_vertex_fetch;
+
+   switch (draw->vertex_fetch.nr_attrs) {
+   case 2:
+      if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
+          draw->vertex_fetch.format[1] == PIPE_FORMAT_R32G32B32_FLOAT)
+          draw->vertex_fetch.fetch_func = fetch_xyz_rgb;
+      break;
+   case 3:
+      if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
+          draw->vertex_fetch.format[1] == PIPE_FORMAT_R32G32B32_FLOAT &&
+          draw->vertex_fetch.format[2] == PIPE_FORMAT_R32G32_FLOAT)
+          draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
+      break;
+   default:
+      break;
+   }
+}
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
new file mode 100644
index 0000000000..e694ff729f
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -0,0 +1,224 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Brian Paul
+  *   Ian Romanick <idr@us.ibm.com>
+  */
+
+#include "pipe/p_util.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "spu_vertex_shader.h"
+#include "spu_exec.h"
+#include "pipe/draw/draw_private.h"
+#include "pipe/draw/draw_context.h"
+#include "pipe/cell/common.h"
+
+#define DBG_VS 0
+
+
+static INLINE unsigned
+compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
+{
+   unsigned mask = 0;
+   unsigned i;
+
+   /* Do the hardwired planes first:
+    */
+   if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT;
+   if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT;
+   if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT;
+   if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT;
+   if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT;
+   if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT;
+
+   /* Followed by any remaining ones:
+    */
+   for (i = 6; i < nr; i++) {
+      if (dot4(clip, plane[i]) < 0) 
+         mask |= (1<<i);
+   }
+
+   return mask;
+}
+
+
+/**
+ * Transform vertices with the current vertex program/shader
+ * Up to four vertices can be shaded at a time.
+ * \param draw  the vertex shader context (fetch state, viewport, clip planes)
+ * \param elts  indexes of four input vertices
+ * \param count  number of vertices to shade [1..4]
+ * \param vOut  array of pointers to four output vertices
+ */
+static void
+run_vertex_program(struct spu_vs_context *draw,
+                   unsigned elts[4], unsigned count,
+                   struct vertex_header *vOut[])
+{
+   struct spu_exec_machine *machine = &draw->machine;
+   unsigned int j;
+
+   ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_ATTRIB_MAX);
+   ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_ATTRIB_MAX);
+   const float *scale = draw->viewport.scale;
+   const float *trans = draw->viewport.translate;
+
+   assert(count <= 4);
+
+   /* Consts does not require 16 byte alignment. */
+   ASSERT_ALIGN16(draw->constants);
+   machine->Consts = (float (*)[4]) draw->constants;
+
+   machine->Inputs = ALIGN16_ASSIGN(inputs);
+   machine->Outputs = ALIGN16_ASSIGN(outputs);
+
+   spu_vertex_fetch( draw, machine, elts, count );
+
+   /* run shader */
+   spu_exec_machine_run( machine );
+
+
+   /* store machine results */
+   for (j = 0; j < count; j++) {
+      unsigned slot;
+      float x, y, z, w;
+
+      /* Handle attr[0] (position) specially:
+       *
+       * XXX: Computing the clipmask should be done in the vertex
+       * program as a set of DP4 instructions appended to the
+       * user-provided code.
+       */
+      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+
+      vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane,
+					   draw->nr_planes);
+      vOut[j]->edgeflag = 1;
+
+      /* divide by w */
+      w = 1.0f / w;
+      x *= w;
+      y *= w;
+      z *= w;
+
+      /* Viewport mapping */
+      vOut[j]->data[0][0] = x * scale[0] + trans[0];
+      vOut[j]->data[0][1] = y * scale[1] + trans[1];
+      vOut[j]->data[0][2] = z * scale[2] + trans[2];
+      vOut[j]->data[0][3] = w;
+
+#if DBG_VS
+      printf("output[%d]win: %f %f %f %f\n", j,
+             vOut[j]->data[0][0],
+             vOut[j]->data[0][1],
+             vOut[j]->data[0][2],
+             vOut[j]->data[0][3]);
+#endif
+      /* Remaining attributes are packed into sequential post-transform
+       * vertex attrib slots.
+       */
+      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
+         vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+         vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+         vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+         vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+#if DBG_VS
+         printf("output[%d][%d]: %f %f %f %f\n", j, slot,
+                vOut[j]->data[slot][0],
+                vOut[j]->data[slot][1],
+                vOut[j]->data[slot][2],
+                vOut[j]->data[slot][3]);
+#endif
+      }
+   } /* loop over vertices */
+}
+
+
+/* Bind constants and clip planes; NULL/oversized plane sets mean "none". */
+static void
+spu_bind_vertex_shader(struct spu_vs_context *draw,
+		       void *uniforms, void *planes,
+		       unsigned nr_planes, unsigned num_outputs)
+{
+   draw->constants = (float (*)[4]) uniforms;
+
+   /* Callers may pass (NULL, -1): never copy from NULL or past plane[]. */
+   if (planes == NULL || nr_planes > sizeof(draw->plane) / sizeof(draw->plane[0]))
+      nr_planes = 0;
+   else
+      (void) memcpy(draw->plane, planes, sizeof(draw->plane[0]) * nr_planes);
+   draw->nr_planes = nr_planes;
+   draw->num_vs_outputs = num_outputs;
+
+   /* specify the shader to interpret/execute */
+   spu_exec_machine_init(&draw->machine, PIPE_MAX_SAMPLERS,
+			 NULL /*samplers*/, PIPE_SHADER_VERTEX);
+}
+
+
+void
+spu_execute_vertex_shader(struct spu_vs_context *draw,
+			  const struct cell_command_vs *vs)
+{
+   unsigned i;
+   unsigned j;
+
+   draw->machine.Instructions = (struct tgsi_full_instruction *)
+       vs->shader.instructions;
+   draw->machine.NumInstructions = vs->shader.num_instructions;
+
+   draw->machine.Declarations = (struct tgsi_full_declaration *)
+       vs->shader.declarations;
+   draw->machine.NumDeclarations = vs->shader.num_declarations;
+
+   spu_bind_vertex_shader(draw, vs->shader.uniforms,
+			  NULL, -1,
+			  vs->shader.num_outputs);
+   
+   for (i = 0; i < vs->num_elts; i += 4) {
+      const unsigned batch_size = MIN2(vs->num_elts - i, 4);
+      unsigned elts[4];
+
+      for (j = 0; j < batch_size; j++) {
+	 switch (vs->bytes_per_elt) {
+	 case 1: elts[j] = ((unsigned char *) vs->elts)[i + j]; break;
+	 case 2: elts[j] = ((unsigned short *)vs->elts)[i + j]; break;
+	 case 4: elts[j] = ((unsigned int *)  vs->elts)[i + j]; break;
+	 }
+      }
+
+      run_vertex_program(draw, elts, batch_size,
+			 (struct vertex_header (*)[]) vs->vOut);
+   }
+}
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
new file mode 100644
index 0000000000..c52f38fd02
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -0,0 +1,61 @@
+#ifndef SPU_VERTEX_SHADER_H
+#define SPU_VERTEX_SHADER_H
+
+#include "pipe/p_format.h"
+#include "spu_exec.h"
+
+struct spu_vs_context;
+
+typedef void (*spu_fetch_func)(const void *ptr, float *attrib);
+typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
+				     struct spu_exec_machine *machine,
+				     const unsigned *elts,
+				     unsigned count );
+
+struct spu_vs_context {
+   struct pipe_viewport_state viewport;
+
+   struct {
+      const ubyte *src_ptr[PIPE_ATTRIB_MAX];
+      unsigned pitch[PIPE_ATTRIB_MAX];
+      enum pipe_format format[PIPE_ATTRIB_MAX];
+      unsigned nr_attrs;
+      boolean dirty;
+
+      spu_fetch_func fetch[PIPE_ATTRIB_MAX];
+      spu_full_fetch_func fetch_func;
+   } vertex_fetch;
+   
+   /* Clip derived state:
+    */
+   float plane[12][4];
+   unsigned nr_planes;
+
+   struct spu_exec_machine machine;
+   const float (*constants)[4];
+
+   unsigned num_vs_outputs;
+};
+
+extern void spu_update_vertex_fetch(struct spu_vs_context *draw);
+
+static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
+				    struct spu_exec_machine *machine,
+				    const unsigned *elts,
+				    unsigned count)
+{
+   if (draw->vertex_fetch.dirty) {
+      spu_update_vertex_fetch(draw);
+      draw->vertex_fetch.dirty = 0;
+   }
+   
+   (*draw->vertex_fetch.fetch_func)(draw, machine, elts, count);
+}
+
+struct cell_command_vs;
+
+extern void
+spu_execute_vertex_shader(struct spu_vs_context *draw,
+			  const struct cell_command_vs *vs);
+
+#endif /* SPU_VERTEX_SHADER_H */
-- 
cgit v1.2.3


From 5028f0fcaca0d3a521f7ec130f4bbea2600bce16 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 29 Jan 2008 10:56:53 -0800
Subject: Initial pass at instruction / declaration fetch

---
 src/mesa/pipe/cell/spu/spu_exec.c | 22 ++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_main.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 6888e97caf..f43278198e 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -50,6 +50,9 @@
  *   Brian Paul
  */
 
+#include <libmisc.h>
+#include <spu_mfcio.h>
+
 #include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
 #include "pipe/p_util.h"
@@ -57,6 +60,7 @@
 #include "pipe/tgsi/util/tgsi_parse.h"
 #include "pipe/tgsi/util/tgsi_util.h"
 #include "spu_exec.h"
+#include "spu_main.h"
 
 #define TILE_TOP_LEFT     0
 #define TILE_TOP_RIGHT    1
@@ -2329,12 +2333,30 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
 
    /* execute declarations (interpolants) */
    for (i = 0; i < mach->NumDeclarations; i++) {
+      uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB;
+      struct tgsi_full_declaration decl;
+      unsigned long decl_addr = (unsigned long) (mach->Declarations+i);
+      unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f);
+
+      mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
+      wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
+
+      memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
       exec_declaration( mach, mach->Declarations+i );
    }
 
    /* execute instructions, until pc is set to -1 */
    while (pc != -1) {
+      uint8_t buffer[sizeof(struct tgsi_full_instruction) + 32] ALIGN16_ATTRIB;
+      struct tgsi_full_instruction inst;
+      unsigned long inst_addr = (unsigned long) (mach->Instructions + pc);
+      unsigned size = ((sizeof(inst) + (inst_addr & 0x0f) + 0x0f) & ~0x0f);
+
       assert(pc < mach->NumInstructions);
+      mfc_get(buffer, inst_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
+      wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
+
+      memcpy(& inst, buffer + (inst_addr & 0x0f), sizeof(inst));
       exec_instruction( mach, mach->Instructions + pc, &pc );
    }
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 73f9ed29d6..8be5268f52 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -96,6 +96,7 @@ extern boolean Debug;
 #define TAG_BATCH_BUFFER      17
 #define TAG_MISC              18
 #define TAG_TEXTURE_TILE      19
+#define TAG_INSTRUCTION_FETCH 20
 
 
-- 
cgit v1.2.3


From fc4620554a3eed2a4032d9f6bd349acfd152682c Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 29 Jan 2008 11:28:06 -0800
Subject: Implement vertex fetch / vertex shader output write-back

---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c  | 32 +++++++++++----
 src/mesa/pipe/cell/spu/spu_vertex_shader.c | 62 +++++++++++++++---------------
 src/mesa/pipe/draw/draw_context.c          |  5 ++-
 3 files changed, 58 insertions(+), 41 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index b8f8c52eed..0192227d57 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -30,11 +30,13 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
+#include <spu_mfcio.h>
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
 #include "spu_exec.h"
 #include "spu_vertex_shader.h"
+#include "spu_main.h"
 
 
 #define DRAW_DBG 0
@@ -412,16 +414,18 @@ static void fetch_xyz_rgb_st( struct spu_vs_context *draw,
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
-static void generic_vertex_fetch( struct spu_vs_context *draw,
-				  struct spu_exec_machine *machine,
-				  const unsigned *elts,
-				  unsigned count )
+static void generic_vertex_fetch(struct spu_vs_context *draw,
+                                 struct spu_exec_machine *machine,
+                                 const unsigned *elts,
+                                 unsigned count)
 {
    unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
    unsigned attr;
 
    assert(count <= 4);
 
+   wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
 //   _mesa_printf("%s %d\n", __FUNCTION__, count);
 
    /* loop over vertex attributes (vertex shader inputs)
@@ -441,13 +445,23 @@ static void generic_vertex_fetch( struct spu_vs_context *draw,
        * a prototype for an sse implementation, which would have
        * difficulties doing that.
        */
-      for (i = 0; i < count; i++) 
-	 fetch( src + elts[i] * pitch, p[i] );
+      for (i = 0; i < count; i++) {
+         uint8_t buffer[32 + (sizeof(float) * 4)] ALIGN16_ATTRIB;
+         const unsigned long addr = src + elts[i] * pitch;
+         const unsigned size = (sizeof(float) * 4) + (addr & 0x0f);
+
+         mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
+         wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+         memcpy(& buffer, buffer + (addr & 0x0f), sizeof(float) * 4);
+
+         fetch(buffer, p[i]);
+      }
 
       /* Be nice and zero out any missing vertices: 
        */
       for (/* empty */; i < 4; i++) 
-	 p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
+          p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
       
       /* Transpose/swizzle into sse-friendly format.  Currently
        * assuming that all vertex shader inputs are float[4], but this
@@ -475,6 +489,9 @@ void spu_update_vertex_fetch( struct spu_vs_context *draw )
 
    draw->vertex_fetch.fetch_func = generic_vertex_fetch;
 
+   /* Disable the fast path because they don't use mfc_get yet.
+    */
+#if 0
    switch (draw->vertex_fetch.nr_attrs) {
    case 2:
       if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
@@ -490,4 +507,5 @@ void spu_update_vertex_fetch( struct spu_vs_context *draw )
    default:
       break;
    }
+#endif
 }
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
index e694ff729f..595f54b0eb 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -32,6 +32,8 @@
   *   Ian Romanick <idr@us.ibm.com>
   */
 
+#include <spu_mfcio.h>
+
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
@@ -40,9 +42,7 @@
 #include "pipe/draw/draw_private.h"
 #include "pipe/draw/draw_context.h"
 #include "pipe/cell/common.h"
-
-#define DBG_VS 0
-
+#include "spu_main.h"
 
 static INLINE unsigned
 compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
@@ -110,6 +110,12 @@ run_vertex_program(struct spu_vs_context *draw,
    for (j = 0; j < count; j++) {
       unsigned slot;
       float x, y, z, w;
+      unsigned char buffer[sizeof(struct vertex_header)
+			   + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
+      struct vertex_header *const tmpOut =
+	  (struct vertex_header *) buffer;
+      const unsigned vert_size = sizeof(struct vertex_header)
+	  + (sizeof(float) * 4 * draw->num_vs_outputs);
 
       /* Handle attr[0] (position) specially:
        *
@@ -117,14 +123,14 @@ run_vertex_program(struct spu_vs_context *draw,
        * program as a set of DP4 instructions appended to the
        * user-provided code.
        */
-      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
-      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
-      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
-      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+      x = tmpOut->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+      y = tmpOut->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+      z = tmpOut->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+      w = tmpOut->clip[3] = machine->Outputs[0].xyzw[3].f[j];
 
-      vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane,
+      tmpOut->clipmask = compute_clipmask(tmpOut->clip, draw->plane,
 					   draw->nr_planes);
-      vOut[j]->edgeflag = 1;
+      tmpOut->edgeflag = 1;
 
       /* divide by w */
       w = 1.0f / w;
@@ -133,35 +139,27 @@ run_vertex_program(struct spu_vs_context *draw,
       z *= w;
 
       /* Viewport mapping */
-      vOut[j]->data[0][0] = x * scale[0] + trans[0];
-      vOut[j]->data[0][1] = y * scale[1] + trans[1];
-      vOut[j]->data[0][2] = z * scale[2] + trans[2];
-      vOut[j]->data[0][3] = w;
-
-#if DBG_VS
-      printf("output[%d]win: %f %f %f %f\n", j,
-             vOut[j]->data[0][0],
-             vOut[j]->data[0][1],
-             vOut[j]->data[0][2],
-             vOut[j]->data[0][3]);
-#endif
+      tmpOut->data[0][0] = x * scale[0] + trans[0];
+      tmpOut->data[0][1] = y * scale[1] + trans[1];
+      tmpOut->data[0][2] = z * scale[2] + trans[2];
+      tmpOut->data[0][3] = w;
+
       /* Remaining attributes are packed into sequential post-transform
        * vertex attrib slots.
        */
       for (slot = 1; slot < draw->num_vs_outputs; slot++) {
-         vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-         vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-         vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-         vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
-#if DBG_VS
-         printf("output[%d][%d]: %f %f %f %f\n", j, slot,
-                vOut[j]->data[slot][0],
-                vOut[j]->data[slot][1],
-                vOut[j]->data[slot][2],
-                vOut[j]->data[slot][3]);
-#endif
+         tmpOut->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+         tmpOut->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+         tmpOut->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+         tmpOut->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
       }
+
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+      mfc_put(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
+
    } /* loop over vertices */
+
+   wait_on_mask(1 << TAG_VERTEX_BUFFER);
 }
 
 
diff --git a/src/mesa/pipe/draw/draw_context.c b/src/mesa/pipe/draw/draw_context.c
index e8ca1f035b..711bcd02f6 100644
--- a/src/mesa/pipe/draw/draw_context.c
+++ b/src/mesa/pipe/draw/draw_context.c
@@ -71,10 +71,11 @@ struct draw_context *draw_create( void )
     */
    {
       uint i;
-      char *tmp = (char*) MALLOC( Elements(draw->vcache.vertex) * MAX_VERTEX_SIZE );
+      const unsigned size = (MAX_VERTEX_SIZE + 0x0f) & ~0x0f;
+      char *tmp = align_malloc(Elements(draw->vcache.vertex) * size, 16);
 
       for (i = 0; i < Elements(draw->vcache.vertex); i++)
-	 draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * MAX_VERTEX_SIZE);
+	 draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * size);
    }
 
    draw->convert_wide_points = TRUE;
-- 
cgit v1.2.3


From d798e7e2689338918218bbde5b8a5d68e3ca8c22 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 29 Jan 2008 11:43:04 -0800
Subject: Provide mechanism to hook in custom vertex shader cache flush
 function

---
 src/mesa/pipe/draw/draw_context.c | 2 ++
 src/mesa/pipe/draw/draw_prim.c    | 2 +-
 src/mesa/pipe/draw/draw_private.h | 5 +++++
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/mesa/pipe/draw/draw_context.c b/src/mesa/pipe/draw/draw_context.c
index 711bcd02f6..87f4969983 100644
--- a/src/mesa/pipe/draw/draw_context.c
+++ b/src/mesa/pipe/draw/draw_context.c
@@ -78,6 +78,8 @@ struct draw_context *draw_create( void )
 	 draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * size);
    }
 
+   draw->shader_queue_flush = draw_vertex_shader_queue_flush;
+
    draw->convert_wide_points = TRUE;
    draw->convert_wide_lines = TRUE;
 
diff --git a/src/mesa/pipe/draw/draw_prim.c b/src/mesa/pipe/draw/draw_prim.c
index 41b3fddcc1..58400213d7 100644
--- a/src/mesa/pipe/draw/draw_prim.c
+++ b/src/mesa/pipe/draw/draw_prim.c
@@ -127,7 +127,7 @@ void draw_do_flush( struct draw_context *draw, unsigned flags )
 
    if (flags >= DRAW_FLUSH_SHADER_QUEUE) {
       if (draw->vs.queue_nr)
-	 draw_vertex_shader_queue_flush(draw);
+         (*draw->shader_queue_flush)(draw);
 
       if (flags >= DRAW_FLUSH_PRIM_QUEUE) {
 	 if (draw->pq.queue_nr)
diff --git a/src/mesa/pipe/draw/draw_private.h b/src/mesa/pipe/draw/draw_private.h
index 21de400676..fea6d94ed8 100644
--- a/src/mesa/pipe/draw/draw_private.h
+++ b/src/mesa/pipe/draw/draw_private.h
@@ -240,6 +240,11 @@ struct draw_context
       unsigned queue_nr;
    } vs;
 
+   /**
+    * Run the vertex shader on all vertices in the vertex queue.
+    */
+   void (*shader_queue_flush)(struct draw_context *draw);
+
    /* Prim pipeline queue:
     */
    struct {
-- 
cgit v1.2.3


From 7abddcf123f5726ea8b20ffb53100524a5da55d8 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 10:43:23 -0800
Subject: Pass ptr to local memory copy instead of main memory to
 exec_instruction

This was essentially a cut-and-paste bug when the instruction fetcher
was added.  Also, the test for TGSI_PROCESSOR_FRAGMENT was moved
outside the loop for exec_declaration.
---
 src/mesa/pipe/cell/spu/spu_exec.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index f43278198e..b3db6716d5 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -2332,17 +2332,19 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
 
 
    /* execute declarations (interpolants) */
-   for (i = 0; i < mach->NumDeclarations; i++) {
-      uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB;
-      struct tgsi_full_declaration decl;
-      unsigned long decl_addr = (unsigned long) (mach->Declarations+i);
-      unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f);
+   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
+      for (i = 0; i < mach->NumDeclarations; i++) {
+	 uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB;
+	 struct tgsi_full_declaration decl;
+	 unsigned long decl_addr = (unsigned long) (mach->Declarations+i);
+	 unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f);
 
-      mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
-      wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
+	 mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
+	 wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
 
-      memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
-      exec_declaration( mach, mach->Declarations+i );
+	 memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
+	 exec_declaration( mach, decl );
+      }
    }
 
    /* execute instructions, until pc is set to -1 */
@@ -2357,7 +2359,7 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
       wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
 
       memcpy(& inst, buffer + (inst_addr & 0x0f), sizeof(inst));
-      exec_instruction( mach, mach->Instructions + pc, &pc );
+      exec_instruction( mach, & inst, &pc );
    }
 
 #if 0
-- 
cgit v1.2.3


From c1ffb57cdf817934470a2115f6bdca148bdae269 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 10:46:55 -0800
Subject: Missing ampersand in previous commit.  Oops.

---
 src/mesa/pipe/cell/spu/spu_exec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index b3db6716d5..85b5815cad 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -2343,7 +2343,7 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
 	 wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
 
 	 memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
-	 exec_declaration( mach, decl );
+	 exec_declaration( mach, &decl );
       }
    }
 
-- 
cgit v1.2.3


From ea1d5c43b28f16d5ff3bcc750d46143a35597a9e Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 12:59:09 -0800
Subject: Fetch uniforms from main memory.

---
 src/mesa/pipe/cell/spu/spu_exec.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 85b5815cad..78f7d0962f 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -791,12 +791,23 @@ fetch_src_file_channel(
    case TGSI_EXTSWIZZLE_Z:
    case TGSI_EXTSWIZZLE_W:
       switch( file ) {
-      case TGSI_FILE_CONSTANT:
-         chan->f[0] = mach->Consts[index->i[0]][swizzle];
-         chan->f[1] = mach->Consts[index->i[1]][swizzle];
-         chan->f[2] = mach->Consts[index->i[2]][swizzle];
-         chan->f[3] = mach->Consts[index->i[3]][swizzle];
+      case TGSI_FILE_CONSTANT: {
+         unsigned char buffer[32] ALIGN16_ATTRIB;
+         unsigned i;
+
+         for (i = 0; i < 4; i++) {
+            const float *ptr = mach->Consts[index->i[i]];
+            const uint64_t addr = (uint64_t)(uintptr_t) ptr;
+            const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
+
+            mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
+            wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+            (void) memcpy(& chan->f[i], &buffer[(addr & 0x0f) 
+                + (sizeof(float) * swizzle)], sizeof(float));
+         }
          break;
+      }
 
       case TGSI_FILE_INPUT:
          chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
-- 
cgit v1.2.3


From 805aacfe604eaf9b414336318b8e170dd898123c Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 17:26:22 -0800
Subject: Fix size calculation in attribute fetch.

---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 0192227d57..1e846868e3 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -446,14 +446,14 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
        * difficulties doing that.
        */
       for (i = 0; i < count; i++) {
-         uint8_t buffer[32 + (sizeof(float) * 4)] ALIGN16_ATTRIB;
-         const unsigned long addr = src + elts[i] * pitch;
-         const unsigned size = (sizeof(float) * 4) + (addr & 0x0f);
+         uint8_t buffer[32] ALIGN16_ATTRIB;
+         const unsigned long addr = src + (elts[i] * pitch);
+         const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
 
          mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
          wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
-         memcpy(& buffer, buffer + (addr & 0x0f), sizeof(float) * 4);
+         memmove(& buffer, buffer + (addr & 0x0f), 16);
 
          fetch(buffer, p[i]);
       }
-- 
cgit v1.2.3


From bbbd5c166a780d70110d236d40d3babd9d0b6346 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 17:28:48 -0800
Subject: Implement micro_pow and micro_sqrt

Unimplemented micro ops get assertions for now.
---
 src/mesa/pipe/cell/spu/spu_exec.c | 43 ++++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 78f7d0962f..168bada3bb 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -52,6 +52,8 @@
 
 #include <libmisc.h>
 #include <spu_mfcio.h>
+#include <simdmath/sqrtf4.h>
+#include <simdmath/powf4.h>
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
@@ -207,6 +209,7 @@ micro_ceil(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) ceil( (double) src->f[0] );
    dst->f[1] = (float) ceil( (double) src->f[1] );
@@ -220,6 +223,7 @@ micro_cos(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) cos( (double) src->f[0] );
    dst->f[1] = (float) cos( (double) src->f[1] );
@@ -307,6 +311,7 @@ micro_exp2(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src)
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) pow( 2.0, (double) src->f[0] );
    dst->f[1] = (float) pow( 2.0, (double) src->f[1] );
@@ -342,6 +347,7 @@ micro_flr(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) floor( (double) src->f[0] );
    dst->f[1] = (float) floor( (double) src->f[1] );
@@ -355,6 +361,7 @@ micro_frc(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] );
    dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] );
@@ -393,6 +400,7 @@ micro_lg2(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f;
    dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f;
@@ -649,12 +657,18 @@ micro_pow(
    const union spu_exec_channel *src0,
    const union spu_exec_channel *src1 )
 {
-#if 0
-   dst->f[0] = (float) pow( (double) src0->f[0], (double) src1->f[0] );
-   dst->f[1] = (float) pow( (double) src0->f[1], (double) src1->f[1] );
-   dst->f[2] = (float) pow( (double) src0->f[2], (double) src1->f[2] );
-   dst->f[3] = (float) pow( (double) src0->f[3], (double) src1->f[3] );
-#endif
+   vec_float4 s0 = (vec_float4) {
+      src0->f[0], src0->f[1], src0->f[2], src0->f[3]
+   };
+   vec_float4 s1 = (vec_float4) {
+      src1->f[0], src1->f[1], src1->f[2], src1->f[3]
+   };
+   vec_float4 d = _powf4(s0, s1);
+
+   dst->f[0] = spu_extract(d, 0);
+   dst->f[1] = spu_extract(d, 1);
+   dst->f[2] = spu_extract(d, 2);
+   dst->f[3] = spu_extract(d, 3);
 }
 
 static void
@@ -662,6 +676,7 @@ micro_rnd(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) );
    dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) );
@@ -722,6 +737,7 @@ micro_sin(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) sin( (double) src->f[0] );
    dst->f[1] = (float) sin( (double) src->f[1] );
@@ -734,12 +750,15 @@ static void
 micro_sqrt( union spu_exec_channel *dst,
             const union spu_exec_channel *src )
 {
-#if 0
-   dst->f[0] = (float) sqrt( (double) src->f[0] );
-   dst->f[1] = (float) sqrt( (double) src->f[1] );
-   dst->f[2] = (float) sqrt( (double) src->f[2] );
-   dst->f[3] = (float) sqrt( (double) src->f[3] );
-#endif
+   vec_float4 s = (vec_float4) {
+      src->f[0], src->f[1], src->f[2], src->f[3]
+   };
+   vec_float4 d = _sqrtf4(s);
+
+   dst->f[0] = spu_extract(d, 0);
+   dst->f[1] = spu_extract(d, 1);
+   dst->f[2] = spu_extract(d, 2);
+   dst->f[3] = spu_extract(d, 3);
 }
 
 static void
-- 
cgit v1.2.3


From dcfe7e1dca656dd897b7b0bdebbed3cee6f9cfd9 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:24:40 -0800
Subject: Elts are always ints, pass vOut pointers in-line in command

---
 src/mesa/pipe/cell/common.h                |  6 +++---
 src/mesa/pipe/cell/spu/spu_vertex_shader.c | 14 ++------------
 2 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 80a1425ec7..fbbdf728a1 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -144,13 +144,13 @@ struct cell_shader_info
 } ALIGN16_ATTRIB;
 
 
+#define SPU_VERTS_PER_BATCH 64
 struct cell_command_vs
 {
    struct cell_shader_info   shader;
-   void *elts;
    unsigned num_elts;
-   unsigned bytes_per_elt;
-   void *vOut;
+   unsigned elts[SPU_VERTS_PER_BATCH];
+   uint64_t vOut[SPU_VERTS_PER_BATCH];
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
index 595f54b0eb..82165501c5 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -81,7 +81,7 @@ compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
 static void
 run_vertex_program(struct spu_vs_context *draw,
                    unsigned elts[4], unsigned count,
-                   struct vertex_header *vOut[])
+                   const uint64_t *vOut)
 {
    struct spu_exec_machine *machine = &draw->machine;
    unsigned int j;
@@ -206,17 +206,7 @@ spu_execute_vertex_shader(struct spu_vs_context *draw,
    
    for (i = 0; i < vs->num_elts; i += 4) {
       const unsigned batch_size = MIN2(vs->num_elts - i, 4);
-      unsigned elts[4];
-
-      for (j = 0; j < batch_size; j++) {
-	 switch (vs->bytes_per_elt) {
-	 case 1: elts[j] = ((unsigned char *) vs->elts)[i + j]; break;
-	 case 2: elts[j] = ((unsigned short *)vs->elts)[i + j]; break;
-	 case 4: elts[j] = ((unsigned int *)  vs->elts)[i + j]; break;
-	 }
-      }
 
-      run_vertex_program(draw, elts, batch_size,
-			 (struct vertex_header (*)[]) vs->vOut);
+      run_vertex_program(draw, & vs->elts[i], batch_size, &vs->vOut[i]);
    }
 }
-- 
cgit v1.2.3


From fea350d91133174254f544de30032049199991bf Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:25:47 -0800
Subject: Set machine->Processor

The default value is 0, which is TGSI_PROCESSOR_FRAGMENT...not correct
for a vertex shader!
---
 src/mesa/pipe/cell/spu/spu_vertex_shader.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
index 82165501c5..125b2c3a43 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -93,7 +93,8 @@ run_vertex_program(struct spu_vs_context *draw,
 
    assert(count <= 4);
 
-   /* Consts does not require 16 byte alignment. */
+   machine->Processor = TGSI_PROCESSOR_VERTEX;
+
    ASSERT_ALIGN16(draw->constants);
    machine->Consts = (float (*)[4]) draw->constants;
 
-- 
cgit v1.2.3


From 6996b6c0559b32926188efe7b23574a1076b75ec Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:30:15 -0800
Subject: Handle CELL_CMD_VS_EXECUTE *only* outside batch commands.

---
 src/mesa/pipe/cell/common.h       | 3 ++-
 src/mesa/pipe/cell/spu/spu_main.c | 8 +++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index fbbdf728a1..a40cfb8210 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -133,7 +133,6 @@ struct cell_array_info
 
 struct cell_shader_info
 {
-   unsigned processor;
    unsigned num_outputs;
 
    void *declarations;
@@ -147,6 +146,7 @@ struct cell_shader_info
 #define SPU_VERTS_PER_BATCH 64
 struct cell_command_vs
 {
+   uint opcode;       /**< CELL_CMD_VS_EXECUTE */
    struct cell_shader_info   shader;
    unsigned num_elts;
    unsigned elts[SPU_VERTS_PER_BATCH];
@@ -190,6 +190,7 @@ struct cell_command
    struct cell_command_framebuffer fb;
    struct cell_command_clear_surface clear;
    struct cell_command_render render;
+   struct cell_command_vs vs;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 9daa3ec735..7105c0f897 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -397,11 +397,6 @@ cmd_batch(uint opcode)
          cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
          pos += (1 + sizeof(struct cell_array_info) / 4);
          break;
-      case CELL_CMD_VS_EXECUTE:
-         spu_execute_vertex_shader(&draw,
-                                   (struct cell_command_vs *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct cell_command_vs) / 4);
-         break;
       default:
          printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, buffer[pos]);
          ASSERT(0);
@@ -470,6 +465,9 @@ main_loop(void)
             assert(pos_incr == 0);
          }
          break;
+      case CELL_CMD_VS_EXECUTE:
+         spu_execute_vertex_shader(&draw, &cmd.vs);
+         break;
       case CELL_CMD_BATCH:
          cmd_batch(opcode);
          break;
-- 
cgit v1.2.3


From 738a4292b18e5513935af3902b4ed9d1997f90d1 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:33:30 -0800
Subject: Correctly read / write vertex header from / to main memory

---
 src/mesa/pipe/cell/spu/spu_vertex_shader.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
index 125b2c3a43..ea5ffae6bc 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -112,11 +112,16 @@ run_vertex_program(struct spu_vs_context *draw,
       unsigned slot;
       float x, y, z, w;
       unsigned char buffer[sizeof(struct vertex_header)
-			   + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
+          + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
       struct vertex_header *const tmpOut =
-	  (struct vertex_header *) buffer;
-      const unsigned vert_size = sizeof(struct vertex_header)
-	  + (sizeof(float) * 4 * draw->num_vs_outputs);
+          (struct vertex_header *) buffer;
+      const unsigned vert_size = ROUNDUP16(sizeof(struct vertex_header)
+                                           + (sizeof(float) * 4 
+                                              * draw->num_vs_outputs));
+
+      mfc_get(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
 
       /* Handle attr[0] (position) specially:
        *
@@ -155,12 +160,8 @@ run_vertex_program(struct spu_vs_context *draw,
          tmpOut->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
       }
 
-      wait_on_mask(1 << TAG_VERTEX_BUFFER);
       mfc_put(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
-
    } /* loop over vertices */
-
-   wait_on_mask(1 << TAG_VERTEX_BUFFER);
 }
 
 
-- 
cgit v1.2.3


From 42db5715988a23743084742bc52baaa568be6091 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:34:22 -0800
Subject: cell_array_info should not be 16-byte aligned

Forcing cell_array_info to be 16-byte aligned makes it more difficult
to stuff that state in batch commands.
---
 src/mesa/pipe/cell/common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index a40cfb8210..533ad2cf6e 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -128,7 +128,7 @@ struct cell_array_info
     uint attr;                /**< Attribute that this state if for. */
     uint pitch;               /**< Byte pitch from one entry to the next. */
     enum pipe_format format;  /**< Pipe format of each entry. */
-} ALIGN16_ATTRIB;
+};
 
 
 struct cell_shader_info
-- 
cgit v1.2.3


From 524f99a4ab1f4dde0cab07ffd9e72a4d49c1e79d Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:40:24 -0800
Subject: Numerous small fixes to PPU-SPU vertex shader protocol

---
 src/mesa/pipe/cell/common.h                | 19 ++++++++++++-------
 src/mesa/pipe/cell/spu/spu_vertex_shader.c | 27 ++++++++++++++++++++++-----
 2 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 533ad2cf6e..28b0c59a0a 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -124,10 +124,10 @@ struct cell_command_clear_surface
  */
 struct cell_array_info
 {
-    void *base;               /**< Base address of the 0th element. */
-    uint attr;                /**< Attribute that this state if for. */
-    uint pitch;               /**< Byte pitch from one entry to the next. */
-    enum pipe_format format;  /**< Pipe format of each entry. */
+    uint64_t base;      /**< Base address of the 0th element. */
+    uint attr;          /**< Attribute that this state if for. */
+    uint pitch;         /**< Byte pitch from one entry to the next. */
+    uint format;        /**< Pipe format of each entry. */
 };
 
 
@@ -135,11 +135,13 @@ struct cell_shader_info
 {
    unsigned num_outputs;
 
-   void *declarations;
+   uint64_t declarations;
    unsigned num_declarations;
-   void *instructions;
+   uint64_t instructions;
    unsigned num_instructions;
-   void *uniforms;
+   uint64_t uniforms;
+   uint64_t  immediates;
+   unsigned num_immediates;
 } ALIGN16_ATTRIB;
 
 
@@ -151,6 +153,9 @@ struct cell_command_vs
    unsigned num_elts;
    unsigned elts[SPU_VERTS_PER_BATCH];
    uint64_t vOut[SPU_VERTS_PER_BATCH];
+   float plane[12][4];
+   unsigned nr_planes;
+   unsigned nr_attrs;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
index ea5ffae6bc..c1cbbb6d1e 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -187,12 +187,22 @@ spu_bind_vertex_shader(struct spu_vs_context *draw,
 }
 
 
+unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32]
+    ALIGN16_ATTRIB;
+
 void
 spu_execute_vertex_shader(struct spu_vs_context *draw,
-			  const struct cell_command_vs *vs)
+                          const struct cell_command_vs *vs)
 {
    unsigned i;
-   unsigned j;
+
+   const uint64_t immediate_addr = vs->shader.immediates;
+   const unsigned immediate_size = 
+       ROUNDUP16((sizeof(float) * 4 * vs->shader.num_immediates)
+                 + (immediate_addr & 0x0f));
+
+   mfc_get(immediates, immediate_addr & ~0x0f, immediate_size,
+           TAG_VERTEX_BUFFER, 0, 0);
 
    draw->machine.Instructions = (struct tgsi_full_instruction *)
        vs->shader.instructions;
@@ -202,10 +212,17 @@ spu_execute_vertex_shader(struct spu_vs_context *draw,
        vs->shader.declarations;
    draw->machine.NumDeclarations = vs->shader.num_declarations;
 
+   draw->vertex_fetch.nr_attrs = vs->nr_attrs;
+
+   wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+   (void) memcpy(& draw->machine.Imms, &immediates[immediate_addr & 0x0f],
+                 sizeof(float) * 4 * vs->shader.num_immediates);
+
    spu_bind_vertex_shader(draw, vs->shader.uniforms,
-			  NULL, -1,
-			  vs->shader.num_outputs);
-   
+                          vs->plane, vs->nr_planes,
+                          vs->shader.num_outputs);
+
    for (i = 0; i < vs->num_elts; i += 4) {
       const unsigned batch_size = MIN2(vs->num_elts - i, 4);
 
-- 
cgit v1.2.3


From 87cc80297c62d7f8f2f22415e5099e3fbb7d229b Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 20:10:45 -0800
Subject: Add driver_private field for drivers that hook shader_queue_flush.

---
 src/mesa/pipe/draw/draw_private.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/pipe/draw/draw_private.h b/src/mesa/pipe/draw/draw_private.h
index fea6d94ed8..7782db0477 100644
--- a/src/mesa/pipe/draw/draw_private.h
+++ b/src/mesa/pipe/draw/draw_private.h
@@ -259,6 +259,8 @@ struct draw_context
 #ifdef MESA_LLVM
    struct gallivm_cpu_engine *engine;
 #endif
+   
+   void *driver_private;
 };
 
 
-- 
cgit v1.2.3


From bcaf0dd8d23b3e8562078b3a3e07ef99ca940881 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 20:12:00 -0800
Subject: Use SPUs for vertex shader processing

---
 src/mesa/pipe/cell/ppu/Makefile             |   1 +
 src/mesa/pipe/cell/ppu/cell_context.c       |  12 ++-
 src/mesa/pipe/cell/ppu/cell_context.h       |   2 +
 src/mesa/pipe/cell/ppu/cell_vertex_shader.c | 118 ++++++++++++++++++++++++++++
 4 files changed, 132 insertions(+), 1 deletion(-)
 create mode 100644 src/mesa/pipe/cell/ppu/cell_vertex_shader.c

diff --git a/src/mesa/pipe/cell/ppu/Makefile b/src/mesa/pipe/cell/ppu/Makefile
index e7f2562da7..50060f5cd3 100644
--- a/src/mesa/pipe/cell/ppu/Makefile
+++ b/src/mesa/pipe/cell/ppu/Makefile
@@ -34,6 +34,7 @@ SOURCES = \
 	cell_surface.c \
 	cell_texture.c \
 	cell_vbuf.c \
+	cell_vertex_shader.c \
 	cell_winsys.c
 
 
diff --git a/src/mesa/pipe/cell/ppu/cell_context.c b/src/mesa/pipe/cell/ppu/cell_context.c
index e8020a49bc..4885cd0d2c 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.c
+++ b/src/mesa/pipe/cell/ppu/cell_context.c
@@ -39,6 +39,7 @@
 #include "pipe/p_winsys.h"
 #include "pipe/cell/common.h"
 #include "pipe/draw/draw_context.h"
+#include "pipe/draw/draw_private.h"
 #include "cell_clear.h"
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
@@ -156,6 +157,15 @@ cell_destroy_context( struct pipe_context *pipe )
 }
 
 
+static struct draw_context *
+cell_draw_create(struct cell_context *cell)
+{
+   struct draw_context *draw = draw_create();
+
+   draw->shader_queue_flush = cell_vertex_shader_queue_flush;
+   draw->driver_private = cell;
+   return draw;
+}
 
 
 struct pipe_context *
@@ -242,7 +252,7 @@ cell_create_context(struct pipe_winsys *winsys, struct cell_winsys *cws)
 
    cell_init_surface_functions(cell);
 
-   cell->draw = draw_create();
+   cell->draw = cell_draw_create(cell);
 
    cell_init_vbuf(cell);
    draw_set_rasterize_stage(cell->draw, cell->vbuf);
diff --git a/src/mesa/pipe/cell/ppu/cell_context.h b/src/mesa/pipe/cell/ppu/cell_context.h
index 65b89518ad..3b63419b5e 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.h
+++ b/src/mesa/pipe/cell/ppu/cell_context.h
@@ -126,6 +126,8 @@ cell_context(struct pipe_context *pipe)
 extern struct pipe_context *
 cell_create_context(struct pipe_winsys *ws, struct cell_winsys *cws);
 
+extern void
+cell_vertex_shader_queue_flush(struct draw_context *draw);
 
 
diff --git a/src/mesa/pipe/cell/ppu/cell_vertex_shader.c b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
new file mode 100644
index 0000000000..aef329a902
--- /dev/null
+++ b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
@@ -0,0 +1,118 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file cell_vertex_shader.c
+ * Vertex shader interface routines for Cell.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_context.h"
+#include "pipe/p_winsys.h"
+
+#include "cell_context.h"
+#include "cell_draw_arrays.h"
+#include "cell_spu.h"
+#include "cell_batch.h"
+
+#include "pipe/cell/common.h"
+#include "pipe/draw/draw_context.h"
+#include "pipe/draw/draw_private.h"
+
+/**
+ * Run the vertex shader on all vertices in the vertex queue.
+ * Called by the draw module when the vertex cache needs to be flushed.
+ */
+void
+cell_vertex_shader_queue_flush(struct draw_context *draw)
+{
+   struct cell_context *const cell =
+       (struct cell_context *) draw->driver_private;
+   struct cell_command_vs *const vs = &cell_global.command[0].vs;
+   unsigned *batch;
+   struct cell_array_info array_info;
+   unsigned i, j;
+
+   assert(draw->vs.queue_nr != 0);
+
+   /* XXX: do this on statechange: 
+    */
+   draw_update_vertex_fetch(draw);
+
+   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
+      array_info.opcode = CELL_CMD_STATE_VS_ARRAY_INFO;
+      assert(draw->vertex_fetch.src_ptr[i] != NULL);
+      array_info.base = (uintptr_t) draw->vertex_fetch.src_ptr[i];
+      array_info.attr = i;
+      array_info.pitch = draw->vertex_fetch.pitch[i];
+      array_info.format = draw->vertex_element[i].src_format;
+
+      cell_batch_append(cell, & array_info, sizeof(array_info));
+   }
+
+   batch = cell_batch_alloc(cell, sizeof(unsigned)
+                            + sizeof(struct pipe_viewport_state));
+   batch[0] = CELL_CMD_STATE_VIEWPORT;
+   (void) memcpy(&batch[1], &draw->viewport,
+                 sizeof(struct pipe_viewport_state));
+
+   cell_batch_flush(cell);
+
+   vs->opcode = CELL_CMD_VS_EXECUTE;
+   vs->shader.num_outputs = draw->num_vs_outputs;
+   vs->shader.declarations = (uintptr_t) draw->machine.Declarations;
+   vs->shader.num_declarations = draw->machine.NumDeclarations;
+   vs->shader.instructions = (uintptr_t) draw->machine.Instructions;
+   vs->shader.num_instructions = draw->machine.NumInstructions;
+   vs->shader.uniforms = (uintptr_t) draw->user.constants;
+   vs->shader.immediates = (uintptr_t) draw->machine.Imms;
+   vs->shader.num_immediates = draw->machine.ImmLimit / 4;
+   vs->nr_attrs = draw->vertex_fetch.nr_attrs;
+
+   (void) memcpy(vs->plane, draw->plane, sizeof(draw->plane));
+   vs->nr_planes = draw->nr_planes;
+
+   for (i = 0; i < draw->vs.queue_nr; i += SPU_VERTS_PER_BATCH) {
+      const unsigned n = MIN2(SPU_VERTS_PER_BATCH, draw->vs.queue_nr - i);
+
+      for (j = 0; j < n; j++) {
+         vs->elts[j] = draw->vs.queue[i + j].elt;
+         vs->vOut[j] = (uintptr_t) draw->vs.queue[i + j].dest;
+      }
+
+      for (/* empty */; j < SPU_VERTS_PER_BATCH; j++) {
+         vs->elts[j] = vs->elts[0];
+         vs->vOut[j] = vs->vOut[0];
+      }
+
+      vs->num_elts = n;
+      send_mbox_message(cell_global.spe_contexts[0], CELL_CMD_VS_EXECUTE);
+
+      cell_flush_int(& cell->pipe, PIPE_FLUSH_WAIT);
+   }
+
+   draw->vs.queue_nr = 0;
+}
-- 
cgit v1.2.3


From 2029ee48b1b5856ef3c9b4307f018bc6bd61ea6e Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 20:14:14 -0800
Subject: I don't know why using uint64_t for "base" doesn't work.  Ugh.

---
 src/mesa/pipe/cell/common.h       | 5 +++--
 src/mesa/pipe/cell/spu/spu_main.c | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 28b0c59a0a..05aeed83ab 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -124,11 +124,12 @@ struct cell_command_clear_surface
  */
 struct cell_array_info
 {
-    uint64_t base;      /**< Base address of the 0th element. */
+    uint opcode;
+    uint base;          /**< Base address of the 0th element. */
     uint attr;          /**< Attribute that this state if for. */
     uint pitch;         /**< Byte pitch from one entry to the next. */
     uint format;        /**< Pipe format of each entry. */
-};
+} ALIGN16_ATTRIB;
 
 
 struct cell_shader_info
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 7105c0f897..d6393048f5 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -394,8 +394,8 @@ cmd_batch(uint opcode)
          pos += (1 + sizeof(struct pipe_viewport_state) / 4);
          break;
       case CELL_CMD_STATE_VS_ARRAY_INFO:
-         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct cell_array_info) / 4);
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos]);
+         pos += (sizeof(struct cell_array_info) / 4);
          break;
       default:
          printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, buffer[pos]);
-- 
cgit v1.2.3


From c285e06e93d01b9253dfc2fb1ab42480216b72d4 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Thu, 31 Jan 2008 13:14:35 +0900
Subject: gallium: Add SCons as alternative build system for Gallium.

---
 .gitignore                                   |   2 +
 SConstruct                                   | 214 +++++++++++++
 src/mesa/SConscript                          | 435 +++++++++++++++++++++++++++
 src/mesa/drivers/dri/SConscript              |  48 +++
 src/mesa/drivers/dri/intel_winsys/SConscript |  41 +++
 src/mesa/pipe/SConscript                     |   9 +
 src/mesa/pipe/i915simple/SConscript          |  29 ++
 src/mesa/pipe/i965simple/SConscript          |  55 ++++
 src/mesa/pipe/softpipe/SConscript            |  42 +++
 9 files changed, 875 insertions(+)
 create mode 100644 SConstruct
 create mode 100644 src/mesa/SConscript
 create mode 100644 src/mesa/drivers/dri/SConscript
 create mode 100644 src/mesa/drivers/dri/intel_winsys/SConscript
 create mode 100644 src/mesa/pipe/SConscript
 create mode 100644 src/mesa/pipe/i915simple/SConscript
 create mode 100644 src/mesa/pipe/i965simple/SConscript
 create mode 100644 src/mesa/pipe/softpipe/SConscript

diff --git a/.gitignore b/.gitignore
index 033e6e10bd..bf50291fc1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,5 @@ depend
 depend.bak
 lib
 lib64
+.sconsign*
+config.py
diff --git a/SConstruct b/SConstruct
new file mode 100644
index 0000000000..db6161ed51
--- /dev/null
+++ b/SConstruct
@@ -0,0 +1,214 @@
+#######################################################################
+# Top-level SConstruct
+
+import os
+import sys
+
+
+#######################################################################
+# Configuration options
+#
+# For example, invoke scons as 
+#
+#   scons debug=1 dri=0 x86=1
+#
+# to set configuration variables. Or you can write those options to a file
+# named config.py:
+#
+#   # config.py
+#   debug=1
+#   dri=0
+#   x86=1
+# 
+# Invoke
+#
+#   scons -h
+#
+# to get the full list of options. See scons manpage for more info.
+#  
+
+# TODO: auto-detect defaults
+opts = Options('config.py')
+opts.Add(BoolOption('debug', 'build debug version', False))
+opts.Add(BoolOption('dri', 'build dri drivers', False))
+opts.Add(EnumOption('machine', 'use machine-specific assembly code', 'x86',
+                     allowed_values=('generic', 'x86', 'x86-64')))
+
+env = Environment(options = opts)
+Help(opts.GenerateHelpText(env))
+
+# for debugging
+#print env.Dump()
+
+if 1:
+	# platform will be typically 'posix' or 'win32' 
+	platform = env['PLATFORM']
+else:
+	# platform will be one of 'linux', 'freebsd', 'win32', 'darwin', etc.
+	platform = sys.platform
+	if platform == 'linux2':
+		platform = 'linux' 
+
+# replicate options values in local variables
+debug = env['debug']
+dri = env['dri']
+machine = env['machine']
+
+# derived options
+x86 = machine == 'x86'
+gcc = platform == 'posix'
+msvc = platform == 'win32'
+
+Export([
+	'debug', 
+	'x86', 
+	'dri', 
+	'platform',
+	'gcc',
+	'msvc',
+])
+
+
+#######################################################################
+# Environment setup
+#
+# TODO: put the compiler specific settings in separate files
+# TODO: auto-detect as much as possible
+
+         
+# Optimization flags
+if gcc:
+	if debug:
+		env.Append(CFLAGS = '-O0 -g3')
+		env.Append(CXXFLAGS = '-O0 -g3')
+	else:
+		env.Append(CFLAGS = '-O3 -g3')
+		env.Append(CXXFLAGS = '-O3 -g3')
+
+	env.Append(CFLAGS = '-Wall -Wmissing-prototypes -std=c99 -ffast-math -pedantic')
+	env.Append(CXXFLAGS = '-Wall -pedantic')
+	
+	# Be nice to Eclipse
+	env.Append(CFLAGS = '-fmessage-length=0')
+	env.Append(CXXFLAGS = '-fmessage-length=0')
+
+# Defines
+env.Append(CPPDEFINES = [
+	'_POSIX_SOURCE',
+	('_POSIX_C_SOURCE', '199309L'), 
+	'_SVID_SOURCE',
+	'_BSD_SOURCE', 
+	'_GNU_SOURCE',
+	
+	'PTHREADS',
+	'HAVE_ALIAS', 
+	'HAVE_POSIX_MEMALIGN',
+])
+
+if debug:
+	env.Append(CPPDEFINES = ['DEBUG'])
+else:
+	env.Append(CPPDEFINES = ['NDEBUG'])
+
+
+# Includes
+env.Append(CPPPATH = [
+	'#/include',
+	'#/src/mesa',
+	'#/src/mesa/main',
+	'#/src/mesa/pipe',
+	
+	'/usr/X11R6/include',
+])
+
+
+# x86 assembly
+if x86:
+	env.Append(CPPDEFINES = [
+		'USE_X86_ASM', 
+		'USE_MMX_ASM',
+		'USE_3DNOW_ASM',
+		'USE_SSE_ASM',
+	])
+	if gcc:	
+		env.Append(CFLAGS = '-m32')
+		env.Append(CXXFLAGS = '-m32')
+
+env.Append(LIBPATH = ['/usr/X11R6/lib'])
+
+env.Append(LIBS = [
+	'm',
+	'pthread',
+	'expat',
+	'dl',
+])
+
+# DRI
+if dri:
+	env.ParseConfig('pkg-config --cflags --libs libdrm')
+	env.Append(CPPDEFINES = [
+		('USE_EXTERNAL_DXTN_LIB', '1'), 
+		'IN_DRI_DRIVER',
+		'GLX_DIRECT_RENDERING',
+		'GLX_INDIRECT_RENDERING',
+	])
+
+# libGL
+if 1:
+	env.Append(LIBS = [
+		'X11',
+		'Xext',
+		'Xxf86vm',
+		'Xdamage',
+		'Xfixes',
+	])
+
+Export('env')
+
+
+#######################################################################
+# Convenience Library Builder
+# based on the stock StaticLibrary and SharedLibrary builders
+
+def createConvenienceLibBuilder(env):
+    """This is a utility function that creates the ConvenienceLibrary
+    Builder in an Environment if it is not there already.
+
+    If it is already there, we return the existing one.
+    """
+
+    try:
+        convenience_lib = env['BUILDERS']['ConvenienceLibrary']
+    except KeyError:
+        action_list = [ Action("$ARCOM", "$ARCOMSTR") ]
+        if env.Detect('ranlib'):
+            ranlib_action = Action("$RANLIBCOM", "$RANLIBCOMSTR")
+            action_list.append(ranlib_action)
+
+        convenience_lib = Builder(action = action_list,
+                                  emitter = '$LIBEMITTER',
+                                  prefix = '$LIBPREFIX',
+                                  suffix = '$LIBSUFFIX',
+                                  src_suffix = '$SHOBJSUFFIX',
+                                  src_builder = 'SharedObject')
+        env['BUILDERS']['ConvenienceLibrary'] = convenience_lib
+        env['BUILDERS']['Library'] = convenience_lib
+
+    return convenience_lib
+
+createConvenienceLibBuilder(env)
+
+
+#######################################################################
+# Invoke SConscripts
+
+# Put build output in a separate dir
+# TODO: make build_dir depend on platform and build type (check  
+#       http://www.scons.org/wiki/AdvancedBuildExample for an example)
+build_dir = 'build'
+
+SConscript(
+	'src/mesa/SConscript',
+	build_dir = build_dir,
+	duplicate = 0 # http://www.scons.org/doc/0.97/HTML/scons-user/x2261.html
+)
diff --git a/src/mesa/SConscript b/src/mesa/SConscript
new file mode 100644
index 0000000000..70a98f3129
--- /dev/null
+++ b/src/mesa/SConscript
@@ -0,0 +1,435 @@
+#######################################################################
+# SConscript for mesa
+#
+# TODO: Split this into per-module SConscripts 
+
+
+Import('*')
+
+
+#######################################################################
+# Core sources
+
+MAIN_SOURCES = [
+	'main/api_arrayelt.c',
+	'main/api_loopback.c',
+	'main/api_noop.c',
+	'main/api_validate.c',
+	'main/accum.c',
+	'main/attrib.c',
+	'main/arrayobj.c',
+	'main/blend.c',
+	'main/bufferobj.c',
+	'main/buffers.c',
+	'main/clip.c',
+	'main/colortab.c',
+	'main/context.c',
+	'main/convolve.c',
+	'main/debug.c',
+	'main/depth.c',
+	'main/depthstencil.c',
+	'main/dlist.c',
+	'main/drawpix.c',
+	'main/enable.c',
+	'main/enums.c',
+	'main/eval.c',
+	'main/execmem.c',
+	'main/extensions.c',
+	'main/fbobject.c',
+	'main/feedback.c',
+	'main/ffvertex_prog.c',
+	'main/fog.c',
+	'main/framebuffer.c',
+	'main/get.c',
+	'main/getstring.c',
+	'main/hash.c',
+	'main/hint.c',
+	'main/histogram.c',
+	'main/image.c',
+	'main/imports.c',
+	'main/light.c',
+	'main/lines.c',
+	'main/matrix.c',
+	'main/mipmap.c',
+	'main/mm.c',
+	'main/pixel.c',
+	'main/points.c',
+	'main/polygon.c',
+	'main/queryobj.c',
+	'main/rastpos.c',
+	'main/rbadaptors.c',
+	'main/renderbuffer.c',
+	'main/shaders.c',
+	'main/state.c',
+	'main/stencil.c',
+	'main/texcompress.c',
+	'main/texcompress_s3tc.c',
+	'main/texcompress_fxt1.c',
+	'main/texenvprogram.c',
+	'main/texformat.c',
+	'main/teximage.c',
+	'main/texobj.c',
+	'main/texrender.c',
+	'main/texstate.c',
+	'main/texstore.c',
+	'main/varray.c',
+	'main/vtxfmt.c',
+]
+
+GLAPI_SOURCES = [
+	'main/dispatch.c',
+	'glapi/glapi.c',
+	'glapi/glthread.c',
+]
+
+MATH_SOURCES = [
+	'math/m_debug_clip.c',
+	'math/m_debug_norm.c',
+	'math/m_debug_xform.c',
+	'math/m_eval.c',
+	'math/m_matrix.c',
+	'math/m_translate.c',
+	'math/m_vector.c',
+	'math/m_xform.c',
+]
+
+VBO_SOURCES = [
+	'vbo/vbo_context.c',
+	'vbo/vbo_exec.c',
+	'vbo/vbo_exec_api.c',
+	'vbo/vbo_exec_array.c',
+	'vbo/vbo_exec_draw.c',
+	'vbo/vbo_exec_eval.c',
+	'vbo/vbo_rebase.c',
+	'vbo/vbo_split.c',
+	'vbo/vbo_split_copy.c',
+	'vbo/vbo_split_inplace.c',
+	'vbo/vbo_save.c',
+	'vbo/vbo_save_api.c',
+	'vbo/vbo_save_draw.c',
+	'vbo/vbo_save_loopback.c',
+]
+
+VF_SOURCES = [
+	'vf/vf.c',
+	'vf/vf_generic.c',
+	'vf/vf_sse.c',
+]
+
+DRAW_SOURCES = [
+	'pipe/draw/draw_clip.c',
+	'pipe/draw/draw_context.c',
+	'pipe/draw/draw_cull.c',
+	'pipe/draw/draw_debug.c',
+	'pipe/draw/draw_flatshade.c',
+	'pipe/draw/draw_offset.c',
+	'pipe/draw/draw_prim.c',
+	'pipe/draw/draw_stipple.c',
+	'pipe/draw/draw_twoside.c',
+	'pipe/draw/draw_unfilled.c',
+	'pipe/draw/draw_validate.c',
+	'pipe/draw/draw_vbuf.c',
+	'pipe/draw/draw_vertex.c',
+	'pipe/draw/draw_vertex_cache.c',
+	'pipe/draw/draw_vertex_fetch.c',
+	'pipe/draw/draw_vertex_shader.c',
+	'pipe/draw/draw_vertex_shader_llvm.c',
+	'pipe/draw/draw_vf.c',
+	'pipe/draw/draw_vf_generic.c',
+	'pipe/draw/draw_vf_sse.c',
+	'pipe/draw/draw_wide_prims.c',
+]
+
+TGSIEXEC_SOURCES = [
+	'pipe/tgsi/exec/tgsi_exec.c',
+	'pipe/tgsi/exec/tgsi_sse2.c',
+]
+
+TGSIUTIL_SOURCES = [
+	'pipe/tgsi/util/tgsi_build.c',
+	'pipe/tgsi/util/tgsi_dump.c',
+	'pipe/tgsi/util/tgsi_parse.c',
+	'pipe/tgsi/util/tgsi_util.c',
+]
+
+STATECACHE_SOURCES = [
+	'pipe/cso_cache/cso_hash.c',
+	'pipe/cso_cache/cso_cache.c',
+]
+
+PIPEUTIL_SOURCES = [
+	'pipe/util/p_tile.c',
+	'pipe/util/p_util.c',
+]
+
+STATETRACKER_SOURCES = [
+	'state_tracker/st_atom.c',
+	'state_tracker/st_atom_blend.c',
+	'state_tracker/st_atom_clip.c',
+	'state_tracker/st_atom_constbuf.c',
+	'state_tracker/st_atom_depth.c',
+	'state_tracker/st_atom_framebuffer.c',
+	'state_tracker/st_atom_pixeltransfer.c',
+	'state_tracker/st_atom_sampler.c',
+	'state_tracker/st_atom_scissor.c',
+	'state_tracker/st_atom_shader.c',
+	'state_tracker/st_atom_rasterizer.c',
+	'state_tracker/st_atom_stipple.c',
+	'state_tracker/st_atom_texture.c',
+	'state_tracker/st_atom_viewport.c',
+	'state_tracker/st_cb_accum.c',
+	'state_tracker/st_cb_bufferobjects.c',
+	'state_tracker/st_cb_clear.c',
+	'state_tracker/st_cb_flush.c',
+	'state_tracker/st_cb_drawpixels.c',
+	'state_tracker/st_cb_fbo.c',
+	'state_tracker/st_cb_feedback.c',
+	'state_tracker/st_cb_program.c',
+	'state_tracker/st_cb_queryobj.c',
+	'state_tracker/st_cb_rasterpos.c',
+	'state_tracker/st_cb_readpixels.c',
+	'state_tracker/st_cb_strings.c',
+	'state_tracker/st_cb_texture.c',
+	'state_tracker/st_cache.c',
+	'state_tracker/st_context.c',
+	'state_tracker/st_debug.c',
+	'state_tracker/st_draw.c',
+	'state_tracker/st_extensions.c',
+	'state_tracker/st_format.c',
+	'state_tracker/st_framebuffer.c',
+	'state_tracker/st_mesa_to_tgsi.c',
+	'state_tracker/st_program.c',
+	'state_tracker/st_texture.c',
+]
+
+SHADER_SOURCES = [
+	'shader/arbprogparse.c',
+	'shader/arbprogram.c',
+	'shader/atifragshader.c',
+	'shader/grammar/grammar_mesa.c',
+	'shader/nvfragparse.c',
+	'shader/nvprogram.c',
+	'shader/nvvertparse.c',
+	'shader/program.c',
+	'shader/prog_cache.c',
+	'shader/prog_debug.c',
+	'shader/prog_execute.c',
+	'shader/prog_instruction.c',
+	'shader/prog_parameter.c',
+	'shader/prog_print.c',
+	'shader/prog_statevars.c',
+	'shader/programopt.c',
+	'shader/shader_api.c',
+]
+
+SLANG_SOURCES = [
+	'shader/slang/slang_builtin.c',
+	'shader/slang/slang_codegen.c',
+	'shader/slang/slang_compile.c',
+	'shader/slang/slang_compile_function.c',
+	'shader/slang/slang_compile_operation.c',
+	'shader/slang/slang_compile_struct.c',
+	'shader/slang/slang_compile_variable.c',
+	'shader/slang/slang_emit.c',
+	'shader/slang/slang_ir.c',
+	'shader/slang/slang_label.c',
+	'shader/slang/slang_library_noise.c',
+	'shader/slang/slang_link.c',
+	'shader/slang/slang_log.c',
+	'shader/slang/slang_mem.c',
+	'shader/slang/slang_preprocess.c',
+	'shader/slang/slang_print.c',
+	'shader/slang/slang_simplify.c',
+	'shader/slang/slang_storage.c',
+	'shader/slang/slang_typeinfo.c',
+	'shader/slang/slang_vartable.c',
+	'shader/slang/slang_utility.c',
+]
+
+
+#######################################################################
+# Assembly sources
+
+ASM_C_SOURCES = [
+	'x86/common_x86.c',
+	'x86/x86.c',
+	'x86/3dnow.c',
+	'x86/sse.c',
+	'x86/rtasm/x86sse.c',
+	'sparc/sparc.c',
+	'ppc/common_ppc.c',
+	'x86-64/x86-64.c',
+]
+
+X86_SOURCES = [
+	'x86/common_x86_asm.S',
+	'x86/x86_xform2.S',
+	'x86/x86_xform3.S',
+	'x86/x86_xform4.S',
+	'x86/x86_cliptest.S',
+	'x86/mmx_blend.S',
+	'x86/3dnow_xform1.S',
+	'x86/3dnow_xform2.S',
+	'x86/3dnow_xform3.S',
+	'x86/3dnow_xform4.S',
+	'x86/3dnow_normal.S',
+	'x86/sse_xform1.S',
+	'x86/sse_xform2.S',
+	'x86/sse_xform3.S',
+	'x86/sse_xform4.S',
+	'x86/sse_normal.S',
+	'x86/read_rgba_span_x86.S',
+]
+
+X86_API = [
+	'x86/glapi_x86.S',
+]
+
+X86_64_SOURCES = [
+	'x86-64/xform4.S',
+]
+
+X86_64_API = [
+	'x86-64/glapi_x86-64.S',
+]
+
+SPARC_SOURCES = [
+	'sparc/clip.S',
+	'sparc/norm.S',
+	'sparc/xform.S',
+]
+
+SPARC_API = [
+	'sparc/glapi_sparc.S',
+]
+
+if x86:
+	ASM_SOURCES = ASM_C_SOURCES + X86_SOURCES 
+	API_SOURCES = X86_API
+else:
+	ASM_SOURCES = []
+	API_SOURCES = []
+
+
+#######################################################################
+# Driver sources
+
+
+X11_DRIVER_SOURCES = [
+	'pipe/xlib/glxapi.c',
+	'pipe/xlib/fakeglx.c',
+	'pipe/xlib/xfonts.c',
+	'pipe/xlib/xm_api.c',
+	'pipe/xlib/xm_winsys.c',
+	'pipe/xlib/xm_winsys_aub.c',
+	'pipe/xlib/brw_aub.c',
+]
+
+OSMESA_DRIVER_SOURCES = [
+	'drivers/osmesa/osmesa.c',
+]
+
+GLIDE_DRIVER_SOURCES = [
+	'drivers/glide/fxapi.c',
+	'drivers/glide/fxdd.c',
+	'drivers/glide/fxddspan.c',
+	'drivers/glide/fxddtex.c',
+	'drivers/glide/fxsetup.c',
+	'drivers/glide/fxtexman.c',
+	'drivers/glide/fxtris.c',
+	'drivers/glide/fxvb.c',
+	'drivers/glide/fxglidew.c',
+	'drivers/glide/fxg.c',
+]
+
+SVGA_DRIVER_SOURCES = [
+	'drivers/svga/svgamesa.c',
+	'drivers/svga/svgamesa8.c',
+	'drivers/svga/svgamesa15.c',
+	'drivers/svga/svgamesa16.c',
+	'drivers/svga/svgamesa24.c',
+	'drivers/svga/svgamesa32.c',
+]
+
+FBDEV_DRIVER_SOURCES = [
+	'drivers/fbdev/glfbdev.c',
+]
+
+
+### All the core C sources
+
+SOLO_SOURCES = \
+	MAIN_SOURCES + \
+	MATH_SOURCES + \
+	VBO_SOURCES + \
+	VF_SOURCES + \
+	DRAW_SOURCES + \
+	TGSIEXEC_SOURCES + \
+	TGSIUTIL_SOURCES + \
+	PIPEUTIL_SOURCES + \
+	STATECACHE_SOURCES + \
+	STATETRACKER_SOURCES + \
+	SHADER_SOURCES + \
+	ASM_SOURCES + \
+	SLANG_SOURCES
+
+CORE_SOURCES = \
+	GLAPI_SOURCES + API_SOURCES + \
+	SOLO_SOURCES
+
+ALL_SOURCES = \
+	GLAPI_SOURCES + API_SOURCES + \
+	SOLO_SOURCES + \
+	ASM_SOURCES + \
+	X11_DRIVER_SOURCES + \
+	FBDEV_DRIVER_SOURCES + \
+	OSMESA_DRIVER_SOURCES
+
+
+######################################################################
+# Gallium sources
+
+SConscript([
+	'pipe/SConscript',
+])
+
+
+######################################################################
+# libGL
+
+if not dri:
+	STAND_ALONE_DRIVER_SOURCES = \
+		CORE_SOURCES + \
+		X11_DRIVER_SOURCES
+	
+	Import(
+		'softpipe', 
+		'i915simple',
+		'i965simple'
+	)
+	
+	pipe_drivers = [
+		softpipe,
+		i965simple
+	]
+	
+	env.SharedLibrary(
+		target ='GL',
+		source = STAND_ALONE_DRIVER_SOURCES,
+		LIBS = [softpipe, i965simple] + env['LIBS'],
+	)
+
+
+######################################################################
+# Driver sources
+
+if dri:
+	mesa = env.ConvenienceLibrary(
+		target = 'mesa',
+		source = SOLO_SOURCES,
+	)
+	env.Prepend(LIBS = [mesa])
+
+	SConscript([
+		'drivers/dri/SConscript',
+	])
diff --git a/src/mesa/drivers/dri/SConscript b/src/mesa/drivers/dri/SConscript
new file mode 100644
index 0000000000..d32bd08669
--- /dev/null
+++ b/src/mesa/drivers/dri/SConscript
@@ -0,0 +1,48 @@
+Import('*')
+
+drienv = env.Clone()
+
+drienv.Replace(CPPPATH = [
+	'#src/mesa/drivers/dri/common',
+	'#include',
+	'#include/GL/internal',
+	'#src/mesa',
+	'#src/mesa/main',
+	'#src/mesa/glapi',
+	'#src/mesa/math',
+	'#src/mesa/transform',
+	'#src/mesa/shader',
+	'#src/mesa/swrast',
+	'#src/mesa/swrast_setup',
+	'#src/egl/main',
+	'#src/egl/drivers/dri',
+])
+
+drienv.ParseConfig('pkg-config --cflags --libs libdrm')
+
+COMMON_GALLIUM_SOURCES = [
+	'../common/utils.c',
+	'../common/vblank.c',
+	'../common/dri_util.c',
+	'../common/xmlconfig.c',
+]
+
+COMMON_BM_SOURCES = [
+	'../common/dri_bufmgr.c',
+	'../common/dri_drmpool.c',
+]
+
+Export([
+	'drienv',
+	'COMMON_GALLIUM_SOURCES',
+	'COMMON_BM_SOURCES',
+])
+
+# TODO: Installation
+#install: $(LIBNAME)
+#	$(INSTALL) -d $(DRI_DRIVER_INSTALL_DIR)
+#	$(INSTALL) -m 755 $(LIBNAME) $(DRI_DRIVER_INSTALL_DIR)
+
+SConscript([
+	'intel_winsys/SConscript',
+])
diff --git a/src/mesa/drivers/dri/intel_winsys/SConscript b/src/mesa/drivers/dri/intel_winsys/SConscript
new file mode 100644
index 0000000000..a7cc10450e
--- /dev/null
+++ b/src/mesa/drivers/dri/intel_winsys/SConscript
@@ -0,0 +1,41 @@
+Import('*')
+
+env = drienv.Clone()
+
+env.Append(CPPPATH = [
+	'../intel',
+	'server'
+])
+
+#MINIGLX_SOURCES = server/intel_dri.c
+
+pipe_drivers = [
+	softpipe,
+	i915simple
+]
+
+DRIVER_SOURCES = [
+	'intel_winsys_pipe.c',
+	'intel_winsys_softpipe.c',
+	'intel_winsys_i915.c',
+	'intel_batchbuffer.c',
+	'intel_swapbuffers.c',
+	'intel_context.c',
+	'intel_lock.c',
+	'intel_screen.c',
+	'intel_batchpool.c',
+]
+
+sources = \
+	COMMON_GALLIUM_SOURCES + \
+	COMMON_BM_SOURCES + \
+	DRIVER_SOURCES
+
+# DRIVER_DEFINES = -I../intel $(shell pkg-config libdrm --atleast-version=2.3.1 \
+#				&& echo "-DDRM_VBLANK_FLIP=DRM_VBLANK_FLIP")
+
+env.SharedLibrary(
+	target ='i915tex_dri.so',
+	source = sources,
+	LIBS = pipe_drivers + env['LIBS'],
+)
\ No newline at end of file
diff --git a/src/mesa/pipe/SConscript b/src/mesa/pipe/SConscript
new file mode 100644
index 0000000000..d9c20e0100
--- /dev/null
+++ b/src/mesa/pipe/SConscript
@@ -0,0 +1,9 @@
+Import('*')
+
+#env = env.Clone()
+
+SConscript([
+	'softpipe/SConscript',
+	'i915simple/SConscript',
+	'i965simple/SConscript',
+])
diff --git a/src/mesa/pipe/i915simple/SConscript b/src/mesa/pipe/i915simple/SConscript
new file mode 100644
index 0000000000..f5fb96b995
--- /dev/null
+++ b/src/mesa/pipe/i915simple/SConscript
@@ -0,0 +1,29 @@
+Import('*')
+
+env = env.Clone()
+
+i915simple = env.ConvenienceLibrary(
+	target = 'i915simple',
+	source = [
+		'i915_blit.c',
+		'i915_clear.c',
+		'i915_context.c',
+		'i915_debug.c',
+		'i915_debug_fp.c',
+		'i915_flush.c',
+		'i915_fpc_emit.c',
+		'i915_fpc_translate.c',
+		'i915_prim_emit.c',
+		'i915_prim_vbuf.c',
+		'i915_state.c',
+		'i915_state_derived.c',
+		'i915_state_dynamic.c',
+		'i915_state_emit.c',
+		'i915_state_immediate.c',
+		'i915_state_sampler.c',
+		'i915_strings.c',
+		'i915_surface.c',
+		'i915_texture.c',
+	])
+
+Export('i915simple')
diff --git a/src/mesa/pipe/i965simple/SConscript b/src/mesa/pipe/i965simple/SConscript
new file mode 100644
index 0000000000..74621de84c
--- /dev/null
+++ b/src/mesa/pipe/i965simple/SConscript
@@ -0,0 +1,55 @@
+Import('*')
+
+env = env.Clone()
+
+i965simple = env.ConvenienceLibrary(
+	target = 'i965simple',
+	source = [
+		'brw_blit.c',
+		'brw_cc.c',
+		'brw_clip.c',
+		'brw_clip_line.c',
+		'brw_clip_point.c',
+		'brw_clip_state.c',
+		'brw_clip_tri.c',
+		'brw_clip_util.c',
+		'brw_context.c',
+		'brw_curbe.c',
+		'brw_draw.c',
+		'brw_draw_upload.c',
+		'brw_eu.c',
+		'brw_eu_debug.c',
+		'brw_eu_emit.c',
+		'brw_eu_util.c',
+		'brw_flush.c',
+		'brw_gs.c',
+		'brw_gs_emit.c',
+		'brw_gs_state.c',
+		'brw_misc_state.c',
+		'brw_sf.c',
+		'brw_sf_emit.c',
+		'brw_sf_state.c',
+		'brw_shader_info.c',
+		'brw_state.c',
+		'brw_state_batch.c',
+		'brw_state_cache.c',
+		'brw_state_pool.c',
+		'brw_state_upload.c',
+		'brw_strings.c',
+		'brw_surface.c',
+		'brw_tex_layout.c',
+		'brw_urb.c',
+		'brw_util.c',
+		'brw_vs.c',
+		'brw_vs_emit.c',
+		'brw_vs_state.c',
+		'brw_wm.c',
+		'brw_wm_decl.c',
+		'brw_wm_glsl.c',
+		'brw_wm_iz.c',
+		'brw_wm_sampler_state.c',
+		'brw_wm_state.c',
+		'brw_wm_surface_state.c',
+	])
+
+Export('i965simple')
diff --git a/src/mesa/pipe/softpipe/SConscript b/src/mesa/pipe/softpipe/SConscript
new file mode 100644
index 0000000000..d581ee8d3c
--- /dev/null
+++ b/src/mesa/pipe/softpipe/SConscript
@@ -0,0 +1,42 @@
+Import('*')
+
+env = env.Clone()
+
+softpipe = env.ConvenienceLibrary(
+	target = 'softpipe',
+	source = [
+		'sp_clear.c',
+		'sp_context.c',
+		'sp_draw_arrays.c',
+		'sp_flush.c',
+		'sp_prim_setup.c',
+		'sp_prim_vbuf.c',
+		'sp_quad_alpha_test.c',
+		'sp_quad_blend.c',
+		'sp_quad_bufloop.c',
+		'sp_quad.c',
+		'sp_quad_colormask.c',
+		'sp_quad_coverage.c',
+		'sp_quad_depth_test.c',
+		'sp_quad_earlyz.c',
+		'sp_quad_fs.c',
+		'sp_quad_occlusion.c',
+		'sp_quad_output.c',
+		'sp_quad_stencil.c',
+		'sp_quad_stipple.c',
+		'sp_query.c',
+		'sp_state_blend.c',
+		'sp_state_clip.c',
+		'sp_state_derived.c',
+		'sp_state_fs.c',
+		'sp_state_rasterizer.c',
+		'sp_state_sampler.c',
+		'sp_state_surface.c',
+		'sp_state_vertex.c',
+		'sp_surface.c',
+		'sp_tex_sample.c',
+		'sp_texture.c',
+		'sp_tile_cache.c',
+	])
+
+Export('softpipe')
\ No newline at end of file
-- 
cgit v1.2.3


From e11bfc990708326faf3db3db12f7941a1a3c6e0f Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Thu, 31 Jan 2008 14:21:49 +0900
Subject: gallium: Make the build output dir depend on the configuration.

The build output dirs mimic the old config names:

  build/linux
  build/linux-dri
  build/linux-dri-x86
  build/linux-dri-x86-debug
  ...
---
 SConstruct | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/SConstruct b/SConstruct
index db6161ed51..01732b0c52 100644
--- a/SConstruct
+++ b/SConstruct
@@ -2,6 +2,7 @@
 # Top-level SConstruct
 
 import os
+import os.path
 import sys
 
 
@@ -40,7 +41,7 @@ Help(opts.GenerateHelpText(env))
 # for debugging
 #print env.Dump()
 
-if 1:
+if 0:
 	# platform will be typically 'posix' or 'win32' 
 	platform = env['PLATFORM']
 else:
@@ -56,7 +57,7 @@ machine = env['machine']
 
 # derived options
 x86 = machine == 'x86'
-gcc = platform == 'posix'
+gcc = platform in ('posix', 'linux', 'freebsd', 'darwin')
 msvc = platform == 'win32'
 
 Export([
@@ -202,10 +203,20 @@ createConvenienceLibBuilder(env)
 #######################################################################
 # Invoke SConscripts
 
-# Put build output in a separate dir
-# TODO: make build_dir depend on platform and build type (check  
-#       http://www.scons.org/wiki/AdvancedBuildExample for an example)
-build_dir = 'build'
+# Put build output in a separate dir, which depends on the current configuration
+# See also http://www.scons.org/wiki/AdvancedBuildExample
+build_topdir = 'build'
+build_subdir = platform
+if dri:
+	build_subdir += "-dri"
+if x86:
+	build_subdir += "-x86"
+if debug:
+	build_subdir += "-debug"
+build_dir = os.path.join(build_topdir, build_subdir)
+
+# TODO: Build several variants at the same time?
+# http://www.scons.org/wiki/SimultaneousVariantBuilds
 
 SConscript(
 	'src/mesa/SConscript',
-- 
cgit v1.2.3


From 526bed2b956d88e1b371b2e43402f8a160113497 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Thu, 31 Jan 2008 14:26:39 +0900
Subject: gallium: Portability fixes.

---
 src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
index a2657dac59..b6af7cdedc 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
@@ -367,7 +367,7 @@ struct mm_pb_manager
 };
 
 
-static inline struct mm_pb_manager *
+static INLINE struct mm_pb_manager *
 mm_pb_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -385,7 +385,7 @@ struct mm_buffer
 };
 
 
-static inline struct mm_buffer *
+static INLINE struct mm_buffer *
 mm_buffer(struct pb_buffer *buf)
 {
    assert(buf);
-- 
cgit v1.2.3


From c7403b18476c9a8f767fa7d33cb1efd643b30dd6 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Thu, 31 Jan 2008 11:57:15 +0000
Subject: tgsi: Use ESI instead of EBX as temp reg on non-win32

---
 src/mesa/pipe/tgsi/exec/tgsi_sse2.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
index f8660e7ad1..df0c698301 100755
--- a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
+++ b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
@@ -198,9 +198,15 @@ get_output_base( void )
 static struct x86_reg
 get_temp_base( void )
 {
+#ifdef WIN32
    return x86_make_reg(
       file_REG32,
       reg_BX );
+#else
+   return x86_make_reg(
+      file_REG32,
+      reg_SI );
+#endif
 }
 
 static struct x86_reg
-- 
cgit v1.2.3


From 4d3f3f749f0675c07de2809a9c0dc843f29e9873 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 08:12:47 -0700
Subject: Cell: set GALLIUM_CELL_VS env var to enable SPU-based vertex
 transformation

---
 src/mesa/pipe/cell/ppu/cell_context.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/mesa/pipe/cell/ppu/cell_context.c b/src/mesa/pipe/cell/ppu/cell_context.c
index 4885cd0d2c..bbe1fd7a11 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.c
+++ b/src/mesa/pipe/cell/ppu/cell_context.c
@@ -162,8 +162,12 @@ cell_draw_create(struct cell_context *cell)
 {
    struct draw_context *draw = draw_create();
 
-   draw->shader_queue_flush = cell_vertex_shader_queue_flush;
-   draw->driver_private = cell;
+   if (getenv("GALLIUM_CELL_VS")) {
+      /* plug in SPU-based vertex transformation code */
+      draw->shader_queue_flush = cell_vertex_shader_queue_flush;
+      draw->driver_private = cell;
+   }
+
    return draw;
 }
 
-- 
cgit v1.2.3


From 7978c749fb2a267e9575c1280557da4cd33e2380 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 08:21:38 -0700
Subject: Cell: SIMD-ize const_coeff()

---
 src/mesa/pipe/cell/spu/spu_tri.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index e436e153ec..08b8bf0c9c 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -723,24 +723,18 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
 
 /**
  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
- * The value value comes from vertex->data[slot][i].
- * The result will be put into setup.coef[slot].a0[i].
+ * The value value comes from vertex->data[slot].
+ * The result will be put into setup.coef[slot].a0.
  * \param slot  which attribute slot 
- * \param i  which component of the slot (0..3)
  */
-static void const_coeff(uint slot)
+static INLINE void const_coeff(uint slot)
 {
-   uint i;
-   ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
-
-   for (i = 0; i < 4; i++) {
-      setup.coef[slot].dadx.f[i] = 0;
-      setup.coef[slot].dady.f[i] = 0;
-
-      /* need provoking vertex info!
-       */
-      setup.coef[slot].a0.f[i] = setup.vprovoke->data[slot][i];
-   }
+   setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].a0.f[0] = setup.vprovoke->data[slot][0];
+   setup.coef[slot].a0.f[1] = setup.vprovoke->data[slot][1];
+   setup.coef[slot].a0.f[2] = setup.vprovoke->data[slot][2];
+   setup.coef[slot].a0.f[3] = setup.vprovoke->data[slot][3];
 }
 
 
-- 
cgit v1.2.3


From 6c59de9a7bcc025ba070b854d79bf4fb8dcccabf Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 13:32:29 -0700
Subject: gallium: fix get/put typo regression

This came from commit f3aa4de034b0d791ce2e38e8aeb3b3abdb4e3b50 on 1/22/08.
Fixes strange Z buffer glitches seen in progs/glsl/texdemo1.c
---
 src/mesa/pipe/softpipe/sp_tile_cache.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/mesa/pipe/softpipe/sp_tile_cache.c b/src/mesa/pipe/softpipe/sp_tile_cache.c
index 451e157abf..ccf367a5e4 100644
--- a/src/mesa/pipe/softpipe/sp_tile_cache.c
+++ b/src/mesa/pipe/softpipe/sp_tile_cache.c
@@ -415,8 +415,8 @@ sp_get_cached_tile(struct softpipe_context *softpipe,
          /* put dirty tile back in framebuffer */
          if (tc->depth_stencil) {
             pipe_put_tile_raw(pipe, ps,
-                           tile->x, tile->y, TILE_SIZE, TILE_SIZE,
-                           tile->data.depth32, 0/*STRIDE*/);
+                              tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                              tile->data.depth32, 0/*STRIDE*/);
          }
          else {
             pipe_put_tile_rgba(pipe, ps,
@@ -441,9 +441,9 @@ sp_get_cached_tile(struct softpipe_context *softpipe,
       else {
          /* get new tile data from surface */
          if (tc->depth_stencil) {
-            pipe_put_tile_raw(pipe, ps,
-                           tile->x, tile->y, TILE_SIZE, TILE_SIZE,
-                           tile->data.depth32, 0/*STRIDE*/);
+            pipe_get_tile_raw(pipe, ps,
+                              tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                              tile->data.depth32, 0/*STRIDE*/);
          }
          else {
             pipe_get_tile_rgba(pipe, ps,
-- 
cgit v1.2.3


From 5ee218a02186188f5819e5a4e1e15296bbd57322 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 13:36:00 -0700
Subject: gallium: Fix z clear bug when TILE_CLEAR_OPTIMIZATION==0

---
 src/mesa/pipe/softpipe/sp_clear.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/pipe/softpipe/sp_clear.c b/src/mesa/pipe/softpipe/sp_clear.c
index 571f64b38d..8d295a30ca 100644
--- a/src/mesa/pipe/softpipe/sp_clear.c
+++ b/src/mesa/pipe/softpipe/sp_clear.c
@@ -55,7 +55,9 @@ softpipe_clear(struct pipe_context *pipe, struct pipe_surface *ps,
 
    if (ps == sp_tile_cache_get_surface(softpipe->zsbuf_cache)) {
       sp_tile_cache_clear(softpipe->zsbuf_cache, clearValue);
+#if TILE_CLEAR_OPTIMIZATION
       return;
+#endif
    }
 
    for (i = 0; i < softpipe->framebuffer.num_cbufs; i++) {
-- 
cgit v1.2.3


From 26fff001e786d88041d9db4c35949b50849f6a59 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 13:37:01 -0700
Subject: gallium: comments about fragment Z computation

---
 src/mesa/pipe/softpipe/sp_quad_fs.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/mesa/pipe/softpipe/sp_quad_fs.c b/src/mesa/pipe/softpipe/sp_quad_fs.c
index c9cc8afa0c..90691c6065 100644
--- a/src/mesa/pipe/softpipe/sp_quad_fs.c
+++ b/src/mesa/pipe/softpipe/sp_quad_fs.c
@@ -168,6 +168,11 @@ shade_quad(
              sizeof( quad->outputs.color ) );
    }
 
+   /*
+    * XXX the following code for updating quad->outputs.depth
+    * isn't really needed if we did early z testing.
+    */
+
    /* store result Z */
    if (qss->depthOutSlot >= 0) {
       /* output[slot] is new Z */
@@ -181,6 +186,10 @@ shade_quad(
       uint i;
       for (i = 0; i < 4; i++) {
          quad->outputs.depth[i] = machine->Inputs[0].xyzw[2].f[i];
+         /* XXX not sure the above line is always correct.  The following
+          * might be better:
+         quad->outputs.depth[i] = machine->QuadPos.xyzw[2].f[i];
+          */
       }
    }
 
-- 
cgit v1.2.3


From a4c7c8a6ee15d793d08e448f9ca8e2100bbe748c Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 14:05:04 -0700
Subject: Fix problem in mapping vertex program outputs (found with "spring"
 game engine)

If the vertex program writes to an output that's not consumed by the
fragment program, map the vp output to an unused slot.
---
 src/mesa/state_tracker/st_atom_shader.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 1ed9333556..9196918509 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -226,9 +226,11 @@ find_translated_vp(struct st_context *st,
             GLint fpInAttrib = vp_out_to_fp_in(outAttr);
             if (fpInAttrib >= 0) {
                GLuint fpInSlot = stfp->input_to_slot[fpInAttrib];
-               GLuint vpOutSlot = stfp->fs->state.input_map[fpInSlot];
-               xvp->output_to_slot[outAttr] = vpOutSlot;
-               numVpOuts++;
+               if (fpInSlot != ~0) {
+                  GLuint vpOutSlot = stfp->fs->state.input_map[fpInSlot];
+                  xvp->output_to_slot[outAttr] = vpOutSlot;
+                  numVpOuts++;
+               }
             }
             else if (outAttr == VERT_RESULT_PSIZ ||
                      outAttr == VERT_RESULT_BFC0 ||
@@ -247,7 +249,7 @@ find_translated_vp(struct st_context *st,
        * We could use this info to do dead code elimination in the
        * vertex program.
        */
-      dummySlot = stfp->num_input_slots;
+      dummySlot = numVpOuts;
 
       /* Map vert program outputs that aren't used to the dummy slot */
       for (outAttr = 0; outAttr < VERT_RESULT_MAX; outAttr++) {
-- 
cgit v1.2.3


From 9aa37ad5401959fb43d39724d6efb5088087e56e Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 14:35:25 -0700
Subject: gallium: fix problem in which texcoords and varying vars got mapped
 to the same slot

This fixes the glsl/bump.c and glsl/texdemo1.c programs
---
 src/mesa/state_tracker/st_program.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 1f1e6500e0..84a9094001 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -33,6 +33,7 @@
 
 #include "main/imports.h"
 #include "main/mtypes.h"
+#include "shader/prog_print.h"
 
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
@@ -68,6 +69,7 @@ st_translate_vertex_program(struct st_context *st,
    struct pipe_shader_state vs;
    const struct cso_vertex_shader *cso;
    GLuint attr, i;
+   GLuint num_generic = 0;
 
    memset(&vs, 0, sizeof(vs));
 
@@ -117,7 +119,7 @@ st_translate_vertex_program(struct st_context *st,
          case VERT_ATTRIB_TEX6:
          case VERT_ATTRIB_TEX7:
             vs.input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            vs.input_semantic_index[slot] = attr - VERT_ATTRIB_TEX0;
+            vs.input_semantic_index[slot] = num_generic++;
             break;
          case VERT_ATTRIB_GENERIC0:
          case VERT_ATTRIB_GENERIC1:
@@ -129,7 +131,7 @@ st_translate_vertex_program(struct st_context *st,
          case VERT_ATTRIB_GENERIC7:
             assert(attr < VERT_ATTRIB_MAX);
             vs.input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            vs.input_semantic_index[slot] = attr - VERT_ATTRIB_GENERIC0;
+            vs.input_semantic_index[slot] = num_generic++;
             break;
          default:
             assert(0);
@@ -143,6 +145,7 @@ st_translate_vertex_program(struct st_context *st,
       vs.output_semantic_index[i] = 0;
    }
 
+   num_generic = 0;
    /*
     * Determine number of outputs, the (default) output register
     * mapping and the semantic information for each output.
@@ -207,14 +210,14 @@ st_translate_vertex_program(struct st_context *st,
          case VERT_RESULT_TEX6:
          case VERT_RESULT_TEX7:
             vs.output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            vs.output_semantic_index[slot] = attr - VERT_RESULT_TEX0;
+            vs.output_semantic_index[slot] = num_generic++;
             break;
          case VERT_RESULT_VAR0:
             /* fall-through */
          default:
             assert(attr - VERT_RESULT_VAR0 < MAX_VARYING);
             vs.output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            vs.output_semantic_index[slot] = attr - VERT_RESULT_VAR0;
+            vs.output_semantic_index[slot] = num_generic++;
          }
       }
    }
@@ -258,6 +261,9 @@ st_translate_vertex_program(struct st_context *st,
    cso = st_cached_vs_state(st, &vs);
    stvp->cso = cso;
 
+   if (0)
+      _mesa_print_program(&stvp->Base.Base);
+
    if (TGSI_DEBUG)
       tgsi_dump( tokensOut, 0 );
 }
@@ -286,6 +292,7 @@ st_translate_fragment_program(struct st_context *st,
    GLuint attr;
    const GLbitfield inputsRead = stfp->Base.Base.InputsRead;
    GLuint vslot = 0;
+   GLuint num_generic = 0;
 
    memset(&fs, 0, sizeof(fs));
 
@@ -338,14 +345,14 @@ st_translate_fragment_program(struct st_context *st,
          case FRAG_ATTRIB_TEX6:
          case FRAG_ATTRIB_TEX7:
             fs.input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            fs.input_semantic_index[slot] = attr - FRAG_ATTRIB_TEX0;
+            fs.input_semantic_index[slot] = num_generic++;
             interpMode[slot] = TGSI_INTERPOLATE_PERSPECTIVE;
             break;
          case FRAG_ATTRIB_VAR0:
             /* fall-through */
          default:
             fs.input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            fs.input_semantic_index[slot] = attr - FRAG_ATTRIB_VAR0;
+            fs.input_semantic_index[slot] = num_generic++;
             interpMode[slot] = TGSI_INTERPOLATE_PERSPECTIVE;
          }
       }
@@ -415,6 +422,9 @@ st_translate_fragment_program(struct st_context *st,
    cso = st_cached_fs_state(st, &fs);
    stfp->fs = cso;
 
+   if (0)
+      _mesa_print_program(&stfp->Base.Base);
+
    if (TGSI_DEBUG)
       tgsi_dump( tokensOut, 0/*TGSI_DUMP_VERBOSE*/ );
 
-- 
cgit v1.2.3


From 41d1179fa68fd8987cd09f26f32416963d235744 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 17:05:43 -0700
Subject: fix typo

---
 src/mesa/pipe/p_shader_tokens.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/pipe/p_shader_tokens.h b/src/mesa/pipe/p_shader_tokens.h
index e9d1d66bda..3ce35310f6 100644
--- a/src/mesa/pipe/p_shader_tokens.h
+++ b/src/mesa/pipe/p_shader_tokens.h
@@ -626,7 +626,7 @@ struct tgsi_src_register_ext
 
 /*
  * If tgsi_src_register_ext::Type is TGSI_SRC_REGISTER_EXT_TYPE_SWZ,
- * it should be cast to tgsi_src_register_ext_extswz.
+ * it should be cast to tgsi_src_register_ext_swz.
  * 
  * If tgsi_src_register_ext::Type is TGSI_SRC_REGISTER_EXT_TYPE_MOD,
  * it should be cast to tgsi_src_register_ext_mod.
-- 
cgit v1.2.3


From 0c0c62dd2a7d8d5f8bbd91bc7443f27abb059ad4 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Thu, 31 Jan 2008 17:22:07 -0800
Subject: Fix using "ccache ppu-gcc" for CC and fix parallel builds

CC wasn't quoted in a couple places in src/mesa/Makefile.  Also, the
OSMesa link was missing a dependency.
---
 src/mesa/Makefile | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/mesa/Makefile b/src/mesa/Makefile
index b16d74bf49..720f1b2e02 100644
--- a/src/mesa/Makefile
+++ b/src/mesa/Makefile
@@ -125,24 +125,25 @@ osmesa-only: depend subdirs $(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME)
 # Make the GL library
 $(TOP)/$(LIB_DIR)/$(GL_LIB_NAME): $(STAND_ALONE_OBJECTS) $(PIPE_LIB) $(CELL_LIB) $(CELL_LIB_SPU) $(LLVM_LIB)
 	@ $(TOP)/bin/mklib -o $(GL_LIB) \
-		-linker $(CC) \
+		-linker "$(CC)" \
 		-major $(GL_MAJOR) -minor $(GL_MINOR) -patch $(GL_TINY) \
 		-install $(TOP)/$(LIB_DIR) \
 		$(MKLIB_OPTIONS) $(STAND_ALONE_OBJECTS) \
 		$(PIPE_LIB) $(CELL_LIB) $(CELL_LIB_SPU) $(LLVM_LIB) $(GL_LIB_DEPS)
 
 # Make the OSMesa library
-$(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME): $(OSMESA_DRIVER_OBJECTS) $(OSMESA16_OBJECTS)
+$(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME): $(OSMESA_DRIVER_OBJECTS) \
+		$(OSMESA16_OBJECTS) $(TOP)/$(LIB_DIR)/$(GL_LIB_NAME)
 	@ if [ "${DRIVER_DIRS}" = "osmesa" ] ; then \
 		$(TOP)/bin/mklib -o $(OSMESA_LIB) \
-			-linker $(CC) \
+			-linker "$(CC)" \
 			-major $(MESA_MAJOR) \
 			-minor $(MESA_MINOR) -patch $(MESA_TINY) \
 			-install $(TOP)/$(LIB_DIR) $(MKLIB_OPTIONS) \
 			$(OSMESA_LIB_DEPS) $(OSMESA16_OBJECTS) ; \
 	else \
 		$(TOP)/bin/mklib -o $(OSMESA_LIB) \
-			-linker $(CC) \
+			-linker "$(CC)" \
 			-major $(MESA_MAJOR) \
 			-minor $(MESA_MINOR) -patch $(GL_TINY) \
 			-install $(TOP)/$(LIB_DIR) $(MKLIB_OPTIONS) \
-- 
cgit v1.2.3


From 4f7dcb0e04cbdb95684bf415133c0e7861839f96 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 09:27:57 -0700
Subject: Cell: store current tile status in cur_tile_status_c/z, add
 TILE_STATUS_GETTING

---
 src/mesa/pipe/cell/spu/spu_render.c | 36 ++++++++++++++++-----
 src/mesa/pipe/cell/spu/spu_tile.c   |  1 +
 src/mesa/pipe/cell/spu/spu_tile.h   |  8 +++--
 src/mesa/pipe/cell/spu/spu_tri.c    | 62 ++++++++++++++++++++++++++++++-------
 src/mesa/pipe/cell/spu/spu_tri.h    |  2 +-
 5 files changed, 87 insertions(+), 22 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
index f506095116..ca54a103bd 100644
--- a/src/mesa/pipe/cell/spu/spu_render.c
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -95,13 +95,15 @@ static INLINE void
 get_cz_tiles(uint tx, uint ty)
 {
    if (spu.depth_stencil.depth.enabled) {
-      if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) {
+      if (cur_tile_status_z != TILE_STATUS_CLEAR) {
          get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
+         cur_tile_status_z = TILE_STATUS_GETTING;
       }
    }
 
-   if (tile_status[ty][tx] != TILE_STATUS_CLEAR) {
+   if (cur_tile_status_c != TILE_STATUS_CLEAR) {
       get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
+      cur_tile_status_c = TILE_STATUS_GETTING;
    }
 }
 
@@ -112,14 +114,24 @@ get_cz_tiles(uint tx, uint ty)
 static INLINE void
 put_cz_tiles(uint tx, uint ty)
 {
-   if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) {
+   if (cur_tile_status_z == TILE_STATUS_DIRTY) {
+      /* tile was modified and needs to be written back */
       put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
-      tile_status_z[ty][tx] = TILE_STATUS_DEFINED;
+      cur_tile_status_z = TILE_STATUS_DEFINED;
+   }
+   else if (cur_tile_status_z == TILE_STATUS_GETTING) {
+      /* tile was never used */
+      cur_tile_status_z = TILE_STATUS_DEFINED;
    }
 
-   if (tile_status[ty][tx] == TILE_STATUS_DIRTY) {
+   if (cur_tile_status_c == TILE_STATUS_DIRTY) {
+      /* tile was modified and needs to be written back */
       put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
-      tile_status[ty][tx] = TILE_STATUS_DEFINED;
+      cur_tile_status_c = TILE_STATUS_DEFINED;
+   }
+   else if (cur_tile_status_c == TILE_STATUS_GETTING) {
+      /* tile was never used */
+      cur_tile_status_c = TILE_STATUS_DEFINED;
    }
 }
 
@@ -238,8 +250,13 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       if (!my_tile(tx, ty))
          continue;
 
+      cur_tile_status_c = tile_status[ty][tx];
+      cur_tile_status_z = tile_status_z[ty][tx];
+
       get_cz_tiles(tx, ty);
 
+      uint drawn = 0;
+
       /* loop over tris */
       for (j = 0; j < render->num_indexes; j += 3) {
          const float *v0, *v1, *v2;
@@ -248,13 +265,18 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
 
-         tri_draw(v0, v1, v2, tx, ty);
+         drawn += tri_draw(v0, v1, v2, tx, ty);
       }
 
+      //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
+
       /* write color/z tiles back to main framebuffer, if dirtied */
       put_cz_tiles(tx, ty);
 
       wait_put_cz_tiles(); /* XXX seems unnecessary... */
+
+      tile_status[ty][tx] = cur_tile_status_c;
+      tile_status_z[ty][tx] = cur_tile_status_z;
    }
 
    if (Debug)
diff --git a/src/mesa/pipe/cell/spu/spu_tile.c b/src/mesa/pipe/cell/spu/spu_tile.c
index ca1352f9f8..aea4785bc2 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.c
+++ b/src/mesa/pipe/cell/spu/spu_tile.c
@@ -37,6 +37,7 @@ tile_t ztile ALIGN16_ATTRIB;
 ubyte tile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 ubyte tile_status_z[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
+ubyte cur_tile_status_c, cur_tile_status_z;
 
 
 void
diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index 18d1b3c117..1f123a2b7b 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -51,12 +51,16 @@ extern tile_t ztile ALIGN16_ATTRIB;
 
 
 #define TILE_STATUS_CLEAR   1
-#define TILE_STATUS_DEFINED 2  /**< defined pixel data */
-#define TILE_STATUS_DIRTY   3  /**< modified, but not put back yet */
+#define TILE_STATUS_DEFINED 2  /**< defined in FB, but not in local store */
+#define TILE_STATUS_CLEAN   3  /**< in local store, but not changed */
+#define TILE_STATUS_DIRTY   4  /**< modified locally, but not put back yet */
+#define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
 
 extern ubyte tile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 extern ubyte tile_status_z[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
+extern ubyte cur_tile_status_c, cur_tile_status_z;
+
 
 void
 get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf);
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 08b8bf0c9c..a32878d917 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -299,16 +299,23 @@ do_depth_test(int x, int y, unsigned int mask)
 
    zvals.v = eval_z((float) x, (float) y);
 
-   if (tile_status_z[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
+   if (cur_tile_status_c == TILE_STATUS_CLEAR) {
       /* now, _really_ clear the tile */
       clear_z_tile(&ztile);
+      cur_tile_status_z = TILE_STATUS_DIRTY;
    }
-   else if (tile_status_z[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
+
+#if 0
+   if (cur_tile_status_z == TILE_STATUS_CLEAR) {
+      /* now, _really_ clear the tile */
+      clear_z_tile(&ztile);
+   }
+   else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
       /* make sure we've got the tile from main mem */
       wait_on_mask(1 << TAG_READ_TILE_Z);
    }
-   tile_status_z[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
-
+   cur_tile_status_z = TILE_STATUS_DIRTY;
+#endif
 
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
       zvals.v = spu_mul(zvals.v, zscale16.v);
@@ -380,6 +387,9 @@ do_depth_test(int x, int y, unsigned int mask)
       }
    }
 
+   if (mask)
+      cur_tile_status_z = TILE_STATUS_DIRTY;
+
    return mask;
 }
 
@@ -397,15 +407,15 @@ do_depth_test_simd(int x, int y, vector unsigned int quadmask)
 
    zvals.v = eval_z((float) x, (float) y);
 
-   if (tile_status_z[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
+   if (cur_tile_status_z == TILE_STATUS_CLEAR) {
       /* now, _really_ clear the tile */
       clear_z_tile(&ztile);
    }
-   else if (tile_status_z[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
+   else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
       /* make sure we've got the tile from main mem */
       wait_on_mask(1 << TAG_READ_TILE_Z);
    }
-   tile_status_z[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
+   cur_tile_status_z = TILE_STATUS_DIRTY;
 
    /* XXX fetch Z value sooner to hide latency here */
    zmask = spu_cmpgt(ztile.f4[ix][iy].v, zvals.v);
@@ -462,15 +472,23 @@ emit_quad( int x, int y, mask_t mask )
    if (mask)
 #endif
    {
-      if (tile_status[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
+      if (cur_tile_status_c == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
          clear_c_tile(&ctile);
       }
-      else if (tile_status[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
+
+#if 0
+      if (cur_tile_status_c == TILE_STATUS_CLEAR) {
+         /* now, _really_ clear the tile */
+         clear_c_tile(&ctile);
+         cur_tile_status_c = TILE_STATUS_DIRTY;
+      }
+      else if (cur_tile_status_c != TILE_STATUS_DIRTY) {
          /* make sure we've got the tile from main mem */
          wait_on_mask(1 << TAG_READ_TILE_COLOR);
       }
-      tile_status[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
+#endif
+      cur_tile_status_c = TILE_STATUS_DIRTY;
 
 #if SIMD_Z
       if (spu_extract(mask, 0))
@@ -970,7 +988,7 @@ static void subtriangle( struct edge *eleft,
  * Draw triangle into tile at (tx, ty) (tile coords)
  * The tile data should have already been fetched.
  */
-void
+boolean
 tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 {
    setup.tx = tx;
@@ -985,7 +1003,7 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    if (!setup_sort_vertices((struct vertex_header *) v0,
                             (struct vertex_header *) v1,
                             (struct vertex_header *) v2)) {
-      return; /* totally clipped */
+      return FALSE; /* totally clipped */
    }
 
    setup_tri_coefficients();
@@ -999,6 +1017,24 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 
    /*   init_constant_attribs( setup ); */
       
+   if (cur_tile_status_c == TILE_STATUS_GETTING) {
+      /* wait for mfc_get() to complete */
+      wait_on_mask(1 << TAG_READ_TILE_COLOR);
+      cur_tile_status_c = TILE_STATUS_CLEAN;
+   }
+
+   ASSERT(cur_tile_status_c != TILE_STATUS_DEFINED);
+
+   if (spu.depth_stencil.depth.enabled) {
+      if (cur_tile_status_z == TILE_STATUS_GETTING) {
+         /* wait for mfc_get() to complete */
+         wait_on_mask(1 << TAG_READ_TILE_Z);
+         cur_tile_status_z = TILE_STATUS_CLEAN;
+      }
+   ASSERT(cur_tile_status_z != TILE_STATUS_DEFINED);
+   }
+
+
    if (setup.oneoverarea < 0.0) {
       /* emaj on left:
        */
@@ -1013,4 +1049,6 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    }
 
    flush_spans();
+
+   return TRUE;
 }
diff --git a/src/mesa/pipe/cell/spu/spu_tri.h b/src/mesa/pipe/cell/spu/spu_tri.h
index 86c42b6339..aa694dd7c9 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.h
+++ b/src/mesa/pipe/cell/spu/spu_tri.h
@@ -30,7 +30,7 @@
 #define SPU_TRI_H
 
 
-extern void
+extern boolean
 tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
 
 
-- 
cgit v1.2.3


From 7a0099b9f3f4cbdb0893a3f11da84326dcf86179 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 13:45:58 -0700
Subject: Cell: implement Z16 and Z32 testing with SIMD instructions.

---
 src/mesa/pipe/cell/spu/spu_tile.h  |   3 +-
 src/mesa/pipe/cell/spu/spu_tri.c   | 222 +++++--------------------------------
 src/mesa/pipe/cell/spu/spu_ztest.h | 135 ++++++++++++++++++++++
 3 files changed, 163 insertions(+), 197 deletions(-)
 create mode 100644 src/mesa/pipe/cell/spu/spu_ztest.h

diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index 1f123a2b7b..4b1ef2a4c8 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -42,7 +42,8 @@
 typedef union {
    ushort t16[TILE_SIZE][TILE_SIZE];
    uint   t32[TILE_SIZE][TILE_SIZE];
-   float4 f4[TILE_SIZE/2][TILE_SIZE/2];
+   vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
+   vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
 } tile_t;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index a32878d917..a26a4f098d 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -39,18 +39,11 @@
 #include "spu_tile.h"
 #include "spu_tri.h"
 
+#include "spu_ztest.h"
 
-/*
- * If SIMD_Z=1 the Z buffer is floating point and we use vector instructions
- * to do Z testing/updating.
- */
-#define SIMD_Z 0
 
-#if SIMD_Z
+/** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
 typedef vector unsigned int mask_t;
-#else
-typedef uint mask_t;
-#endif
 
 
 /**
@@ -282,20 +275,11 @@ pack_colors(uint uicolors[4], const float4 fcolors[4])
 }
 
 
-
-static unsigned int
-do_depth_test(int x, int y, unsigned int mask)
+static INLINE mask_t
+do_depth_test(int x, int y, mask_t quadmask)
 {
-   static const float4 zscale16
-      = {.f={65535.0, 65535.0, 65535.0, 65535.0}};
-   static const float4 zscale32
-      = {.f={(float)0xffffffff,
-             (float)0xffffffff,
-             (float)0xffffffff,
-             (float)0xffffffff}};
-   int ix = x - setup.cliprect_minx;
-   int iy = y - setup.cliprect_miny;
    float4 zvals;
+   mask_t mask;
 
    zvals.v = eval_z((float) x, (float) y);
 
@@ -305,129 +289,20 @@ do_depth_test(int x, int y, unsigned int mask)
       cur_tile_status_z = TILE_STATUS_DIRTY;
    }
 
-#if 0
-   if (cur_tile_status_z == TILE_STATUS_CLEAR) {
-      /* now, _really_ clear the tile */
-      clear_z_tile(&ztile);
-   }
-   else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
-      /* make sure we've got the tile from main mem */
-      wait_on_mask(1 << TAG_READ_TILE_Z);
-   }
-   cur_tile_status_z = TILE_STATUS_DIRTY;
-#endif
-
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
-      zvals.v = spu_mul(zvals.v, zscale16.v);
-      if (mask & MASK_TOP_LEFT) {
-         uint z = (uint) zvals.f[0];
-         if (z < ztile.t16[iy][ix])
-            ztile.t16[iy][ix] = z;
-         else
-            mask &= ~MASK_TOP_LEFT;
-      }
-
-      if (mask & MASK_TOP_RIGHT) {
-         uint z = (uint) zvals.f[1];
-         if (z < ztile.t16[iy][ix+1])
-            ztile.t16[iy][ix+1] = z;
-         else
-            mask &= ~MASK_TOP_RIGHT;
-      }
-
-      if (mask & MASK_BOTTOM_LEFT) {
-         uint z = (uint) zvals.f[2];
-         if (z < ztile.t16[iy+1][ix])
-            ztile.t16[iy+1][ix] = z;
-         else
-            mask &= ~MASK_BOTTOM_LEFT;
-      }
-
-      if (mask & MASK_BOTTOM_RIGHT) {
-         uint z = (uint) zvals.f[3];
-         if (z < ztile.t16[iy+1][ix+1])
-            ztile.t16[iy+1][ix+1] = z;
-         else
-            mask &= ~MASK_BOTTOM_RIGHT;
-      }
+      int ix = (x - setup.cliprect_minx) / 4;
+      int iy = (y - setup.cliprect_miny) / 2;
+      mask = spu_z16_test_less(zvals.v, &ztile.us8[iy][ix], x>>1, quadmask);
    }
    else {
-      zvals.v = spu_mul(zvals.v, zscale32.v);
-      ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
-      if (mask & MASK_TOP_LEFT) {
-         uint z = (uint) zvals.f[0];
-         if (z < ztile.t32[iy][ix])
-            ztile.t32[iy][ix] = z;
-         else
-            mask &= ~MASK_TOP_LEFT;
-      }
-
-      if (mask & MASK_TOP_RIGHT) {
-         uint z = (uint) zvals.f[1];
-         if (z < ztile.t32[iy][ix+1])
-            ztile.t32[iy][ix+1] = z;
-         else
-            mask &= ~MASK_TOP_RIGHT;
-      }
-
-      if (mask & MASK_BOTTOM_LEFT) {
-         uint z = (uint) zvals.f[2];
-         if (z < ztile.t32[iy+1][ix])
-            ztile.t32[iy+1][ix] = z;
-         else
-            mask &= ~MASK_BOTTOM_LEFT;
-      }
-
-      if (mask & MASK_BOTTOM_RIGHT) {
-         uint z = (uint) zvals.f[3];
-         if (z < ztile.t32[iy+1][ix+1])
-            ztile.t32[iy+1][ix+1] = z;
-         else
-            mask &= ~MASK_BOTTOM_RIGHT;
-      }
+      int ix = (x - setup.cliprect_minx) / 2;
+      int iy = (y - setup.cliprect_miny) / 2;
+      mask = spu_z32_test_less(zvals.v, &ztile.ui4[iy][ix], quadmask);
    }
-
-   if (mask)
-      cur_tile_status_z = TILE_STATUS_DIRTY;
-
    return mask;
 }
 
 
-
-
-static vector unsigned int
-do_depth_test_simd(int x, int y, vector unsigned int quadmask)
-{
-   int ix = (x - setup.cliprect_minx) / 2;
-   int iy = (y - setup.cliprect_miny) / 2;
-   float4 zvals;
-
-   vector unsigned int zmask;
-
-   zvals.v = eval_z((float) x, (float) y);
-
-   if (cur_tile_status_z == TILE_STATUS_CLEAR) {
-      /* now, _really_ clear the tile */
-      clear_z_tile(&ztile);
-   }
-   else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
-      /* make sure we've got the tile from main mem */
-      wait_on_mask(1 << TAG_READ_TILE_Z);
-   }
-   cur_tile_status_z = TILE_STATUS_DIRTY;
-
-   /* XXX fetch Z value sooner to hide latency here */
-   zmask = spu_cmpgt(ztile.f4[ix][iy].v, zvals.v);
-   zmask = spu_and(zmask, quadmask);
-
-   ztile.f4[ix][iy].v = spu_sel(ztile.f4[ix][iy].v, zvals.v, zmask);
-   //ztile.f4[ix][iy].v = spu_sel(zvals.v, ztile.f4[ix][iy].v, mask4);
-
-   return zmask;
-}
-
-
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  */
@@ -461,36 +336,18 @@ emit_quad( int x, int y, mask_t mask )
    }
 
    if (spu.depth_stencil.depth.enabled) {
-#if SIMD_Z
-      mask = do_depth_test_simd(x, y, mask);
-#else
       mask = do_depth_test(x, y, mask);
-#endif
    }
 
-#if !SIMD_Z
-   if (mask)
-#endif
-   {
-      if (cur_tile_status_c == TILE_STATUS_CLEAR) {
-         /* now, _really_ clear the tile */
-         clear_c_tile(&ctile);
-      }
+   /* If any bits in mask are set... */
+   if (spu_extract(spu_orx(mask), 0)) {
 
-#if 0
       if (cur_tile_status_c == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
          clear_c_tile(&ctile);
-         cur_tile_status_c = TILE_STATUS_DIRTY;
       }
-      else if (cur_tile_status_c != TILE_STATUS_DIRTY) {
-         /* make sure we've got the tile from main mem */
-         wait_on_mask(1 << TAG_READ_TILE_COLOR);
-      }
-#endif
       cur_tile_status_c = TILE_STATUS_DIRTY;
 
-#if SIMD_Z
       if (spu_extract(mask, 0))
          ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
       if (spu_extract(mask, 1))
@@ -499,20 +356,11 @@ emit_quad( int x, int y, mask_t mask )
          ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (spu_extract(mask, 3))
          ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
-#elif 0
+
+#if 0
       /* SIMD_Z with swizzled color buffer (someday) */
       vector float icolors = *((vector float *) &colors);
       ctile.f4[iy/2][ix/2].v = spu_sel(ctile.f4[iy/2][ix/2].v, icolors, mask);
-
-#else
-      if (mask & MASK_TOP_LEFT)
-         ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
-      if (mask & MASK_TOP_RIGHT)
-         ctile.t32[iy][ix+1] = colors[QUAD_TOP_RIGHT];
-      if (mask & MASK_BOTTOM_LEFT)
-         ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
-      if (mask & MASK_BOTTOM_RIGHT)
-         ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
 #endif
    }
 
@@ -533,38 +381,20 @@ static INLINE int block( int x )
 /**
  * Compute mask which indicates which pixels in the 2x2 quad are actually inside
  * the triangle's bounds.
- *
- * this is pretty nasty...  may need to rework flush_spans again to
- * fix it, if possible.
+ * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
  */
-static mask_t calculate_mask( int x )
+static INLINE mask_t calculate_mask( int x )
 {
-#if SIMD_Z
-   uint m0, m1, m2, m3;
-
-   m0 = (x >= setup.span.left[0] && x < setup.span.right[0]) * ~0;
-   m1 = (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) * ~0;
-   m2 = (x >= setup.span.left[1] && x < setup.span.right[1]) * ~0;
-   m3 = (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) * ~0;
-
-   return (vector unsigned int) {m0, m1, m2, m3};
-#else
-   unsigned mask = 0x0;
-
-   if (x >= setup.span.left[0] && x < setup.span.right[0]) 
-      mask |= MASK_TOP_LEFT;
-
-   if (x >= setup.span.left[1] && x < setup.span.right[1]) 
-      mask |= MASK_BOTTOM_LEFT;
-      
-   if (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) 
-      mask |= MASK_TOP_RIGHT;
-
-   if (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) 
-      mask |= MASK_BOTTOM_RIGHT;
-
+   /* This is a little tricky.
+    * Use & instead of && to avoid branches.
+    * Use negation to convert true/false to ~0/0 values.
+    */
+   mask_t mask;
+   mask = spu_insert(-((x   >= setup.span.left[0]) & (x   < setup.span.right[0])), mask, 0);
+   mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
+   mask = spu_insert(-((x   >= setup.span.left[1]) & (x   < setup.span.right[1])), mask, 2);
+   mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
    return mask;
-#endif
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_ztest.h b/src/mesa/pipe/cell/spu/spu_ztest.h
new file mode 100644
index 0000000000..5fefb15176
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_ztest.h
@@ -0,0 +1,135 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * Zbuffer/depth test code.
+ */
+
+
+#ifndef SPU_ZTEST_H
+#define SPU_ZTEST_H
+
+
+#ifdef __SPU__
+#include <spu_intrinsics.h>
+#endif
+
+
+
+/**
+ * Perform Z testing for a 16-bit/value Z buffer.
+ *
+ * \param zvals  vector of four fragment zvalues as floats
+ * \param zbuf   ptr to vector of ushort[8] zbuffer values.  Note that this
+ *               contains the Z values for 2 quads, 8 pixels.
+ * \param x      x coordinate of quad (only lsbit is significant)
+ * \param inMask indicates which fragments in the quad are alive
+ * \return new mask indicating which fragments are alive after ztest
+ */
+static INLINE vector unsigned int
+spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
+                  uint x, vector unsigned int inMask)
+{
+#define ZERO 0x80
+   vector unsigned int zvals_ui4, zbuf_ui4, mask;
+
+   /* convert floats to uints in [0, 65535] */
+   zvals_ui4 = spu_convtu(zvals, 32); /* convert to [0, 2^32] */
+   zvals_ui4 = spu_rlmask(zvals_ui4, -16);  /* right shift 16 */
+
+   /* XXX this conditional could be removed with a bit of work */
+   if (x & 1) {
+      /* convert zbuffer values from ushorts to uints */
+      /* gather lower four ushorts */
+      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
+                             (vector unsigned int) *zbuf,
+                             VEC_LITERAL(vector unsigned char,
+                                      ZERO, ZERO,  8,  9, ZERO, ZERO, 10, 11,
+                                      ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15));
+      /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
+      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
+      /* mask &= inMask */
+      mask = spu_and(mask, inMask);
+      /* zbuf = mask ? zval : zbuf */
+      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
+      /* convert zbuffer values from uints back to ushorts, preserve lower 4 */
+      *zbuf = (vector unsigned short)
+         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
+                     VEC_LITERAL(vector unsigned char,
+                                 16, 17, 18, 19, 20, 21, 22, 23,
+                                 2, 3, 6, 7, 10, 11, 14, 15));
+   }
+   else {
+      /* convert zbuffer values from ushorts to uints */
+      /* gather upper four ushorts */
+      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
+                             (vector unsigned int) *zbuf,
+                             VEC_LITERAL(vector unsigned char,
+                                         ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
+                                         ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7));
+      /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
+      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
+      /* mask &= inMask */
+      mask = spu_and(mask, inMask);
+      /* zbuf = mask ? zval : zbuf */
+      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
+      /* convert zbuffer values from uints back to ushorts, preserve upper 4 */
+      *zbuf = (vector unsigned short)
+         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
+                     VEC_LITERAL(vector unsigned char,
+                                 2, 3, 6, 7, 10, 11, 14, 15,
+                                 24, 25, 26, 27, 28, 29, 30, 31));
+   }
+   return mask;
+#undef ZERO
+}
+
+
+/**
+ * As above, but Zbuffer values as 32-bit uints
+ */
+static INLINE vector unsigned int
+spu_z32_test_less(vector float zvals, vector unsigned int *zbuf_ptr,
+                  vector unsigned int inMask)
+{
+   vector unsigned int zvals_ui4, mask, zbuf = *zbuf_ptr;
+
+   /* convert floats to uints in [0, 0xffffffff] */
+   zvals_ui4 = spu_convtu(zvals, 32);
+   /* mask = (zbuf < zvals_ui4) ? ~0 : 0 */
+   mask = spu_cmpgt(zbuf, zvals_ui4);
+   /* mask &= inMask */
+   mask = spu_and(mask, inMask);
+   /* zbuf = mask ? zval : zbuf */
+   *zbuf_ptr = spu_sel(zbuf, zvals_ui4, mask);
+
+   return mask;
+}
+
+
+#endif /* SPU_ZTEST_H */
-- 
cgit v1.2.3


From a3d5d7067e9928d64fef08893c0a0368c8d83e45 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 13:49:51 -0700
Subject: Cell: rename fields of the tile_t union

---
 src/mesa/pipe/cell/spu/spu_main.c    |  8 ++++++++
 src/mesa/pipe/cell/spu/spu_texture.c |  6 +++---
 src/mesa/pipe/cell/spu/spu_tile.c    |  4 ++--
 src/mesa/pipe/cell/spu/spu_tile.h    | 18 +++++-------------
 src/mesa/pipe/cell/spu/spu_tri.c     |  8 ++++----
 5 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index d6393048f5..7d6e910ad5 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -36,6 +36,7 @@
 #include "spu_render.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
+//#include "spu_test.h"
 #include "spu_vertex_shader.h"
 #include "pipe/cell/common.h"
 #include "pipe/p_defines.h"
@@ -495,6 +496,7 @@ one_time_init(void)
 }
 
 
+
 /* In some versions of the SDK the SPE main takes 'unsigned long' as a
  * parameter.  In others it takes 'unsigned long long'.  Use a define to
  * select between the two.
@@ -515,6 +517,8 @@ main(main_param_t speid, main_param_t argp)
 
    (void) speid;
 
+   ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
+
    one_time_init();
 
    if (Debug)
@@ -528,6 +532,10 @@ main(main_param_t speid, main_param_t argp)
            0  /* rid */);
    wait_on_mask( 1 << tag );
 
+#if 0
+   if (spu.init.id==0)
+      spu_test_misc();
+#endif
 
    main_loop();
 
diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 7a1ca097c0..c1dc6bfe90 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -97,10 +97,10 @@ get_tex_tile(uint i, uint j)
              spu.init.id, src, tex_tiles[pos].t32);
 #endif
 
-      ASSERT_ALIGN16(tex_tiles[pos].t32);
+      ASSERT_ALIGN16(tex_tiles[pos].ui);
       ASSERT_ALIGN16(src);
 
-      mfc_get(tex_tiles[pos].t32,  /* dest */
+      mfc_get(tex_tiles[pos].ui,  /* dest */
               (unsigned int) src,
               bytes_per_tile,      /* size */
               TAG_TEXTURE_TILE,
@@ -134,6 +134,6 @@ sample_texture(float4 texcoord)
    uint i = (uint) (texcoord.f[0] * spu.texture.width) % spu.texture.width;
    uint j = (uint) (texcoord.f[1] * spu.texture.height) % spu.texture.height;
    uint pos = get_tex_tile(i, j);
-   uint texel = tex_tiles[pos].t32[j % TILE_SIZE][i % TILE_SIZE];
+   uint texel = tex_tiles[pos].ui[j % TILE_SIZE][i % TILE_SIZE];
    return texel;
 }
diff --git a/src/mesa/pipe/cell/spu/spu_tile.c b/src/mesa/pipe/cell/spu/spu_tile.c
index aea4785bc2..fd65c2b49c 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.c
+++ b/src/mesa/pipe/cell/spu/spu_tile.c
@@ -56,7 +56,7 @@ get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf)
    printf("get_tile:  dest: %p  src: 0x%x  size: %d\n",
           tile, (unsigned int) src, bytesPerTile);
    */
-   mfc_get(tile->t32,  /* dest in local memory */
+   mfc_get(tile->ui,  /* dest in local memory */
            (unsigned int) src, /* src in main memory */
            bytesPerTile,
            tag,
@@ -82,7 +82,7 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf)
           spu.init.id,
           tile, (unsigned int) dst, bytesPerTile);
    */
-   mfc_put((void *) tile->t32,  /* src in local memory */
+   mfc_put((void *) tile->ui,  /* src in local memory */
            (unsigned int) dst,  /* dst in main memory */
            bytesPerTile,
            tag,
diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index 4b1ef2a4c8..85a0d55807 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -40,8 +40,8 @@
 
 
 typedef union {
-   ushort t16[TILE_SIZE][TILE_SIZE];
-   uint   t32[TILE_SIZE][TILE_SIZE];
+   ushort us[TILE_SIZE][TILE_SIZE];
+   uint   ui[TILE_SIZE][TILE_SIZE];
    vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
    vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
 } tile_t;
@@ -74,7 +74,7 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf);
 static INLINE void
 clear_c_tile(tile_t *ctile)
 {
-   memset32((uint*) ctile->t32,
+   memset32((uint*) ctile->ui,
             spu.fb.color_clear_value,
             TILE_SIZE * TILE_SIZE);
 }
@@ -84,23 +84,15 @@ static INLINE void
 clear_z_tile(tile_t *ztile)
 {
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
-      memset16((ushort*) ztile->t16,
+      memset16((ushort*) ztile->us,
                spu.fb.depth_clear_value,
                TILE_SIZE * TILE_SIZE);
    }
    else {
       ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
-#if SIMD_Z
-      union fi z;
-      z.f = 1.0;
-      memset32((uint*) ztile->t32,
-               z.i,/*spu.fb.depth_clear_value,*/
-               TILE_SIZE * TILE_SIZE);
-#else
-      memset32((uint*) ztile->t32,
+      memset32((uint*) ztile->ui,
                spu.fb.depth_clear_value,
                TILE_SIZE * TILE_SIZE);
-#endif
    }
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index a26a4f098d..b04b6841c0 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -349,13 +349,13 @@ emit_quad( int x, int y, mask_t mask )
       cur_tile_status_c = TILE_STATUS_DIRTY;
 
       if (spu_extract(mask, 0))
-         ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
+         ctile.ui[iy][ix] = colors[QUAD_TOP_LEFT];
       if (spu_extract(mask, 1))
-         ctile.t32[iy][ix+1] = colors[QUAD_TOP_RIGHT];
+         ctile.ui[iy][ix+1] = colors[QUAD_TOP_RIGHT];
       if (spu_extract(mask, 2))
-         ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
+         ctile.ui[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (spu_extract(mask, 3))
-         ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
+         ctile.ui[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
 
 #if 0
       /* SIMD_Z with swizzled color buffer (someday) */
-- 
cgit v1.2.3


From 1b6b5db4e2e891c62cfb868f2c6ae7f27b29709d Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 14:02:22 -0700
Subject: Cell: move ztest before color interp/packing

---
 src/mesa/pipe/cell/spu/spu_tri.c | 43 ++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index b04b6841c0..ae8fd17cc6 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -316,24 +316,6 @@ emit_quad( int x, int y, mask_t mask )
    setup.quad.mask = mask;
    sp->quad.first->run(sp->quad.first, &setup.quad);
 #else
-   /* Cell: "write" quad fragments to the tile by setting prim color */
-   const int ix = x - setup.cliprect_minx;
-   const int iy = y - setup.cliprect_miny;
-   uint colors[4];  /* indexed by QUAD_x */
-
-   if (spu.texture.start) {
-      float4 texcoords[4];
-      uint i;
-      eval_coeff(2, (float) x, (float) y, texcoords);
-      for (i = 0; i < 4; i++) {
-         colors[i] = sample_texture(texcoords[i]);
-      }
-   }
-   else {
-      float4 fcolors[4];
-      eval_coeff(1, (float) x, (float) y, fcolors);
-      pack_colors(colors, fcolors);
-   }
 
    if (spu.depth_stencil.depth.enabled) {
       mask = do_depth_test(x, y, mask);
@@ -341,6 +323,23 @@ emit_quad( int x, int y, mask_t mask )
 
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
+      const int ix = x - setup.cliprect_minx;
+      const int iy = y - setup.cliprect_miny;
+      uint colors[4];  /* indexed by QUAD_x */
+
+      if (spu.texture.start) {
+         float4 texcoords[4];
+         uint i;
+         eval_coeff(2, (float) x, (float) y, texcoords);
+         for (i = 0; i < 4; i++) {
+            colors[i] = sample_texture(texcoords[i]);
+         }
+      }
+      else {
+         float4 fcolors[4];
+         eval_coeff(1, (float) x, (float) y, fcolors);
+         pack_colors(colors, fcolors);
+      }
 
       if (cur_tile_status_c == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
@@ -348,6 +347,7 @@ emit_quad( int x, int y, mask_t mask )
       }
       cur_tile_status_c = TILE_STATUS_DIRTY;
 
+#if 1
       if (spu_extract(mask, 0))
          ctile.ui[iy][ix] = colors[QUAD_TOP_LEFT];
       if (spu_extract(mask, 1))
@@ -356,11 +356,10 @@ emit_quad( int x, int y, mask_t mask )
          ctile.ui[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (spu_extract(mask, 3))
          ctile.ui[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
-
-#if 0
+#else
       /* SIMD_Z with swizzled color buffer (someday) */
-      vector float icolors = *((vector float *) &colors);
-      ctile.f4[iy/2][ix/2].v = spu_sel(ctile.f4[iy/2][ix/2].v, icolors, mask);
+      vector unsigned int uicolors = *((vector unsigned int *) &colors);
+      ctile.ui4[iy/2][ix/2] = spu_sel(ctile.ui4[iy/2][ix/2], uicolors, mask);
 #endif
    }
 
-- 
cgit v1.2.3


From b25f1244ab8b082c01e45b1068c233f4c2b21fb3 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 15:20:07 -0700
Subject: Cell: remove commands from top-level while loop which should only
 appear in batch buffers

---
 src/mesa/pipe/cell/spu/spu_main.c | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 7d6e910ad5..1760de02b7 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -447,34 +447,22 @@ main_loop(void)
               0  /* rid */);
       wait_on_mask( 1 << tag );
 
+      /*
+       * NOTE: most commands should be contained in a batch buffer
+       */
+
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
          if (Debug)
             printf("SPU %u: EXIT\n", spu.init.id);
          exitFlag = 1;
          break;
-      case CELL_CMD_STATE_FRAMEBUFFER:
-         cmd_state_framebuffer(&cmd.fb);
-         break;
-      case CELL_CMD_CLEAR_SURFACE:
-         cmd_clear_surface(&cmd.clear);
-         break;
-      case CELL_CMD_RENDER:
-         {
-            uint pos_incr;
-            cmd_render(&cmd.render, &pos_incr);
-            assert(pos_incr == 0);
-         }
-         break;
       case CELL_CMD_VS_EXECUTE:
          spu_execute_vertex_shader(&draw, &cmd.vs);
          break;
       case CELL_CMD_BATCH:
          cmd_batch(opcode);
          break;
-      case CELL_CMD_FINISH:
-         cmd_finish();
-         break;
       default:
          printf("Bad opcode!\n");
       }
-- 
cgit v1.2.3


From 4fa69471c40832967d106a39d6cca9b872609fbd Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 15:24:00 -0700
Subject: Cell: deprecate some use of struct cell_command - it should go away
 completely

Also, remove ALIGN16_ATTRIB from structs that no longer need it.
---
 src/mesa/pipe/cell/common.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 05aeed83ab..7e193f31be 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -105,7 +105,7 @@ struct cell_command_framebuffer
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
-} ALIGN16_ATTRIB;
+};
 
 
 /**
@@ -116,7 +116,7 @@ struct cell_command_clear_surface
    uint opcode;
    uint surface; /**< Temporary: 0=color, 1=Z */
    uint value;
-} ALIGN16_ATTRIB;
+};
 
 
 /**
@@ -173,7 +173,7 @@ struct cell_command_render
    uint dummy3;
    uint min_index;
    boolean inline_verts;
-} ALIGN16_ATTRIB;
+};
 
 
 struct cell_command_release_verts
@@ -191,11 +191,14 @@ struct cell_command_texture
 
 
 /** XXX unions don't seem to work */
+/* XXX this should go away; all commands should be placed in batch buffers */
 struct cell_command
 {
+#if 0
    struct cell_command_framebuffer fb;
    struct cell_command_clear_surface clear;
    struct cell_command_render render;
+#endif
    struct cell_command_vs vs;
 } ALIGN16_ATTRIB;
 
-- 
cgit v1.2.3


From 245c6a4cd5753ce4ef0e5bfa739e5e6ac4e9ae4e Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 15:33:53 -0700
Subject: Cell: rename/move global vars

Put tile-related globals into spu_global struct.
Rename c/ztile fields to be more consistent.
---
 src/mesa/pipe/cell/spu/spu_main.c   | 28 +++++++++++++-------------
 src/mesa/pipe/cell/spu/spu_main.h   | 32 +++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_render.c | 40 ++++++++++++++++++-------------------
 src/mesa/pipe/cell/spu/spu_tile.c   | 11 +---------
 src/mesa/pipe/cell/spu/spu_tile.h   | 27 -------------------------
 src/mesa/pipe/cell/spu/spu_tri.c    | 38 +++++++++++++++++------------------
 6 files changed, 86 insertions(+), 90 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 1760de02b7..8e3987f6ef 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -92,24 +92,24 @@ really_clear_tiles(uint surfaceIndex)
    uint i;
 
    if (surfaceIndex == 0) {
-      clear_c_tile(&ctile);
+      clear_c_tile(&spu.ctile);
 
       for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
          uint tx = i % spu.fb.width_tiles;
          uint ty = i / spu.fb.width_tiles;
-         if (tile_status[ty][tx] == TILE_STATUS_CLEAR) {
-            put_tile(tx, ty, &ctile, TAG_SURFACE_CLEAR, 0);
+         if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
          }
       }
    }
    else {
-      clear_z_tile(&ztile);
+      clear_z_tile(&spu.ztile);
 
       for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
          uint tx = i % spu.fb.width_tiles;
          uint ty = i / spu.fb.width_tiles;
-         if (tile_status_z[ty][tx] == TILE_STATUS_CLEAR)
-            put_tile(tx, ty, &ctile, TAG_SURFACE_CLEAR, 1);
+         if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1);
       }
    }
 
@@ -133,11 +133,11 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 #if CLEAR_OPT
    /* set all tile's status to CLEAR */
    if (clear->surface == 0) {
-      memset(tile_status, TILE_STATUS_CLEAR, sizeof(tile_status));
+      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
       spu.fb.color_clear_value = clear->value;
    }
    else {
-      memset(tile_status_z, TILE_STATUS_CLEAR, sizeof(tile_status_z));
+      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
       spu.fb.depth_clear_value = clear->value;
    }
    return;
@@ -145,11 +145,11 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 
    if (clear->surface == 0) {
       spu.fb.color_clear_value = clear->value;
-      clear_c_tile(&ctile);
+      clear_c_tile(&spu.ctile);
    }
    else {
       spu.fb.depth_clear_value = clear->value;
-      clear_z_tile(&ztile);
+      clear_z_tile(&spu.ztile);
    }
 
    /*
@@ -161,9 +161,9 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
       uint tx = i % spu.fb.width_tiles;
       uint ty = i / spu.fb.width_tiles;
       if (clear->surface == 0)
-         put_tile(tx, ty, &ctile, TAG_SURFACE_CLEAR, 0);
+         put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
       else
-         put_tile(tx, ty, &ztile, TAG_SURFACE_CLEAR, 1);
+         put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
       /* XXX we don't want this here, but it fixes bad tile results */
    }
 
@@ -478,8 +478,8 @@ main_loop(void)
 static void
 one_time_init(void)
 {
-   memset(tile_status, TILE_STATUS_DEFINED, sizeof(tile_status));
-   memset(tile_status_z, TILE_STATUS_DEFINED, sizeof(tile_status_z));
+   memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
+   memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 8be5268f52..cce5e70802 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -36,6 +36,11 @@
 #include "pipe/p_state.h"
 
 
+
+#define MAX_WIDTH 1024
+#define MAX_HEIGHT 1024
+
+
 typedef union
 {
    vector float v;
@@ -43,6 +48,21 @@ typedef union
 } float4;
 
 
+typedef union {
+   ushort us[TILE_SIZE][TILE_SIZE];
+   uint   ui[TILE_SIZE][TILE_SIZE];
+   vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
+   vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
+} tile_t;
+
+
+#define TILE_STATUS_CLEAR   1
+#define TILE_STATUS_DEFINED 2  /**< defined in FB, but not in local store */
+#define TILE_STATUS_CLEAN   3  /**< in local store, but not changed */
+#define TILE_STATUS_DIRTY   4  /**< modified locally, but not put back yet */
+#define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
+
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -75,6 +95,18 @@ struct spu_global
 
    /* XXX more state to come */
 
+
+   /** current color and Z tiles */
+   tile_t ctile ALIGN16_ATTRIB;
+   tile_t ztile ALIGN16_ATTRIB;
+
+   /** Current tiles' status */
+   ubyte cur_ctile_status, cur_ztile_status;
+
+   /** Status of all tiles in framebuffer */
+   ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
index ca54a103bd..ab711d67fe 100644
--- a/src/mesa/pipe/cell/spu/spu_render.c
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -95,15 +95,15 @@ static INLINE void
 get_cz_tiles(uint tx, uint ty)
 {
    if (spu.depth_stencil.depth.enabled) {
-      if (cur_tile_status_z != TILE_STATUS_CLEAR) {
-         get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
-         cur_tile_status_z = TILE_STATUS_GETTING;
+      if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
+         get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
+         spu.cur_ztile_status = TILE_STATUS_GETTING;
       }
    }
 
-   if (cur_tile_status_c != TILE_STATUS_CLEAR) {
-      get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
-      cur_tile_status_c = TILE_STATUS_GETTING;
+   if (spu.cur_ctile_status != TILE_STATUS_CLEAR) {
+      get_tile(tx, ty, &spu.ctile, TAG_READ_TILE_COLOR, 0);
+      spu.cur_ctile_status = TILE_STATUS_GETTING;
    }
 }
 
@@ -114,24 +114,24 @@ get_cz_tiles(uint tx, uint ty)
 static INLINE void
 put_cz_tiles(uint tx, uint ty)
 {
-   if (cur_tile_status_z == TILE_STATUS_DIRTY) {
+   if (spu.cur_ztile_status == TILE_STATUS_DIRTY) {
       /* tile was modified and needs to be written back */
-      put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
-      cur_tile_status_z = TILE_STATUS_DEFINED;
+      put_tile(tx, ty, &spu.ztile, TAG_WRITE_TILE_Z, 1);
+      spu.cur_ztile_status = TILE_STATUS_DEFINED;
    }
-   else if (cur_tile_status_z == TILE_STATUS_GETTING) {
+   else if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
       /* tile was never used */
-      cur_tile_status_z = TILE_STATUS_DEFINED;
+      spu.cur_ztile_status = TILE_STATUS_DEFINED;
    }
 
-   if (cur_tile_status_c == TILE_STATUS_DIRTY) {
+   if (spu.cur_ctile_status == TILE_STATUS_DIRTY) {
       /* tile was modified and needs to be written back */
-      put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
-      cur_tile_status_c = TILE_STATUS_DEFINED;
+      put_tile(tx, ty, &spu.ctile, TAG_WRITE_TILE_COLOR, 0);
+      spu.cur_ctile_status = TILE_STATUS_DEFINED;
    }
-   else if (cur_tile_status_c == TILE_STATUS_GETTING) {
+   else if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
       /* tile was never used */
-      cur_tile_status_c = TILE_STATUS_DEFINED;
+      spu.cur_ctile_status = TILE_STATUS_DEFINED;
    }
 }
 
@@ -250,8 +250,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       if (!my_tile(tx, ty))
          continue;
 
-      cur_tile_status_c = tile_status[ty][tx];
-      cur_tile_status_z = tile_status_z[ty][tx];
+      spu.cur_ctile_status = spu.ctile_status[ty][tx];
+      spu.cur_ztile_status = spu.ztile_status[ty][tx];
 
       get_cz_tiles(tx, ty);
 
@@ -275,8 +275,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 
       wait_put_cz_tiles(); /* XXX seems unnecessary... */
 
-      tile_status[ty][tx] = cur_tile_status_c;
-      tile_status_z[ty][tx] = cur_tile_status_z;
+      spu.ctile_status[ty][tx] = spu.cur_ctile_status;
+      spu.ztile_status[ty][tx] = spu.cur_ztile_status;
    }
 
    if (Debug)
diff --git a/src/mesa/pipe/cell/spu/spu_tile.c b/src/mesa/pipe/cell/spu/spu_tile.c
index fd65c2b49c..12dc246328 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.c
+++ b/src/mesa/pipe/cell/spu/spu_tile.c
@@ -28,16 +28,7 @@
 
 
 #include "spu_tile.h"
-
-
-
-tile_t ctile ALIGN16_ATTRIB;
-tile_t ztile ALIGN16_ATTRIB;
-
-ubyte tile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-ubyte tile_status_z[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-
-ubyte cur_tile_status_c, cur_tile_status_z;
+#include "spu_main.h"
 
 
 void
diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index 85a0d55807..e53340a55a 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -35,33 +35,6 @@
 #include "pipe/cell/common.h"
 
 
-#define MAX_WIDTH 1024
-#define MAX_HEIGHT 1024
-
-
-typedef union {
-   ushort us[TILE_SIZE][TILE_SIZE];
-   uint   ui[TILE_SIZE][TILE_SIZE];
-   vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
-   vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
-} tile_t;
-
-
-extern tile_t ctile ALIGN16_ATTRIB;
-extern tile_t ztile ALIGN16_ATTRIB;
-
-
-#define TILE_STATUS_CLEAR   1
-#define TILE_STATUS_DEFINED 2  /**< defined in FB, but not in local store */
-#define TILE_STATUS_CLEAN   3  /**< in local store, but not changed */
-#define TILE_STATUS_DIRTY   4  /**< modified locally, but not put back yet */
-#define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
-
-extern ubyte tile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-extern ubyte tile_status_z[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-
-extern ubyte cur_tile_status_c, cur_tile_status_z;
-
 
 void
 get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf);
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index ae8fd17cc6..6f61a3d816 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -283,21 +283,21 @@ do_depth_test(int x, int y, mask_t quadmask)
 
    zvals.v = eval_z((float) x, (float) y);
 
-   if (cur_tile_status_c == TILE_STATUS_CLEAR) {
+   if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
       /* now, _really_ clear the tile */
-      clear_z_tile(&ztile);
-      cur_tile_status_z = TILE_STATUS_DIRTY;
+      clear_z_tile(&spu.ztile);
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
    }
 
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
       int ix = (x - setup.cliprect_minx) / 4;
       int iy = (y - setup.cliprect_miny) / 2;
-      mask = spu_z16_test_less(zvals.v, &ztile.us8[iy][ix], x>>1, quadmask);
+      mask = spu_z16_test_less(zvals.v, &spu.ztile.us8[iy][ix], x>>1, quadmask);
    }
    else {
       int ix = (x - setup.cliprect_minx) / 2;
       int iy = (y - setup.cliprect_miny) / 2;
-      mask = spu_z32_test_less(zvals.v, &ztile.ui4[iy][ix], quadmask);
+      mask = spu_z32_test_less(zvals.v, &spu.ztile.ui4[iy][ix], quadmask);
    }
    return mask;
 }
@@ -341,25 +341,25 @@ emit_quad( int x, int y, mask_t mask )
          pack_colors(colors, fcolors);
       }
 
-      if (cur_tile_status_c == TILE_STATUS_CLEAR) {
+      if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
-         clear_c_tile(&ctile);
+         clear_c_tile(&spu.ctile);
       }
-      cur_tile_status_c = TILE_STATUS_DIRTY;
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
 
 #if 1
       if (spu_extract(mask, 0))
-         ctile.ui[iy][ix] = colors[QUAD_TOP_LEFT];
+         spu.ctile.ui[iy][ix] = colors[QUAD_TOP_LEFT];
       if (spu_extract(mask, 1))
-         ctile.ui[iy][ix+1] = colors[QUAD_TOP_RIGHT];
+         spu.ctile.ui[iy][ix+1] = colors[QUAD_TOP_RIGHT];
       if (spu_extract(mask, 2))
-         ctile.ui[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
+         spu.ctile.ui[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (spu_extract(mask, 3))
-         ctile.ui[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
+         spu.ctile.ui[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
 #else
       /* SIMD_Z with swizzled color buffer (someday) */
       vector unsigned int uicolors = *((vector unsigned int *) &colors);
-      ctile.ui4[iy/2][ix/2] = spu_sel(ctile.ui4[iy/2][ix/2], uicolors, mask);
+      spu.ctile.ui4[iy/2][ix/2] = spu_sel(spu.ctile.ui4[iy/2][ix/2], uicolors, mask);
 #endif
    }
 
@@ -846,21 +846,21 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 
    /*   init_constant_attribs( setup ); */
       
-   if (cur_tile_status_c == TILE_STATUS_GETTING) {
+   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
       /* wait for mfc_get() to complete */
       wait_on_mask(1 << TAG_READ_TILE_COLOR);
-      cur_tile_status_c = TILE_STATUS_CLEAN;
+      spu.cur_ctile_status = TILE_STATUS_CLEAN;
    }
 
-   ASSERT(cur_tile_status_c != TILE_STATUS_DEFINED);
+   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 
    if (spu.depth_stencil.depth.enabled) {
-      if (cur_tile_status_z == TILE_STATUS_GETTING) {
+      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
          /* wait for mfc_get() to complete */
          wait_on_mask(1 << TAG_READ_TILE_Z);
-         cur_tile_status_z = TILE_STATUS_CLEAN;
+         spu.cur_ztile_status = TILE_STATUS_CLEAN;
       }
-   ASSERT(cur_tile_status_z != TILE_STATUS_DEFINED);
+      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
    }
 
 
-- 
cgit v1.2.3


From de5d995201d617aca729efbc2821efde4b05685f Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 15:45:02 -0700
Subject: Cell: New color packing functions (A8R8G8B8 and B8G8R8A8)

---
 src/mesa/pipe/cell/spu/spu_colorpack.h | 60 ++++++++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_tri.c       | 22 +++++--------
 2 files changed, 69 insertions(+), 13 deletions(-)
 create mode 100644 src/mesa/pipe/cell/spu/spu_colorpack.h

diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h
new file mode 100644
index 0000000000..56709bd9f3
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_colorpack.h
@@ -0,0 +1,60 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+#ifndef SPU_COLORPACK_H
+#define SPU_COLORPACK_H
+
+
+#include <vec_literal.h>
+#include <spu_intrinsics.h>
+
+/** Pack float RGBA to a 32-bit pixel: first byte of converted A, R, G, B. */
+static INLINE unsigned int
+spu_pack_A8R8G8B8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
+					  12, 0, 4, 8, 0, 0, 0, 0, 
+                                          0, 0, 0, 0, 0, 0, 0, 0));
+  return spu_extract(out, 0);
+}
+
+/** Pack float RGBA to a 32-bit pixel: first byte of converted B, G, R, A. */
+static INLINE unsigned int
+spu_pack_B8G8R8A8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
+					  8, 4, 0, 12, 0, 0, 0, 0, 
+                                          0, 0, 0, 0, 0, 0, 0, 0));
+  return spu_extract(out, 0);
+}
+
+
+#endif /* SPU_COLORPACK_H */
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 6f61a3d816..c82ca51000 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -29,11 +29,10 @@
  * Triangle rendering within a tile.
  */
 
-#include <pack_rgba8.h>
-
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
+#include "spu_colorpack.h"
 #include "spu_main.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
@@ -253,21 +252,18 @@ eval_z(float x, float y)
 static INLINE void
 pack_colors(uint uicolors[4], const float4 fcolors[4])
 {
-   /* XXX grab the code for _pack_rgba8() and use the shuffle
-    * command to do the swizzling seen here.
-    */
    switch (spu.fb.color_format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      uicolors[0] = _pack_rgba8(fcolors[0].f[3], fcolors[0].f[0], fcolors[0].f[1], fcolors[0].f[2]);
-      uicolors[1] = _pack_rgba8(fcolors[1].f[3], fcolors[1].f[0], fcolors[1].f[1], fcolors[1].f[2]);
-      uicolors[2] = _pack_rgba8(fcolors[2].f[3], fcolors[2].f[0], fcolors[2].f[1], fcolors[2].f[2]);
-      uicolors[3] = _pack_rgba8(fcolors[3].f[3], fcolors[0].f[0], fcolors[3].f[1], fcolors[3].f[2]);
+      uicolors[0] = spu_pack_A8R8G8B8(fcolors[0].v);
+      uicolors[1] = spu_pack_A8R8G8B8(fcolors[1].v);
+      uicolors[2] = spu_pack_A8R8G8B8(fcolors[2].v);
+      uicolors[3] = spu_pack_A8R8G8B8(fcolors[3].v);
       break;
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      uicolors[0] = _pack_rgba8(fcolors[0].f[2], fcolors[0].f[1], fcolors[0].f[0], fcolors[0].f[3]);
-      uicolors[1] = _pack_rgba8(fcolors[1].f[2], fcolors[1].f[1], fcolors[1].f[0], fcolors[1].f[3]);
-      uicolors[2] = _pack_rgba8(fcolors[2].f[2], fcolors[2].f[1], fcolors[2].f[0], fcolors[2].f[3]);
-      uicolors[3] = _pack_rgba8(fcolors[3].f[2], fcolors[3].f[1], fcolors[3].f[0], fcolors[3].f[3]);
+      uicolors[0] = spu_pack_B8G8R8A8(fcolors[0].v);
+      uicolors[1] = spu_pack_B8G8R8A8(fcolors[1].v);
+      uicolors[2] = spu_pack_B8G8R8A8(fcolors[2].v);
+      uicolors[3] = spu_pack_B8G8R8A8(fcolors[3].v);
       break;
    default:
       ASSERT(0);
-- 
cgit v1.2.3


From ae620d5c36c2742cb4a7429a461e02bd6dbef201 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 16:25:42 -0700
Subject: Cell: use global color_shuffle to remove a switch stmnt

---
 src/mesa/pipe/cell/spu/Makefile        |  2 +
 src/mesa/pipe/cell/spu/spu_colorpack.h |  9 ++++
 src/mesa/pipe/cell/spu/spu_main.c      | 12 ++++++
 src/mesa/pipe/cell/spu/spu_main.h      |  3 ++
 src/mesa/pipe/cell/spu/spu_tri.c       | 76 ++++++++++++++--------------------
 5 files changed, 56 insertions(+), 46 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index 2d031bfbc6..91a631b699 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -8,6 +8,8 @@ TOP = ../../../../..
 include $(TOP)/configs/linux-cell
 
 
+OPT_FLAGS=-g
+OPT_FLAGS=-O3
 PROG = g3d
 
 PROG_SPU = $(PROG)_spu
diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h
index 56709bd9f3..9977a6ece0 100644
--- a/src/mesa/pipe/cell/spu/spu_colorpack.h
+++ b/src/mesa/pipe/cell/spu/spu_colorpack.h
@@ -57,4 +57,13 @@ spu_pack_B8G8R8A8(vector float rgba)
 }
 
 
+/** Pack float RGBA to a 32-bit pixel using a caller-supplied byte shuffle. */
+static INLINE unsigned int
+spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, shuffle);
+  return spu_extract(out, 0);
+}
+
 #endif /* SPU_COLORPACK_H */
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 8e3987f6ef..ba4d180cc0 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -31,6 +31,7 @@
 
 #include <stdio.h>
 #include <libmisc.h>
+#include <vec_literal.h>
 
 #include "spu_main.h"
 #include "spu_render.h"
@@ -217,6 +218,17 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
       spu.fb.zsize = 2;
    else
       spu.fb.zsize = 0;
+
+   if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM)
+      spu.color_shuffle = VEC_LITERAL(vector unsigned char,
+                                      12, 0, 4, 8, 0, 0, 0, 0, 
+                                      0, 0, 0, 0, 0, 0, 0, 0);
+   else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM)
+      spu.color_shuffle = VEC_LITERAL(vector unsigned char,
+                                      8, 4, 0, 12, 0, 0, 0, 0, 
+                                      0, 0, 0, 0, 0, 0, 0, 0);
+   else
+      ASSERT(0);
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index cce5e70802..7a12715b0b 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -107,6 +107,9 @@ struct spu_global
    ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
    ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
+
+   /** for converting RGBA to PIPE_FORMAT_x colors */
+   vector unsigned char color_shuffle;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index c82ca51000..165e41a781 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -249,28 +249,6 @@ eval_z(float x, float y)
 }
 
 
-static INLINE void
-pack_colors(uint uicolors[4], const float4 fcolors[4])
-{
-   switch (spu.fb.color_format) {
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      uicolors[0] = spu_pack_A8R8G8B8(fcolors[0].v);
-      uicolors[1] = spu_pack_A8R8G8B8(fcolors[1].v);
-      uicolors[2] = spu_pack_A8R8G8B8(fcolors[2].v);
-      uicolors[3] = spu_pack_A8R8G8B8(fcolors[3].v);
-      break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      uicolors[0] = spu_pack_B8G8R8A8(fcolors[0].v);
-      uicolors[1] = spu_pack_B8G8R8A8(fcolors[1].v);
-      uicolors[2] = spu_pack_B8G8R8A8(fcolors[2].v);
-      uicolors[3] = spu_pack_B8G8R8A8(fcolors[3].v);
-      break;
-   default:
-      ASSERT(0);
-   }
-}
-
-
 static INLINE mask_t
 do_depth_test(int x, int y, mask_t quadmask)
 {
@@ -321,38 +299,44 @@ emit_quad( int x, int y, mask_t mask )
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
-      uint colors[4];  /* indexed by QUAD_x */
+
+      if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
+         /* now, _really_ clear the tile */
+         clear_c_tile(&spu.ctile);
+      }
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
 
       if (spu.texture.start) {
+         /* texture mapping */
          float4 texcoords[4];
-         uint i;
          eval_coeff(2, (float) x, (float) y, texcoords);
-         for (i = 0; i < 4; i++) {
-            colors[i] = sample_texture(texcoords[i]);
-         }
+
+         if (spu_extract(mask, 0))
+            spu.ctile.ui[iy][ix] = sample_texture(texcoords[0]);
+         if (spu_extract(mask, 1))
+            spu.ctile.ui[iy][ix+1] = sample_texture(texcoords[1]);
+         if (spu_extract(mask, 2))
+            spu.ctile.ui[iy+1][ix] = sample_texture(texcoords[2]);
+         if (spu_extract(mask, 3))
+            spu.ctile.ui[iy+1][ix+1] = sample_texture(texcoords[3]);
       }
       else {
-         float4 fcolors[4];
-         eval_coeff(1, (float) x, (float) y, fcolors);
-         pack_colors(colors, fcolors);
+         /* simple shading */
+         const vector unsigned char shuffle = spu.color_shuffle;
+         float4 colors[4];
+         eval_coeff(1, (float) x, (float) y, colors);
+
+         if (spu_extract(mask, 0))
+            spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0].v, shuffle);
+         if (spu_extract(mask, 1))
+            spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1].v, shuffle);
+         if (spu_extract(mask, 2))
+            spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2].v, shuffle);
+         if (spu_extract(mask, 3))
+            spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3].v, shuffle);
       }
 
-      if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
-         /* now, _really_ clear the tile */
-         clear_c_tile(&spu.ctile);
-      }
-      spu.cur_ctile_status = TILE_STATUS_DIRTY;
-
-#if 1
-      if (spu_extract(mask, 0))
-         spu.ctile.ui[iy][ix] = colors[QUAD_TOP_LEFT];
-      if (spu_extract(mask, 1))
-         spu.ctile.ui[iy][ix+1] = colors[QUAD_TOP_RIGHT];
-      if (spu_extract(mask, 2))
-         spu.ctile.ui[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
-      if (spu_extract(mask, 3))
-         spu.ctile.ui[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
-#else
+#if 0
       /* SIMD_Z with swizzled color buffer (someday) */
       vector unsigned int uicolors = *((vector unsigned int *) &colors);
       spu.ctile.ui4[iy/2][ix/2] = spu_sel(spu.ctile.ui4[iy/2][ix/2], uicolors, mask);
-- 
cgit v1.2.3


From c90a2dcc294cba738fd65089e1b6ffe1c9b3853e Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 16:42:09 -0700
Subject: Cell: move some tile get/clear code

Also, we weren't marking the ztile as dirty after ztesting, fixes gears glitches.
---
 src/mesa/pipe/cell/spu/spu_tri.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 165e41a781..f0758c42e7 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -257,12 +257,6 @@ do_depth_test(int x, int y, mask_t quadmask)
 
    zvals.v = eval_z((float) x, (float) y);
 
-   if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
-      /* now, _really_ clear the tile */
-      clear_z_tile(&spu.ztile);
-      spu.cur_ztile_status = TILE_STATUS_DIRTY;
-   }
-
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
       int ix = (x - setup.cliprect_minx) / 4;
       int iy = (y - setup.cliprect_miny) / 2;
@@ -273,6 +267,10 @@ do_depth_test(int x, int y, mask_t quadmask)
       int iy = (y - setup.cliprect_miny) / 2;
       mask = spu_z32_test_less(zvals.v, &spu.ztile.ui4[iy][ix], quadmask);
    }
+
+   if (spu_extract(spu_orx(mask), 0))
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
+
    return mask;
 }
 
@@ -300,10 +298,6 @@ emit_quad( int x, int y, mask_t mask )
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
 
-      if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
-         /* now, _really_ clear the tile */
-         clear_c_tile(&spu.ctile);
-      }
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 
       if (spu.texture.start) {
@@ -408,6 +402,18 @@ static void flush_spans( void )
       return;
    }
 
+
+   /* _really_ clear tiles now if needed */
+   if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
+      clear_c_tile(&spu.ctile);
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+   }
+   if (spu.depth_stencil.depth.enabled &&
+       spu.cur_ztile_status == TILE_STATUS_CLEAR) {
+      clear_z_tile(&spu.ztile);
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
+   }
+
    /* XXX this loop could be moved into the above switch cases and
     * calculate_mask() could be simplified a bit...
     */
@@ -831,7 +837,6 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
       wait_on_mask(1 << TAG_READ_TILE_COLOR);
       spu.cur_ctile_status = TILE_STATUS_CLEAN;
    }
-
    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 
    if (spu.depth_stencil.depth.enabled) {
-- 
cgit v1.2.3


From 6522a0531fbb6b6d607969ee6b2c2a85cce8ad2b Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 16:54:46 -0700
Subject: Cell: comment about emit_quad() mask

---
 src/mesa/pipe/cell/spu/spu_tri.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index f0758c42e7..83bb247b22 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -277,6 +277,9 @@ do_depth_test(int x, int y, mask_t quadmask)
 
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
+ * Note: about 1/5 to 1/7 of the time, mask is zero and this function
+ * should be skipped.  But adding the test for that slows things down
+ * overall.
  */
 static INLINE void
 emit_quad( int x, int y, mask_t mask )
-- 
cgit v1.2.3


From 8bbedc3f4b7b281a60286ba573077a6e3e659f63 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Sat, 2 Feb 2008 14:21:06 +1100
Subject: nouveau: implement a bo_set_status()

---
 src/mesa/drivers/dri/nouveau_winsys/nouveau_bo.c   | 133 ++++++++++++---------
 .../drivers/dri/nouveau_winsys/nouveau_drmif.h     |   3 +
 .../dri/nouveau_winsys/nouveau_winsys_pipe.c       |   4 +-
 3 files changed, 80 insertions(+), 60 deletions(-)

diff --git a/src/mesa/drivers/dri/nouveau_winsys/nouveau_bo.c b/src/mesa/drivers/dri/nouveau_winsys/nouveau_bo.c
index 4c235845b7..6887ffa688 100644
--- a/src/mesa/drivers/dri/nouveau_winsys/nouveau_bo.c
+++ b/src/mesa/drivers/dri/nouveau_winsys/nouveau_bo.c
@@ -79,37 +79,6 @@ nouveau_mem_alloc(struct nouveau_device *dev, unsigned size, unsigned align,
 	return 0;
 }
 
-static int
-nouveau_bo_realloc_gpu(struct nouveau_bo_priv *nvbo, uint32_t flags, int size)
-{
-	int ret;
-
-	if (nvbo->drm.size && nvbo->drm.size != size) {
-		nouveau_mem_free(nvbo->base.device, &nvbo->drm, &nvbo->map);
-	}
-
-	if (size && !nvbo->drm.size) {
-		if (flags) {
-			nvbo->drm.flags = 0;
-			if (flags & NOUVEAU_BO_VRAM)
-				nvbo->drm.flags |= NOUVEAU_MEM_FB;
-			if (flags & NOUVEAU_BO_GART)
-				nvbo->drm.flags |= (NOUVEAU_MEM_AGP |
-						    NOUVEAU_MEM_PCI);
-			nvbo->drm.flags |= NOUVEAU_MEM_MAPPED;
-		}
-
-		ret = nouveau_mem_alloc(nvbo->base.device, size,
-					nvbo->drm.alignment, nvbo->drm.flags,
-					&nvbo->drm, &nvbo->map);
-		if (ret) {
-			assert(0);
-		}
-	}
-
-	return 0;
-}
-
 static void
 nouveau_bo_tmp_del(void *priv)
 {
@@ -183,26 +152,17 @@ nouveau_bo_new(struct nouveau_device *dev, uint32_t flags, int align,
 	if (!nvbo)
 		return -ENOMEM;
 	nvbo->base.device = dev;
+	nvbo->base.size = size;
+	nvbo->base.handle = bo_to_ptr(nvbo);
 	nvbo->drm.alignment = align;
+	nvbo->refcount = 1;
 
-	if (flags & NOUVEAU_BO_PIN) {
-		ret = nouveau_bo_realloc_gpu(nvbo, flags, size);
-		if (ret) {
-			free(nvbo);
-			return ret;
-		}	
-	} else {
-		nvbo->sysmem = malloc(size);
-		if (!nvbo->sysmem) {
-			free(nvbo);
-			return -ENOMEM;
-		}
+	ret = nouveau_bo_set_status(&nvbo->base, flags);
+	if (ret) {
+		free(nvbo);
+		return ret;
 	}
 
-	nvbo->base.size = size;
-	nvbo->base.offset = nvbo->drm.offset;
-	nvbo->base.handle = bo_to_ptr(nvbo);
-	nvbo->refcount = 1;
 	*bo = &nvbo->base;
 	return 0;
 }
@@ -261,8 +221,7 @@ nouveau_bo_del(struct nouveau_bo **bo)
 
 	if (nvbo->fence)
 		nouveau_fence_wait(&nvbo->fence);
-
-	nouveau_bo_realloc_gpu(nvbo, 0, 0);
+	nouveau_mem_free(nvbo->base.device, &nvbo->drm, &nvbo->map);
 	if (nvbo->sysmem && !nvbo->user)
 		free(nvbo->sysmem);
 	free(nvbo);
@@ -303,6 +262,85 @@ nouveau_bo_upload(struct nouveau_bo_priv *nvbo)
 	return 0;
 }
 
+/**
+ * Move a buffer object's backing storage to the memory type requested in
+ * flags: VRAM (NOUVEAU_BO_VRAM), GART (NOUVEAU_BO_GART), or otherwise
+ * malloc()ed system memory.  Existing contents are copied into the new
+ * storage and the old storage is released.
+ *
+ * Returns 0 on success (including when the buffer already lives in the
+ * requested memory type), -ENOMEM if system memory cannot be allocated,
+ * or the error code from nouveau_mem_alloc().  On failure the buffer is
+ * left unchanged.
+ */
+int
+nouveau_bo_set_status(struct nouveau_bo *bo, uint32_t flags)
+{
+	struct nouveau_bo_priv *nvbo = nouveau_bo(bo);
+	struct drm_nouveau_mem_alloc new;
+	void *new_map = NULL, *new_sysmem = NULL;
+	unsigned new_flags = 0;
+	int ret;
+
+	assert(!bo->map);
+
+	/* Check current memtype vs requested, if they match do nothing */
+	if ((nvbo->drm.flags & NOUVEAU_MEM_FB) && (flags & NOUVEAU_BO_VRAM))
+		return 0;
+	if ((nvbo->drm.flags & NOUVEAU_MEM_AGP) && (flags & NOUVEAU_BO_GART))
+		return 0;
+	if (nvbo->drm.size == 0 && nvbo->sysmem && (flags & NOUVEAU_BO_LOCAL))
+		return 0;
+
+	memset(&new, 0x00, sizeof(new));
+
+	/* Allocate new memory */
+	if (flags & NOUVEAU_BO_VRAM)
+		new_flags |= NOUVEAU_MEM_FB;
+	else
+	if (flags & NOUVEAU_BO_GART)
+		new_flags |= (NOUVEAU_MEM_AGP | NOUVEAU_MEM_PCI);
+
+	if (new_flags) {
+		ret = nouveau_mem_alloc(bo->device, bo->size,
+					nvbo->drm.alignment, new_flags,
+					&new, &new_map);
+		if (ret)
+			return ret;
+	} else {
+		new_sysmem = malloc(bo->size);
+		if (!new_sysmem)
+			return -ENOMEM;
+	}
+
+	/* Copy old -> new.  The destination is whichever storage was
+	 * allocated above: the GPU mapping or the system memory block.
+	 */
+	/*XXX: use M2MF */
+	if (nvbo->sysmem || nvbo->map) {
+		void *dst = new_map ? new_map : new_sysmem;
+
+		nouveau_bo_map(bo, NOUVEAU_BO_RD);
+		memcpy(dst, bo->map, bo->size);
+		nouveau_bo_unmap(bo);
+	}
+
+	/* Free old memory */
+	if (nvbo->fence)
+		nouveau_fence_wait(&nvbo->fence);
+	nouveau_mem_free(bo->device, &nvbo->drm, &nvbo->map);
+	/* User-supplied memory belongs to the caller; see nouveau_bo_del() */
+	if (nvbo->sysmem && !nvbo->user)
+		free(nvbo->sysmem);
+
+	nvbo->drm = new;
+	nvbo->map = new_map;
+	nvbo->sysmem = new_sysmem;
+	bo->flags = flags;
+	bo->offset = nvbo->drm.offset;
+	return 0;
+}
+
 static int
 nouveau_bo_validate_user(struct nouveau_channel *chan, struct nouveau_bo *bo,
 			 struct nouveau_fence *fence, uint32_t flags)
@@ -335,18 +373,14 @@ nouveau_bo_validate_bo(struct nouveau_channel *chan, struct nouveau_bo *bo,
 		       struct nouveau_fence *fence, uint32_t flags)
 {
 	struct nouveau_bo_priv *nvbo = nouveau_bo(bo);
+	int ret;
 
-	if (!nvbo->drm.size) {
-		nouveau_bo_realloc_gpu(nvbo, flags, nvbo->base.size);
-		nouveau_bo_upload(nvbo);
-		if (!nvbo->user) {
-			free(nvbo->sysmem);
-			nvbo->sysmem = NULL;
-		}
-	} else
-	if (nvbo->user) {
+	ret = nouveau_bo_set_status(bo, flags);
+	if (ret)
+		return ret;
+
+	if (nvbo->user)
 		nouveau_bo_upload(nvbo);
-	}
 
 	nvbo->offset = nvbo->drm.offset;
 	if (nvbo->drm.flags & (NOUVEAU_MEM_AGP | NOUVEAU_MEM_PCI))
diff --git a/src/mesa/drivers/dri/nouveau_winsys/nouveau_drmif.h b/src/mesa/drivers/dri/nouveau_winsys/nouveau_drmif.h
index 7ea4c65465..67e19f1cfe 100644
--- a/src/mesa/drivers/dri/nouveau_winsys/nouveau_drmif.h
+++ b/src/mesa/drivers/dri/nouveau_winsys/nouveau_drmif.h
@@ -274,6 +274,9 @@ nouveau_bo_user(struct nouveau_device *, void *ptr, int size,
 extern int
 nouveau_bo_ref(struct nouveau_device *, uint64_t handle, struct nouveau_bo **);
 
+extern int
+nouveau_bo_set_status(struct nouveau_bo *, uint32_t flags);
+
 extern void
 nouveau_bo_del(struct nouveau_bo **);
 
diff --git a/src/mesa/drivers/dri/nouveau_winsys/nouveau_winsys_pipe.c b/src/mesa/drivers/dri/nouveau_winsys/nouveau_winsys_pipe.c
index f2087aaf9a..7d7fefa801 100644
--- a/src/mesa/drivers/dri/nouveau_winsys/nouveau_winsys_pipe.c
+++ b/src/mesa/drivers/dri/nouveau_winsys/nouveau_winsys_pipe.c
@@ -90,6 +90,7 @@ nouveau_pipe_bo_create(struct pipe_winsys *pws, unsigned alignment,
 	struct nouveau_pipe_winsys *nvpws = (struct nouveau_pipe_winsys *)pws;
 	struct nouveau_device *dev = nvpws->nv->nv_screen->device;
 	struct nouveau_pipe_buffer *nvbuf;
+	uint32_t flags = 0;
 
 	nvbuf = calloc(1, sizeof(*nvbuf));
 	if (!nvbuf)
@@ -99,7 +100,8 @@ nouveau_pipe_bo_create(struct pipe_winsys *pws, unsigned alignment,
 	nvbuf->base.usage = usage;
 	nvbuf->base.size = size;
 
-	if (nouveau_bo_new(dev, NOUVEAU_BO_LOCAL, alignment, size, &nvbuf->bo)) {
+	flags = NOUVEAU_BO_LOCAL;
+	if (nouveau_bo_new(dev, flags, alignment, size, &nvbuf->bo)) {
 		free(nvbuf);
 		return NULL;
 	}
-- 
cgit v1.2.3


From 705022f98c32c44b94411ea13dfe4cbc899f5a77 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Sun, 3 Feb 2008 12:08:31 +1100
Subject: nouveau: avoid relocations where possible.

Potential relocations are emitted as NOPs where they're needed.  In the
event a buffer moves, the pushbuf code will emit the relevant state
changes into the NOPs.

This is just a start; more work is needed to get this looking how I want it to.
---
 .../drivers/dri/nouveau_winsys/nouveau_local.h     |   3 +-
 .../drivers/dri/nouveau_winsys/nouveau_pushbuf.c   |  61 +++++----
 src/mesa/pipe/nouveau/nouveau_bo.h                 |   1 +
 src/mesa/pipe/nouveau/nouveau_push.h               |  13 +-
 src/mesa/pipe/nv40/nv40_fragprog.c                 |   5 +
 src/mesa/pipe/nv40/nv40_fragtex.c                  |   8 +-
 src/mesa/pipe/nv40/nv40_state.c                    |  32 ++++-
 src/mesa/pipe/nv40/nv40_state_emit.c               | 144 ++++++++++++---------
 8 files changed, 170 insertions(+), 97 deletions(-)

diff --git a/src/mesa/drivers/dri/nouveau_winsys/nouveau_local.h b/src/mesa/drivers/dri/nouveau_winsys/nouveau_local.h
index 7a539c81a9..59febca292 100644
--- a/src/mesa/drivers/dri/nouveau_winsys/nouveau_local.h
+++ b/src/mesa/drivers/dri/nouveau_winsys/nouveau_local.h
@@ -61,9 +61,8 @@
 } while(0)
 
 #define OUT_RELOC(buf,data,flags,vor,tor) do {                                 \
-	nouveau_pipe_emit_reloc(nv->channel, nv->channel->pushbuf->cur,        \
+	nouveau_pipe_emit_reloc(nv->channel, nv->channel->pushbuf->cur++,      \
 				   buf, (data), (flags), (vor), (tor));        \
-	OUT_RING(0);                                                           \
 } while(0)
 
 /* Raw data + flags depending on FB/TT buffer */
diff --git a/src/mesa/drivers/dri/nouveau_winsys/nouveau_pushbuf.c b/src/mesa/drivers/dri/nouveau_winsys/nouveau_pushbuf.c
index a34a5c1866..7d5eddb92f 100644
--- a/src/mesa/drivers/dri/nouveau_winsys/nouveau_pushbuf.c
+++ b/src/mesa/drivers/dri/nouveau_winsys/nouveau_pushbuf.c
@@ -96,6 +96,31 @@ nouveau_pushbuf_init(struct nouveau_channel *chan)
 	return 0;
 }
 
+static uint32_t
+nouveau_pushbuf_calc_reloc(struct nouveau_bo *bo,
+			   struct nouveau_pushbuf_reloc *r)
+{
+	uint32_t push;
+
+	if (r->flags & NOUVEAU_BO_LOW) {
+		push = bo->offset + r->data;
+	} else
+	if (r->flags & NOUVEAU_BO_HIGH) {
+		push = (bo->offset + r->data) >> 32;
+	} else {
+		push = r->data;
+	}
+
+	if (r->flags & NOUVEAU_BO_OR) {
+		if (bo->flags & NOUVEAU_BO_VRAM)
+			push |= r->vor;
+		else
+			push |= r->tor;
+	}
+
+	return push;
+}
+
 /* This would be our TTM "superioctl" */
 int
 nouveau_pushbuf_flush(struct nouveau_channel *chan, unsigned min)
@@ -133,34 +158,20 @@ nouveau_pushbuf_flush(struct nouveau_channel *chan, unsigned min)
 
 		if (bo->offset == nouveau_bo(bo)->offset &&
 		    bo->flags == nouveau_bo(bo)->flags) {
-			/*XXX: could avoid reloc in this case, except with the
-			 *     current design we'd confuse the GPU quite a bit
-			 *     if we did this.  Will fix soon.
-			 */
+			while ((r = ptr_to_pbrel(pbbo->relocs))) {
+				pbbo->relocs = r->next;
+				free(r);
+			}
+
+			nvpb->buffers = pbbo->next;
+			free(pbbo);
+			continue;
 		}
 		bo->offset = nouveau_bo(bo)->offset;
 		bo->flags = nouveau_bo(bo)->flags;
 
 		while ((r = ptr_to_pbrel(pbbo->relocs))) {
-			uint32_t push;
-
-			if (r->flags & NOUVEAU_BO_LOW) {
-				push = bo->offset + r->data;
-			} else
-			if (r->flags & NOUVEAU_BO_HIGH) {
-				push = (bo->offset + r->data) >> 32;
-			} else {
-				push = r->data;
-			}
-
-			if (r->flags & NOUVEAU_BO_OR) {
-				if (bo->flags & NOUVEAU_BO_VRAM)
-					push |= r->vor;
-				else
-					push |= r->tor;
-			}
-
-			*r->ptr = push;
+			*r->ptr = nouveau_pushbuf_calc_reloc(bo, r);
 			pbbo->relocs = r->next;
 			free(r);
 		}
@@ -241,6 +252,10 @@ nouveau_pushbuf_emit_reloc(struct nouveau_channel *chan, void *ptr,
 	r->vor = vor;
 	r->tor = tor;
 
+	if (flags & NOUVEAU_BO_DUMMY)
+		*(uint32_t *)ptr = 0;
+	else
+		*(uint32_t *)ptr = nouveau_pushbuf_calc_reloc(bo, r);
 	return 0;
 }
 
diff --git a/src/mesa/pipe/nouveau/nouveau_bo.h b/src/mesa/pipe/nouveau/nouveau_bo.h
index 2b57ee9263..18020e9c65 100644
--- a/src/mesa/pipe/nouveau/nouveau_bo.h
+++ b/src/mesa/pipe/nouveau/nouveau_bo.h
@@ -35,6 +35,7 @@
 #define NOUVEAU_BO_HIGH  (1 << 7)
 #define NOUVEAU_BO_OR    (1 << 8)
 #define NOUVEAU_BO_LOCAL (1 << 9)
+#define NOUVEAU_BO_DUMMY (1 << 31)
 
 struct nouveau_bo {
 	struct nouveau_device *device;
diff --git a/src/mesa/pipe/nouveau/nouveau_push.h b/src/mesa/pipe/nouveau/nouveau_push.h
index 117e3535cf..679472669b 100644
--- a/src/mesa/pipe/nouveau/nouveau_push.h
+++ b/src/mesa/pipe/nouveau/nouveau_push.h
@@ -44,9 +44,8 @@
 #define OUT_RELOC(bo,data,flags,vor,tor) do {                                  \
 	NOUVEAU_PUSH_CONTEXT(pc);                                              \
 	pc->nvws->push_reloc(pc->nvws->channel,                                \
-  		 	     pc->nvws->channel->pushbuf->cur,                  \
+  		 	     pc->nvws->channel->pushbuf->cur++,                \
 			     (bo), (data), (flags), (vor), (tor));             \
-	OUT_RING(0);                                                           \
 } while(0)
 
 /* Raw data + flags depending on FB/TT buffer */
@@ -71,4 +70,14 @@
 	OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_HIGH, 0, 0);             \
 } while(0)
 
+/* A reloc which'll recombine into a NV_DMA_METHOD packet header */
+#define OUT_RELOCm(bo, flags, obj, mthd, size) do {                            \
+	NOUVEAU_PUSH_CONTEXT(pc);                                              \
+	if (pc->nvws->channel->pushbuf->remaining < ((size) + 1))              \
+		pc->nvws->push_flush(pc->nvws->channel, ((size) + 1));         \
+	OUT_RELOCd((bo), (pc->obj->subc << 13) | ((size) << 18) | (mthd),      \
+		   (flags), 0, 0);                                             \
+	pc->nvws->channel->pushbuf->remaining -= ((size) + 1);                 \
+} while(0)
+
 #endif
diff --git a/src/mesa/pipe/nv40/nv40_fragprog.c b/src/mesa/pipe/nv40/nv40_fragprog.c
index 14897f9798..667eb89cb2 100644
--- a/src/mesa/pipe/nv40/nv40_fragprog.c
+++ b/src/mesa/pipe/nv40/nv40_fragprog.c
@@ -815,6 +815,11 @@ nv40_fragprog_bind(struct nv40_context *nv40, struct nv40_fragment_program *fp)
 		fp->on_hw = TRUE;
 	}
 
+	BEGIN_RING(curie, NV40TCL_FP_ADDRESS, 1);
+	OUT_RELOC (fp->buffer, 0, NOUVEAU_BO_VRAM |
+	           NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
+		   NOUVEAU_BO_OR, NV40TCL_FP_ADDRESS_DMA0,
+		   NV40TCL_FP_ADDRESS_DMA1);
 	BEGIN_RING(curie, NV40TCL_FP_CONTROL, 1);
 	OUT_RING  (fp->fp_control);
 
diff --git a/src/mesa/pipe/nv40/nv40_fragtex.c b/src/mesa/pipe/nv40/nv40_fragtex.c
index 48d6eb629f..7c5ecd5c56 100644
--- a/src/mesa/pipe/nv40/nv40_fragtex.c
+++ b/src/mesa/pipe/nv40/nv40_fragtex.c
@@ -104,7 +104,13 @@ nv40_fragtex_build(struct nv40_context *nv40, int unit)
 	nv40->tex[unit].buffer = nv40mt->buffer;
 	nv40->tex[unit].format = txf;
 
-	BEGIN_RING(curie, NV40TCL_TEX_WRAP(unit), 6);
+	BEGIN_RING(curie, NV40TCL_TEX_OFFSET(unit), 8);
+	OUT_RELOCl(nv40->tex[unit].buffer, 0, NOUVEAU_BO_VRAM |
+		   NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RELOCd(nv40->tex[unit].buffer, nv40->tex[unit].format,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
+		   NOUVEAU_BO_OR, NV40TCL_TEX_FORMAT_DMA0,
+		   NV40TCL_TEX_FORMAT_DMA1);
 	OUT_RING  (ps->wrap);
 	OUT_RING  (NV40TCL_TEX_ENABLE_ENABLE | ps->en |
 		   (0x00078000) /* mipmap related? */);
diff --git a/src/mesa/pipe/nv40/nv40_state.c b/src/mesa/pipe/nv40/nv40_state.c
index c619948b55..bb435b106b 100644
--- a/src/mesa/pipe/nv40/nv40_state.c
+++ b/src/mesa/pipe/nv40/nv40_state.c
@@ -603,33 +603,51 @@ nv40_set_framebuffer_state(struct pipe_context *pipe,
 	}
 
 	if (rt_enable & NV40TCL_RT_ENABLE_COLOR0) {
-		BEGIN_RING(curie, NV40TCL_COLOR0_PITCH, 1);
-		OUT_RING  (rt[0]->pitch * rt[0]->cpp);
 		nv40->rt[0] = rt[0]->buffer;
+		BEGIN_RING(curie, NV40TCL_DMA_COLOR0, 1);
+		OUT_RELOCo(nv40->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(curie, NV40TCL_COLOR0_PITCH, 2);
+		OUT_RING  (rt[0]->pitch * rt[0]->cpp);
+		OUT_RELOCl(nv40->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 	}
 
 	if (rt_enable & NV40TCL_RT_ENABLE_COLOR1) {
-		BEGIN_RING(curie, NV40TCL_COLOR1_PITCH, 2);
-		OUT_RING  (rt[1]->pitch * rt[1]->cpp);
 		nv40->rt[1] = rt[1]->buffer;
+		BEGIN_RING(curie, NV40TCL_DMA_COLOR1, 1);
+		OUT_RELOCo(nv40->rt[1], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(curie, NV40TCL_COLOR1_OFFSET, 2);
+		OUT_RELOCl(nv40->rt[1], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		OUT_RING  (rt[1]->pitch * rt[1]->cpp);
 	}
 
 	if (rt_enable & NV40TCL_RT_ENABLE_COLOR2) {
+		nv40->rt[2] = rt[2]->buffer;
+		BEGIN_RING(curie, NV40TCL_DMA_COLOR2, 1);
+		OUT_RELOCo(nv40->rt[2], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(curie, NV40TCL_COLOR2_OFFSET, 1);
+		OUT_RELOCl(nv40->rt[2], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 		BEGIN_RING(curie, NV40TCL_COLOR2_PITCH, 1);
 		OUT_RING  (rt[2]->pitch * rt[2]->cpp);
-		nv40->rt[2] = rt[2]->buffer;
 	}
 
 	if (rt_enable & NV40TCL_RT_ENABLE_COLOR3) {
+		nv40->rt[3] = rt[3]->buffer;
+		BEGIN_RING(curie, NV40TCL_DMA_COLOR3, 1);
+		OUT_RELOCo(nv40->rt[3], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(curie, NV40TCL_COLOR3_OFFSET, 1);
+		OUT_RELOCl(nv40->rt[3], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 		BEGIN_RING(curie, NV40TCL_COLOR3_PITCH, 1);
 		OUT_RING  (rt[3]->pitch * rt[3]->cpp);
-		nv40->rt[3] = rt[3]->buffer;
 	}
 
 	if (zeta_format) {
+		nv40->zeta = zeta->buffer;
+		BEGIN_RING(curie, NV40TCL_DMA_ZETA, 1);
+		OUT_RELOCo(nv40->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(curie, NV40TCL_ZETA_OFFSET, 1);
+		OUT_RELOCl(nv40->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 		BEGIN_RING(curie, NV40TCL_ZETA_PITCH, 1);
 		OUT_RING  (zeta->pitch * zeta->cpp);
-		nv40->zeta = zeta->buffer;
 	}
 
 	nv40->rt_enable = rt_enable;
diff --git a/src/mesa/pipe/nv40/nv40_state_emit.c b/src/mesa/pipe/nv40/nv40_state_emit.c
index c9a7a2e364..66b98d5fab 100644
--- a/src/mesa/pipe/nv40/nv40_state_emit.c
+++ b/src/mesa/pipe/nv40/nv40_state_emit.c
@@ -1,94 +1,114 @@
 #include "nv40_context.h"
 #include "nv40_state.h"
 
-void
-nv40_emit_hw_state(struct nv40_context *nv40)
+/* Emit relocs for every referenced buffer.
+ *
+ * This is to ensure the bufmgr has an accurate idea of how
+ * the buffer is used.  These relocs appear in the push buffer as
+ * NOPs, and will only be turned into state changes if a buffer
+ * actually moves.
+ */
+static void
+nv40_state_emit_dummy_relocs(struct nv40_context *nv40)
 {
-	int i;
-
-	if (nv40->dirty & NV40_NEW_FRAGPROG) {
-		nv40_fragprog_bind(nv40, nv40->fragprog.current);
-		/*XXX: clear NV40_NEW_FRAGPROG if no new program uploaded */
-	}
-
-	if (nv40->dirty_samplers || (nv40->dirty & NV40_NEW_FRAGPROG)) {
-		nv40_fragtex_bind(nv40);
-
-		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
-		OUT_RING  (2);
-		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
-		OUT_RING  (1);
-		nv40->dirty &= ~NV40_NEW_FRAGPROG;
-	}
-
-	if (nv40->dirty & NV40_NEW_VERTPROG) {
-		nv40_vertprog_bind(nv40, nv40->vertprog.current);
-		nv40->dirty &= ~NV40_NEW_VERTPROG;
-	}
-
-	nv40->dirty_samplers = 0;
-
-	/* Emit relocs for every referenced buffer.
-	 * This is to ensure the bufmgr has an accurate idea of how
-	 * the buffer is used.  This isn't very efficient, but we don't
-	 * seem to take a significant performance hit.  Will be improved
-	 * at some point.  Vertex arrays are emitted by nv40_vbo.c
-	 */
+	unsigned rt_flags, tx_flags, fp_flags;
+	int i;	
+	
+	rt_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR | NOUVEAU_BO_DUMMY;
+	tx_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
+		   NOUVEAU_BO_DUMMY;
+	fp_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
+		   NOUVEAU_BO_DUMMY;
 
 	/* Render targets */
 	if (nv40->rt_enable & NV40TCL_RT_ENABLE_COLOR0) {
-		BEGIN_RING(curie, NV40TCL_DMA_COLOR0, 1);
-		OUT_RELOCo(nv40->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(curie, NV40TCL_COLOR0_OFFSET, 1);
-		OUT_RELOCl(nv40->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		OUT_RELOCm(nv40->rt[0], rt_flags,
+			   curie, NV40TCL_DMA_COLOR0, 1);
+		OUT_RELOCo(nv40->rt[0], rt_flags);
+		OUT_RELOCm(nv40->rt[0], rt_flags,
+			   curie, NV40TCL_COLOR0_OFFSET, 1);
+		OUT_RELOCl(nv40->rt[0], 0, rt_flags);
 	}
 
 	if (nv40->rt_enable & NV40TCL_RT_ENABLE_COLOR1) {
-		BEGIN_RING(curie, NV40TCL_DMA_COLOR1, 1);
-		OUT_RELOCo(nv40->rt[1], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(curie, NV40TCL_COLOR1_OFFSET, 1);
-		OUT_RELOCl(nv40->rt[1], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		OUT_RELOCm(nv40->rt[1], rt_flags,
+			   curie, NV40TCL_DMA_COLOR1, 1);
+		OUT_RELOCo(nv40->rt[1], rt_flags);
+		OUT_RELOCm(nv40->rt[1], rt_flags,
+			   curie, NV40TCL_COLOR1_OFFSET, 1);
+		OUT_RELOCl(nv40->rt[1], 0, rt_flags);
 	}
 
 	if (nv40->rt_enable & NV40TCL_RT_ENABLE_COLOR2) {
-		BEGIN_RING(curie, NV40TCL_DMA_COLOR2, 1);
-		OUT_RELOCo(nv40->rt[2], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(curie, NV40TCL_COLOR2_OFFSET, 1);
-		OUT_RELOCl(nv40->rt[2], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		OUT_RELOCm(nv40->rt[2], rt_flags,
+			   curie, NV40TCL_DMA_COLOR2, 1);
+		OUT_RELOCo(nv40->rt[2], rt_flags);
+		OUT_RELOCm(nv40->rt[2], rt_flags,
+			   curie, NV40TCL_COLOR2_OFFSET, 1);
+		OUT_RELOCl(nv40->rt[2], 0, rt_flags);
 	}
 
 	if (nv40->rt_enable & NV40TCL_RT_ENABLE_COLOR3) {
-		BEGIN_RING(curie, NV40TCL_DMA_COLOR3, 1);
-		OUT_RELOCo(nv40->rt[3], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(curie, NV40TCL_COLOR3_OFFSET, 1);
-		OUT_RELOCl(nv40->rt[3], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		OUT_RELOCm(nv40->rt[3], rt_flags,
+			   curie, NV40TCL_DMA_COLOR3, 1);
+		OUT_RELOCo(nv40->rt[3], rt_flags);
+		OUT_RELOCm(nv40->rt[3], rt_flags,
+			   curie, NV40TCL_COLOR3_OFFSET, 1);
+		OUT_RELOCl(nv40->rt[3], 0, rt_flags);
 	}
 
 	if (nv40->zeta) {
-		BEGIN_RING(curie, NV40TCL_DMA_ZETA, 1);
-		OUT_RELOCo(nv40->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(curie, NV40TCL_ZETA_OFFSET, 1);
-		OUT_RELOCl(nv40->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		OUT_RELOCm(nv40->zeta, rt_flags, curie, NV40TCL_DMA_ZETA, 1);
+		OUT_RELOCo(nv40->zeta, rt_flags);
+		OUT_RELOCm(nv40->zeta, rt_flags, curie, NV40TCL_ZETA_OFFSET, 1);
+		OUT_RELOCl(nv40->zeta, 0, rt_flags);
 	}
 
 	/* Texture images */
 	for (i = 0; i < 16; i++) {
 		if (!(nv40->fp_samplers & (1 << i)))
 			continue;
-		BEGIN_RING(curie, NV40TCL_TEX_OFFSET(i), 2);
-		OUT_RELOCl(nv40->tex[i].buffer, 0, NOUVEAU_BO_VRAM |
-			   NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+		OUT_RELOCm(nv40->tex[i].buffer, tx_flags,
+			   curie, NV40TCL_TEX_OFFSET(i), 2);
+		OUT_RELOCl(nv40->tex[i].buffer, 0, tx_flags);
 		OUT_RELOCd(nv40->tex[i].buffer, nv40->tex[i].format,
-			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
-			   NOUVEAU_BO_OR, NV40TCL_TEX_FORMAT_DMA0,
+			   tx_flags | NOUVEAU_BO_OR, NV40TCL_TEX_FORMAT_DMA0,
 			   NV40TCL_TEX_FORMAT_DMA1);
 	}
 
 	/* Fragment program */
-	BEGIN_RING(curie, NV40TCL_FP_ADDRESS, 1);
-	OUT_RELOC (nv40->fragprog.active->buffer, 0, NOUVEAU_BO_VRAM |
-	           NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
-		   NOUVEAU_BO_OR, NV40TCL_FP_ADDRESS_DMA0,
-		   NV40TCL_FP_ADDRESS_DMA1);
+	OUT_RELOCm(nv40->fragprog.active->buffer, fp_flags,
+		   curie, NV40TCL_FP_ADDRESS, 1);
+	OUT_RELOC (nv40->fragprog.active->buffer, 0,
+		   fp_flags | NOUVEAU_BO_OR | NOUVEAU_BO_LOW,
+		   NV40TCL_FP_ADDRESS_DMA0, NV40TCL_FP_ADDRESS_DMA1);
+}
+
+void
+nv40_emit_hw_state(struct nv40_context *nv40)
+{
+	if (nv40->dirty & NV40_NEW_FRAGPROG) {
+		nv40_fragprog_bind(nv40, nv40->fragprog.current);
+		/*XXX: clear NV40_NEW_FRAGPROG if no new program uploaded */
+	}
+
+	if (nv40->dirty_samplers || (nv40->dirty & NV40_NEW_FRAGPROG)) {
+		nv40_fragtex_bind(nv40);
+
+		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
+		OUT_RING  (2);
+		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
+		OUT_RING  (1);
+		nv40->dirty &= ~NV40_NEW_FRAGPROG;
+	}
+
+	if (nv40->dirty & NV40_NEW_VERTPROG) {
+		nv40_vertprog_bind(nv40, nv40->vertprog.current);
+		nv40->dirty &= ~NV40_NEW_VERTPROG;
+	}
+
+	nv40->dirty_samplers = 0;
+
+	nv40_state_emit_dummy_relocs(nv40);
 }
 
-- 
cgit v1.2.3


From b111d266a0edf334e8c315129482005a224cb899 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Mon, 4 Feb 2008 01:37:07 +1100
Subject: nv40: the 0x4497 version of curie doesn't support index buffers.

---
 src/mesa/pipe/nv40/nv40_vbo.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/mesa/pipe/nv40/nv40_vbo.c b/src/mesa/pipe/nv40/nv40_vbo.c
index 3fa8ddcb81..0bb54c7610 100644
--- a/src/mesa/pipe/nv40/nv40_vbo.c
+++ b/src/mesa/pipe/nv40/nv40_vbo.c
@@ -391,12 +391,17 @@ nv40_draw_elements(struct pipe_context *pipe,
 		   struct pipe_buffer *indexBuffer, unsigned indexSize,
 		   unsigned mode, unsigned start, unsigned count)
 {
-	if (indexSize != 1) {
-		nv40_draw_elements_vbo(pipe, indexBuffer, indexSize,
-				       mode, start, count);
-	} else {
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	/* 0x4497 doesn't support real index buffers, and there doesn't appear
+	 * to be support on any chipset for 8-bit indices.
+	 */
+	if (nv40->curie->grclass == NV44TCL || indexSize == 1) {
 		nv40_draw_elements_inline(pipe, indexBuffer, indexSize,
 					  mode, start, count);
+	} else {
+		nv40_draw_elements_vbo(pipe, indexBuffer, indexSize,
+				       mode, start, count);
 	}
 
 	pipe->flush(pipe, 0);
-- 
cgit v1.2.3


From 4605b7df0a98025999169254f08e532027a8a46a Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 08:45:33 -0700
Subject: Cell: insert some draw_flush() calls

---
 src/mesa/pipe/cell/ppu/cell_state_blend.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/mesa/pipe/cell/ppu/cell_state_blend.c b/src/mesa/pipe/cell/ppu/cell_state_blend.c
index 34ae0128ea..2c19aa3971 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_blend.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_blend.c
@@ -29,6 +29,7 @@
  */
 
 #include "pipe/p_util.h"
+#include "pipe/draw/draw_context.h"
 #include "cell_context.h"
 #include "cell_state.h"
 
@@ -49,6 +50,8 @@ cell_bind_blend_state(struct pipe_context *pipe, void *blend)
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    cell->blend = (const struct pipe_blend_state *)blend;
 
    cell->dirty |= CELL_NEW_BLEND;
@@ -68,6 +71,8 @@ cell_set_blend_color(struct pipe_context *pipe,
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    cell->blend_color = *blend_color;
 
    cell->dirty |= CELL_NEW_BLEND;
@@ -93,6 +98,8 @@ cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe,
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    cell->depth_stencil
       = (const struct pipe_depth_stencil_alpha_state *) depth_stencil;
 
-- 
cgit v1.2.3


From a0c35df4a0d1619b2d8593d35456ed50be3b03c1 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 08:46:44 -0700
Subject: Cell: clamp txmax, tymax in tile_bounding_box()

Also, added some (commented-out) debug printfs.
---
 src/mesa/pipe/cell/spu/spu_render.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
index ab711d67fe..e8705eeeba 100644
--- a/src/mesa/pipe/cell/spu/spu_render.c
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -65,6 +65,10 @@ tile_bounding_box(const struct cell_command_render *render,
    *tymin = (uint) render->ymin / TILE_SIZE;
    txmax = (uint) render->xmax / TILE_SIZE;
    tymax = (uint) render->ymax / TILE_SIZE;
+   if (txmax >= spu.fb.width_tiles)
+      txmax = spu.fb.width_tiles-1;
+   if (tymax >= spu.fb.height_tiles)
+      tymax = spu.fb.height_tiles-1;
    *box_width_tiles = txmax - *txmin + 1;
    box_height_tiles = tymax - *tymin + 1;
    *box_num_tiles = *box_width_tiles * box_height_tiles;
@@ -96,12 +100,14 @@ get_cz_tiles(uint tx, uint ty)
 {
    if (spu.depth_stencil.depth.enabled) {
       if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
+         //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
          get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
          spu.cur_ztile_status = TILE_STATUS_GETTING;
       }
    }
 
    if (spu.cur_ctile_status != TILE_STATUS_CLEAR) {
+      //printf("SPU %u: getting C tile %u, %u\n", spu.init.id, tx, ty);
       get_tile(tx, ty, &spu.ctile, TAG_READ_TILE_COLOR, 0);
       spu.cur_ctile_status = TILE_STATUS_GETTING;
    }
@@ -116,22 +122,26 @@ put_cz_tiles(uint tx, uint ty)
 {
    if (spu.cur_ztile_status == TILE_STATUS_DIRTY) {
       /* tile was modified and needs to be written back */
+      //printf("SPU %u: put dirty Z tile %u, %u\n", spu.init.id, tx, ty);
       put_tile(tx, ty, &spu.ztile, TAG_WRITE_TILE_Z, 1);
       spu.cur_ztile_status = TILE_STATUS_DEFINED;
    }
    else if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
       /* tile was never used */
       spu.cur_ztile_status = TILE_STATUS_DEFINED;
+      //printf("SPU %u: put getting Z tile %u, %u\n", spu.init.id, tx, ty);
    }
 
    if (spu.cur_ctile_status == TILE_STATUS_DIRTY) {
       /* tile was modified and needs to be written back */
+      //printf("SPU %u: put dirty C tile %u, %u\n", spu.init.id, tx, ty);
       put_tile(tx, ty, &spu.ctile, TAG_WRITE_TILE_COLOR, 0);
       spu.cur_ctile_status = TILE_STATUS_DEFINED;
    }
    else if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
       /* tile was never used */
       spu.cur_ctile_status = TILE_STATUS_DEFINED;
+      //printf("SPU %u: put getting C tile %u, %u\n", spu.init.id, tx, ty);
    }
 }
 
-- 
cgit v1.2.3


From d83dedc937641de247bebbefad649719f619cdeb Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 08:53:18 -0700
Subject: Cell: move tile clear code to flush_spans()

---
 src/mesa/pipe/cell/spu/spu_tri.c | 51 ++++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 23 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 83bb247b22..3f46e75d7c 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -406,22 +406,44 @@ static void flush_spans( void )
    }
 
 
-   /* _really_ clear tiles now if needed */
-   if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
+   /* OK, we're very likely to need the tile data now.
+    * Clear the tile, or finish waiting for its fetch to complete, if needed.
+    */
+   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
+      /* wait for mfc_get() to complete */
+      //printf("SPU: %u: waiting for ctile\n", spu.init.id);
+      wait_on_mask(1 << TAG_READ_TILE_COLOR);
+      spu.cur_ctile_status = TILE_STATUS_CLEAN;
+   }
+   else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
+      //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
       clear_c_tile(&spu.ctile);
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
    }
-   if (spu.depth_stencil.depth.enabled &&
-       spu.cur_ztile_status == TILE_STATUS_CLEAR) {
-      clear_z_tile(&spu.ztile);
-      spu.cur_ztile_status = TILE_STATUS_DIRTY;
+   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
+
+   if (spu.depth_stencil.depth.enabled) {
+      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
+         /* wait for mfc_get() to complete */
+         //printf("SPU: %u: waiting for ztile\n", spu.init.id);
+         wait_on_mask(1 << TAG_READ_TILE_Z);
+         spu.cur_ztile_status = TILE_STATUS_CLEAN;
+      }
+      else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
+         //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
+         clear_z_tile(&spu.ztile);
+         spu.cur_ztile_status = TILE_STATUS_DIRTY;
+      }
+      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
    }
 
    /* XXX this loop could be moved into the above switch cases and
     * calculate_mask() could be simplified a bit...
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
+#if 1
       emit_quad( x, setup.span.y, calculate_mask( x ) );
+#endif
    }
 
    setup.span.y = 0;
@@ -835,23 +857,6 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 
    /*   init_constant_attribs( setup ); */
       
-   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
-      /* wait for mfc_get() to complete */
-      wait_on_mask(1 << TAG_READ_TILE_COLOR);
-      spu.cur_ctile_status = TILE_STATUS_CLEAN;
-   }
-   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
-
-   if (spu.depth_stencil.depth.enabled) {
-      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
-         /* wait for mfc_get() to complete */
-         wait_on_mask(1 << TAG_READ_TILE_Z);
-         spu.cur_ztile_status = TILE_STATUS_CLEAN;
-      }
-      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
-   }
-
-
    if (setup.oneoverarea < 0.0) {
       /* emaj on left:
        */
-- 
cgit v1.2.3


From f94e0396ed023c21087d1ea1a849b1f8124f9ffb Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 09:54:21 -0700
Subject: Cell: checkpoint: start to SIMD-ize texture sampling

---
 src/mesa/pipe/cell/spu/spu_main.c    | 10 ++++++++++
 src/mesa/pipe/cell/spu/spu_main.h    |  4 ++++
 src/mesa/pipe/cell/spu/spu_texture.c | 17 ++++++++++++++---
 src/mesa/pipe/cell/spu/spu_texture.h |  2 +-
 src/mesa/pipe/cell/spu/spu_tri.c     |  8 ++++----
 5 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index ba4d180cc0..412661061a 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -263,6 +263,16 @@ cmd_state_texture(const struct cell_command_texture *texture)
              spu.init.id, texture->start, texture->width, texture->height);
 
    memcpy(&spu.texture, texture, sizeof(*texture));
+   spu.tex_size = VEC_LITERAL(vector float,
+                              spu.texture.width,
+                              spu.texture.height,
+                              0.0,
+                              0.0);
+   spu.tex_size_mask = VEC_LITERAL(vector unsigned int,
+                                   spu.texture.width - 1,
+                                   spu.texture.height - 1,
+                                   0,
+                                   0);
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 7a12715b0b..02b62ee5cd 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -110,6 +110,10 @@ struct spu_global
 
    /** for converting RGBA to PIPE_FORMAT_x colors */
    vector unsigned char color_shuffle;
+
+   vector float tex_size;
+   vector unsigned int tex_size_mask; /**< == int(size - 1) */
+
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index c1dc6bfe90..1cf958806f 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -128,12 +128,23 @@ get_tex_tile(uint i, uint j)
  * XXX this is extremely primitive for now.
  */
 uint
-sample_texture(float4 texcoord)
+sample_texture(vector float texcoord)
 {
+#if 0
    /* wrap/repeat */
-   uint i = (uint) (texcoord.f[0] * spu.texture.width) % spu.texture.width;
-   uint j = (uint) (texcoord.f[1] * spu.texture.height) % spu.texture.height;
+   uint i = (uint) (spu_extract(texcoord, 0) * spu.texture.width) % spu.texture.width;
+   uint j = (uint) (spu_extract(texcoord, 1) * spu.texture.height) % spu.texture.height;
    uint pos = get_tex_tile(i, j);
    uint texel = tex_tiles[pos].ui[j % TILE_SIZE][i % TILE_SIZE];
    return texel;
+#else
+   vector float tc = spu_mul(texcoord, spu.tex_size);
+   vector unsigned int itc = spu_convtu(tc, 0);
+   itc = spu_and(itc, spu.tex_size_mask);
+   uint i = spu_extract(itc, 0);
+   uint j = spu_extract(itc, 1);
+   uint pos = get_tex_tile(i, j);
+   uint texel = tex_tiles[pos].ui[j % TILE_SIZE][i % TILE_SIZE];
+   return texel;
+#endif
 }
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
index 938a42b549..5bc8e71879 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.h
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -37,7 +37,7 @@ invalidate_tex_cache(void);
 
 
 extern uint
-sample_texture(float4 texcoord);
+sample_texture(vector float texcoord);
 
 
 #endif /* SPU_TEXTURE_H */
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 3f46e75d7c..c148c75dd6 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -309,13 +309,13 @@ emit_quad( int x, int y, mask_t mask )
          eval_coeff(2, (float) x, (float) y, texcoords);
 
          if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = sample_texture(texcoords[0]);
+            spu.ctile.ui[iy][ix] = sample_texture(texcoords[0].v);
          if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = sample_texture(texcoords[1]);
+            spu.ctile.ui[iy][ix+1] = sample_texture(texcoords[1].v);
          if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = sample_texture(texcoords[2]);
+            spu.ctile.ui[iy+1][ix] = sample_texture(texcoords[2].v);
          if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = sample_texture(texcoords[3]);
+            spu.ctile.ui[iy+1][ix+1] = sample_texture(texcoords[3].v);
       }
       else {
          /* simple shading */
-- 
cgit v1.2.3


From 09edd2e29e023b326ba3f6fff671dd1db3ab1eea Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 11:02:47 -0700
Subject: Cell: SIMD-ize more of texture sampling

---
 src/mesa/pipe/cell/spu/spu_texture.c | 66 ++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 37 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 1cf958806f..b52df970d0 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -26,6 +26,8 @@
  **************************************************************************/
 
 
+#include <vec_literal.h>
+
 #include "pipe/p_compiler.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -41,7 +43,7 @@
 
 static tile_t tex_tiles[CACHE_SIZE]  ALIGN16_ATTRIB;
 
-static int tex_tile_x[CACHE_SIZE], tex_tile_y[CACHE_SIZE];
+static vector unsigned int tex_tile_xy[CACHE_SIZE];
 
 
@@ -53,20 +55,19 @@ invalidate_tex_cache(void)
 {
    /* XXX memset? */
    uint i;
-   for (i = 0; i < CACHE_SIZE; i++)
-      tex_tile_x[i] = tex_tile_y[i] = -1;
+   for (i = 0; i < CACHE_SIZE; i++) {
+      tex_tile_xy[i] = VEC_LITERAL(vector unsigned int, ~0U, ~0U, ~0U, ~0U);
+   }
 }
 
 
 /**
- * Return the cache pos/index which corresponds to texel (i,j)
+ * Return the cache pos/index which corresponds to tile (tx,ty)
  */
 static INLINE uint
-cache_pos(uint i, uint j)
+cache_pos(vector unsigned int txty)
 {
-   uint tx = i / TILE_SIZE;
-   uint ty = j / TILE_SIZE;
-   uint pos = (tx + ty * 4) % CACHE_SIZE;
+   uint pos = (spu_extract(txty,0) + spu_extract(txty,1) * 4) % CACHE_SIZE;
    return pos;
 }
 
@@ -76,26 +77,28 @@ cache_pos(uint i, uint j)
  * in the cache.
  */
 static uint
-get_tex_tile(uint i, uint j)
+get_tex_tile(vector unsigned int ij)
 {
-   const int tx = i / TILE_SIZE;
-   const int ty = j / TILE_SIZE;
-   const uint pos = cache_pos(i, j);
+   /* tile address: tx,ty */
+   const vector unsigned int txty = spu_rlmask(ij, -5);  /* divide by 32 */
+   const uint pos = cache_pos(txty);
+
+   if ((spu_extract(tex_tile_xy[pos], 0) != spu_extract(txty, 0)) ||
+       (spu_extract(tex_tile_xy[pos], 1) != spu_extract(txty, 1))) {
 
-   if (tex_tile_x[pos] != tx || tex_tile_y[pos] != ty) {
       /* texture cache miss, fetch tile from main memory */
       const uint tiles_per_row = spu.texture.width / TILE_SIZE;
       const uint bytes_per_tile = sizeof(tile_t);
       const void *src = (const ubyte *) spu.texture.start
-         + (ty * tiles_per_row + tx) * bytes_per_tile;
+         + (spu_extract(txty,1) * tiles_per_row + spu_extract(txty,0)) * bytes_per_tile;
 
       printf("SPU %u: tex cache miss at %d, %d  pos=%u  old=%d,%d\n",
-             spu.init.id, tx, ty, pos,
-             tex_tile_x[pos], tex_tile_y[pos]);
-#if 0
-      printf("SPU %u: get tex tile from %p to %p\n",
-             spu.init.id, src, tex_tiles[pos].t32);
-#endif
+             spu.init.id,
+             spu_extract(txty,0),
+             spu_extract(txty,1),
+             pos,
+             spu_extract(tex_tile_xy[pos],0),
+             spu_extract(tex_tile_xy[pos],1));
 
       ASSERT_ALIGN16(tex_tiles[pos].ui);
       ASSERT_ALIGN16(src);
@@ -109,8 +112,7 @@ get_tex_tile(uint i, uint j)
 
       wait_on_mask(1 << TAG_TEXTURE_TILE);
 
-      tex_tile_x[pos] = tx;
-      tex_tile_y[pos] = ty;
+      tex_tile_xy[pos] = txty;
    }
    else {
 #if 0
@@ -130,21 +132,11 @@ get_tex_tile(uint i, uint j)
 uint
 sample_texture(vector float texcoord)
 {
-#if 0
-   /* wrap/repeat */
-   uint i = (uint) (spu_extract(texcoord, 0) * spu.texture.width) % spu.texture.width;
-   uint j = (uint) (spu_extract(texcoord, 1) * spu.texture.height) % spu.texture.height;
-   uint pos = get_tex_tile(i, j);
-   uint texel = tex_tiles[pos].ui[j % TILE_SIZE][i % TILE_SIZE];
-   return texel;
-#else
    vector float tc = spu_mul(texcoord, spu.tex_size);
-   vector unsigned int itc = spu_convtu(tc, 0);
-   itc = spu_and(itc, spu.tex_size_mask);
-   uint i = spu_extract(itc, 0);
-   uint j = spu_extract(itc, 1);
-   uint pos = get_tex_tile(i, j);
-   uint texel = tex_tiles[pos].ui[j % TILE_SIZE][i % TILE_SIZE];
+   vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
+   itc = spu_and(itc, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   vector unsigned int ij = spu_and(itc, TILE_SIZE-1); /* intra tile addr */
+   uint pos = get_tex_tile(itc);
+   uint texel = tex_tiles[pos].ui[spu_extract(ij, 1)][spu_extract(ij, 0)];
    return texel;
-#endif
 }
-- 
cgit v1.2.3


From 382651a4fafbea0c24e993933cb08a7ba19abcb7 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 12:50:16 -0700
Subject: Cell: added spu_unpack_color(), spu_pack_R8G8B8A8()

---
 src/mesa/pipe/cell/spu/spu_colorpack.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h
index 9977a6ece0..0c93c06562 100644
--- a/src/mesa/pipe/cell/spu/spu_colorpack.h
+++ b/src/mesa/pipe/cell/spu/spu_colorpack.h
@@ -35,6 +35,17 @@
 #include <spu_intrinsics.h>
 
 
+static INLINE unsigned int
+spu_pack_R8G8B8A8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
+					  0, 4, 8, 12, 0, 0, 0, 0, 
+                                          0, 0, 0, 0, 0, 0, 0, 0));
+  return spu_extract(out, 0);
+}
+
+
 static INLINE unsigned int
 spu_pack_A8R8G8B8(vector float rgba)
 {
@@ -66,4 +77,18 @@ spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle)
 }
 
 
+static INLINE vector float
+spu_unpack_color(uint color)
+{
+   vector unsigned int color_u4 = spu_splats(color);
+   color_u4 = spu_shuffle(color_u4, color_u4,
+                          VEC_LITERAL(vector unsigned char,
+                                      0, 0, 0, 0,
+                                      5, 5, 5, 5,
+                                      10, 10, 10, 10,
+                                      15, 15, 15, 15));
+   return spu_convtf(color_u4, 32);
+}
+
+
 #endif /* SPU_COLORPACK_H */
-- 
cgit v1.2.3


From e3ff185eae2f49f4dac92f7e89558ed175251c25 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 12:50:42 -0700
Subject: Cell: implement basic bilinear texture sampler

---
 src/mesa/pipe/cell/spu/spu_texture.c | 67 ++++++++++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_texture.h |  4 +++
 2 files changed, 71 insertions(+)

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index b52df970d0..26a5eefc48 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -32,6 +32,7 @@
 #include "spu_main.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
+#include "spu_colorpack.h"
 
 
 /**
@@ -140,3 +141,69 @@ sample_texture(vector float texcoord)
    uint texel = tex_tiles[pos].ui[spu_extract(ij, 1)][spu_extract(ij, 0)];
    return texel;
 }
+
+
+uint
+sample_texture_bilinear(vector float texcoord)
+{
+   static const vector unsigned int offset10 = {1, 0, 0, 0};
+   static const vector unsigned int offset01 = {0, 1, 0, 0};
+
+   vector float tc = spu_mul(texcoord, spu.tex_size);
+   /* itcST */
+   vector unsigned int itc00 = spu_convtu(tc, 0);  /* convert to int */
+   vector unsigned int itc01 = spu_add(itc00, offset01);
+   vector unsigned int itc10 = spu_add(itc00, offset10);
+   vector unsigned int itc11 = spu_add(itc10, offset01);
+
+   itc00 = spu_and(itc00, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   itc01 = spu_and(itc01, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   itc10 = spu_and(itc10, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   itc11 = spu_and(itc11, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+
+   /* intra tile addr */
+   vector unsigned int ij00 = spu_and(itc00, TILE_SIZE-1);
+   vector unsigned int ij01 = spu_and(itc01, TILE_SIZE-1);
+   vector unsigned int ij10 = spu_and(itc10, TILE_SIZE-1);
+   vector unsigned int ij11 = spu_and(itc11, TILE_SIZE-1);
+
+   uint pos00 = get_tex_tile(itc00);
+   uint pos01 = get_tex_tile(itc01);
+   uint pos10 = get_tex_tile(itc10);
+   uint pos11 = get_tex_tile(itc11);
+
+   vector float texel00 = spu_unpack_color(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
+   vector float texel01 = spu_unpack_color(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
+   vector float texel10 = spu_unpack_color(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
+   vector float texel11 = spu_unpack_color(tex_tiles[pos11].ui[spu_extract(ij11, 1)][spu_extract(ij11, 0)]);
+
+   /* Compute weighting factors in [0,1]
+    * Multiply texcoord by 1024, AND with 1023, convert back to float.
+    */
+   vector float tc1024 = spu_mul(tc, spu_splats(1024.0f));
+   vector signed int itc1024 = spu_convts(tc1024, 0);
+   itc1024 = spu_and(itc1024, spu_splats((1 << 10) - 1));
+   vector float weight = spu_convtf(itc1024, 10);
+
+   /* smeared frac and 1-frac */
+   vector float sfrac = spu_splats(spu_extract(weight, 0));
+   vector float tfrac = spu_splats(spu_extract(weight, 1));
+   vector float sfrac1 = spu_sub(spu_splats(1.0f), sfrac);
+   vector float tfrac1 = spu_sub(spu_splats(1.0f), tfrac);
+
+   /* multiply the samples (colors) by the S/T weights */
+   texel00 = spu_mul(spu_mul(texel00, sfrac1), tfrac1);
+   texel10 = spu_mul(spu_mul(texel10, sfrac ), tfrac1);
+   texel01 = spu_mul(spu_mul(texel01, sfrac1), tfrac );
+   texel11 = spu_mul(spu_mul(texel11, sfrac ), tfrac );
+
+   /* compute sum of weighted samples */
+   vector float texel_sum = spu_add(texel00, texel01);
+   texel_sum = spu_add(texel_sum, texel10);
+   texel_sum = spu_add(texel_sum, texel11);
+
+   /* convert to uint color */
+   uint texel = spu_pack_R8G8B8A8(texel_sum);
+
+   return texel;
+}
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
index 5bc8e71879..25cbe9b3c6 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.h
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -40,4 +40,8 @@ extern uint
 sample_texture(vector float texcoord);
 
 
+extern uint
+sample_texture_bilinear(vector float texcoord);
+
+
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From 1bd182889b3dbb2f4c75d18184e7c76a5bfca248 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 13:16:10 -0700
Subject: Cell: improved bilinear filtering

Avoid the three extra get_tex_tile() calls when all four texels are in the same tile.
---
 src/mesa/pipe/cell/spu/spu_texture.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 26a5eefc48..6e243f7fa3 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -150,16 +150,17 @@ sample_texture_bilinear(vector float texcoord)
    static const vector unsigned int offset01 = {0, 1, 0, 0};
 
    vector float tc = spu_mul(texcoord, spu.tex_size);
-   /* itcST */
+   /* integer texcoords S,T: */
    vector unsigned int itc00 = spu_convtu(tc, 0);  /* convert to int */
    vector unsigned int itc01 = spu_add(itc00, offset01);
    vector unsigned int itc10 = spu_add(itc00, offset10);
    vector unsigned int itc11 = spu_add(itc10, offset01);
 
-   itc00 = spu_and(itc00, spu.tex_size_mask);        /* mask (GL_REPEAT) */
-   itc01 = spu_and(itc01, spu.tex_size_mask);        /* mask (GL_REPEAT) */
-   itc10 = spu_and(itc10, spu.tex_size_mask);        /* mask (GL_REPEAT) */
-   itc11 = spu_and(itc11, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   /* mask (GL_REPEAT) */
+   itc00 = spu_and(itc00, spu.tex_size_mask);
+   itc01 = spu_and(itc01, spu.tex_size_mask);
+   itc10 = spu_and(itc10, spu.tex_size_mask);
+   itc11 = spu_and(itc11, spu.tex_size_mask);
 
    /* intra tile addr */
    vector unsigned int ij00 = spu_and(itc00, TILE_SIZE-1);
@@ -167,11 +168,21 @@ sample_texture_bilinear(vector float texcoord)
    vector unsigned int ij10 = spu_and(itc10, TILE_SIZE-1);
    vector unsigned int ij11 = spu_and(itc11, TILE_SIZE-1);
 
+   /* get tile cache positions */
    uint pos00 = get_tex_tile(itc00);
-   uint pos01 = get_tex_tile(itc01);
-   uint pos10 = get_tex_tile(itc10);
-   uint pos11 = get_tex_tile(itc11);
+   uint pos01, pos10, pos11;
+   if ((spu_extract(ij00, 0) < TILE_SIZE-1) &&
+       (spu_extract(ij00, 1) < TILE_SIZE-1)) {
+      /* all texels are in the same tile */
+      pos01 = pos10 = pos11 = pos00;
+   }
+   else {
+      pos01 = get_tex_tile(itc01);
+      pos10 = get_tex_tile(itc10);
+      pos11 = get_tex_tile(itc11);
+   }
 
+   /* get texels from tiles and convert to float[4] */
    vector float texel00 = spu_unpack_color(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
    vector float texel01 = spu_unpack_color(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
    vector float texel10 = spu_unpack_color(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
-- 
cgit v1.2.3


From 4080fef4732078e8861eb0d26d1f6e43aa27dd9e Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 13:23:07 -0700
Subject: Cell: choose bilinear vs. nearest filtering according to sampler
 state

---
 src/mesa/pipe/cell/spu/spu_main.c    | 4 ++++
 src/mesa/pipe/cell/spu/spu_main.h    | 2 ++
 src/mesa/pipe/cell/spu/spu_texture.c | 2 +-
 src/mesa/pipe/cell/spu/spu_texture.h | 2 +-
 src/mesa/pipe/cell/spu/spu_tri.c     | 8 ++++----
 5 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 412661061a..48e016fc8b 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -252,6 +252,10 @@ cmd_state_sampler(const struct pipe_sampler_state *state)
              spu.init.id);
 
    memcpy(&spu.sampler[0], state, sizeof(*state));
+   if (spu.sampler[0].min_img_filter == PIPE_TEX_FILTER_LINEAR)
+      spu.sample_texture = sample_texture_bilinear;
+   else
+      spu.sample_texture = sample_texture_nearest;
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 02b62ee5cd..fb98b0d889 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -114,6 +114,8 @@ struct spu_global
    vector float tex_size;
    vector unsigned int tex_size_mask; /**< == int(size - 1) */
 
+   uint (*sample_texture)(vector float texcoord);
+
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 6e243f7fa3..ecacf2ec88 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -131,7 +131,7 @@ get_tex_tile(vector unsigned int ij)
  * XXX this is extremely primitive for now.
  */
 uint
-sample_texture(vector float texcoord)
+sample_texture_nearest(vector float texcoord)
 {
    vector float tc = spu_mul(texcoord, spu.tex_size);
    vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
index 25cbe9b3c6..0e000bfebf 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.h
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -37,7 +37,7 @@ invalidate_tex_cache(void);
 
 
 extern uint
-sample_texture(vector float texcoord);
+sample_texture_nearest(vector float texcoord);
 
 
 extern uint
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index c148c75dd6..7b422f71a8 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -309,13 +309,13 @@ emit_quad( int x, int y, mask_t mask )
          eval_coeff(2, (float) x, (float) y, texcoords);
 
          if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = sample_texture(texcoords[0].v);
+            spu.ctile.ui[iy][ix] = spu.sample_texture(texcoords[0].v);
          if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = sample_texture(texcoords[1].v);
+            spu.ctile.ui[iy][ix+1] = spu.sample_texture(texcoords[1].v);
          if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = sample_texture(texcoords[2].v);
+            spu.ctile.ui[iy+1][ix] = spu.sample_texture(texcoords[2].v);
          if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = sample_texture(texcoords[3].v);
+            spu.ctile.ui[iy+1][ix+1] = spu.sample_texture(texcoords[3].v);
       }
       else {
          /* simple shading */
-- 
cgit v1.2.3


From ab9b705c67d0d8c40949ac7e697a8b4ede666c50 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 15:06:10 -0700
Subject: Cell: emit blend state to SPUs

---
 src/mesa/pipe/cell/common.h              |  3 ++-
 src/mesa/pipe/cell/ppu/cell_state_emit.c |  6 ++++++
 src/mesa/pipe/cell/spu/spu_main.c        | 17 +++++++++++++++++
 src/mesa/pipe/cell/spu/spu_main.h        |  1 +
 4 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 7e193f31be..d861e82d33 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -85,7 +85,8 @@
 #define CELL_CMD_STATE_VERTEX_INFO   14
 #define CELL_CMD_STATE_VIEWPORT      15
 #define CELL_CMD_STATE_VS_ARRAY_INFO 16
-#define CELL_CMD_VS_EXECUTE          17
+#define CELL_CMD_STATE_BLEND         17
+#define CELL_CMD_VS_EXECUTE          18
 
 
 #define CELL_NUM_BUFFERS 4
diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index 702184416b..3b2670f786 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -61,6 +61,12 @@ cell_emit_state(struct cell_context *cell)
       fb->height = cell->framebuffer.cbufs[0]->height;
    }
 
+   if (cell->dirty & CELL_NEW_BLEND) {
+      emit_state_cmd(cell, CELL_CMD_STATE_BLEND,
+                     cell->blend,
+                     sizeof(struct pipe_blend_state));
+   }
+
    if (cell->dirty & CELL_NEW_DEPTH_STENCIL) {
       emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL,
                      cell->depth_stencil,
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 48e016fc8b..9d8e6df0e3 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -232,6 +232,18 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 }
 
 
+static void
+cmd_state_blend(const struct pipe_blend_state *state)
+{
+   if (Debug)
+      printf("SPU %u: BLEND: ztest %d\n",
+             spu.init.id,
+             state->blend_enable);
+
+   memcpy(&spu.blend, state, sizeof(*state));
+}
+
+
 static void
 cmd_state_depth_stencil(const struct pipe_depth_stencil_alpha_state *state)
 {
@@ -398,6 +410,11 @@ cmd_batch(uint opcode)
          cmd_finish();
          pos += 1;
          break;
+      case CELL_CMD_STATE_BLEND:
+         cmd_state_blend((struct pipe_blend_state *)
+                                 &buffer[pos+1]);
+         pos += (1 + sizeof(struct pipe_blend_state) / 4);
+         break;
       case CELL_CMD_STATE_DEPTH_STENCIL:
          cmd_state_depth_stencil((struct pipe_depth_stencil_alpha_state *)
                                  &buffer[pos+1]);
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index fb98b0d889..b22d563551 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -86,6 +86,7 @@ struct spu_global
    struct cell_init_info init;
 
    struct spu_framebuffer fb;
+   struct pipe_blend_state blend_stencil;
    struct pipe_depth_stencil_alpha_state depth_stencil;
    struct pipe_blend_state blend;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
-- 
cgit v1.2.3


From b4f5575add6ad4c8b0f960e10641f361dad74606 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 15:10:35 -0700
Subject: Cell: replace float4 with vector float in eval_coeff()

---
 src/mesa/pipe/cell/spu/spu_tri.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 7b422f71a8..199afa1aa6 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -32,6 +32,7 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
+#include "spu_blend.h"
 #include "spu_colorpack.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -206,14 +207,14 @@ clip_emit_quad(struct setup_stage *setup)
  * Eg: four colors will be compute.
  */
 static INLINE void
-eval_coeff(uint slot, float x, float y, float4 result[4])
+eval_coeff(uint slot, float x, float y, vector float result[4])
 {
    switch (spu.vertex_info.interp_mode[slot]) {
    case INTERP_CONSTANT:
       result[QUAD_TOP_LEFT] =
       result[QUAD_TOP_RIGHT] =
       result[QUAD_BOTTOM_LEFT] =
-      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0;
+      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
       break;
 
    case INTERP_LINEAR:
@@ -227,10 +228,10 @@ eval_coeff(uint slot, float x, float y, float4 result[4])
                       spu_add(spu_mul(spu_splats(x), dadx),
                               spu_mul(spu_splats(y), dady)));
 
-         result[QUAD_TOP_LEFT].v = topLeft;
-         result[QUAD_TOP_RIGHT].v = spu_add(topLeft, dadx);
-         result[QUAD_BOTTOM_LEFT].v = spu_add(topLeft, dady);
-         result[QUAD_BOTTOM_RIGHT].v = spu_add(spu_add(topLeft, dadx), dady);
+         result[QUAD_TOP_LEFT] = topLeft;
+         result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
+         result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
+         result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
       }
    }
 }
@@ -305,32 +306,32 @@ emit_quad( int x, int y, mask_t mask )
 
       if (spu.texture.start) {
          /* texture mapping */
-         float4 texcoords[4];
+         vector float texcoords[4];
          eval_coeff(2, (float) x, (float) y, texcoords);
 
          if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = spu.sample_texture(texcoords[0].v);
+            spu.ctile.ui[iy][ix] = spu.sample_texture(texcoords[0]);
          if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = spu.sample_texture(texcoords[1].v);
+            spu.ctile.ui[iy][ix+1] = spu.sample_texture(texcoords[1]);
          if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = spu.sample_texture(texcoords[2].v);
+            spu.ctile.ui[iy+1][ix] = spu.sample_texture(texcoords[2]);
          if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = spu.sample_texture(texcoords[3].v);
+            spu.ctile.ui[iy+1][ix+1] = spu.sample_texture(texcoords[3]);
       }
       else {
          /* simple shading */
          const vector unsigned char shuffle = spu.color_shuffle;
-         float4 colors[4];
+         vector float colors[4];
          eval_coeff(1, (float) x, (float) y, colors);
 
          if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0].v, shuffle);
+            spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle);
          if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1].v, shuffle);
+            spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1], shuffle);
          if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2].v, shuffle);
+            spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2], shuffle);
          if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3].v, shuffle);
+            spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3], shuffle);
       }
 
 #if 0
-- 
cgit v1.2.3


From 21461014b2446208fefae0aabe8232c66d5b3057 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 15:17:50 -0700
Subject: Cell: some basic blending code

---
 src/mesa/pipe/cell/spu/Makefile    |  1 +
 src/mesa/pipe/cell/spu/spu_blend.c | 62 ++++++++++++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_blend.h | 37 +++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_tri.c   |  5 +++
 4 files changed, 105 insertions(+)
 create mode 100644 src/mesa/pipe/cell/spu/spu_blend.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_blend.h

diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index 91a631b699..66f16cde9b 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -19,6 +19,7 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 SOURCES = \
 	spu_main.c \
+	spu_blend.c \
 	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
diff --git a/src/mesa/pipe/cell/spu/spu_blend.c b/src/mesa/pipe/cell/spu/spu_blend.c
new file mode 100644
index 0000000000..23ec0eeb45
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_blend.c
@@ -0,0 +1,62 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "spu_main.h"
+#include "spu_blend.h"
+#include "spu_colorpack.h"
+
+
+void
+blend_quad(uint itx, uint ity, vector float colors[4])
+{
+   /* simple SRC_ALPHA, ONE_MINUS_SRC_ALPHA blending */
+   vector float fbc00 = spu_unpack_color(spu.ctile.ui[ity][itx]);
+   vector float fbc01 = spu_unpack_color(spu.ctile.ui[ity][itx+1]);
+   vector float fbc10 = spu_unpack_color(spu.ctile.ui[ity+1][itx]);
+   vector float fbc11 = spu_unpack_color(spu.ctile.ui[ity+1][itx+1]);
+
+   vector float alpha00 = spu_splats(spu_extract(colors[0], 3));
+   vector float alpha01 = spu_splats(spu_extract(colors[1], 3));
+   vector float alpha10 = spu_splats(spu_extract(colors[2], 3));
+   vector float alpha11 = spu_splats(spu_extract(colors[3], 3));
+
+   vector float one_minus_alpha00 = spu_sub(spu_splats(1.0f), alpha00);
+   vector float one_minus_alpha01 = spu_sub(spu_splats(1.0f), alpha01);
+   vector float one_minus_alpha10 = spu_sub(spu_splats(1.0f), alpha10);
+   vector float one_minus_alpha11 = spu_sub(spu_splats(1.0f), alpha11);
+
+   colors[0] = spu_add(spu_mul(colors[0], alpha00),
+                       spu_mul(fbc00, one_minus_alpha00));
+   colors[1] = spu_add(spu_mul(colors[1], alpha01),
+                       spu_mul(fbc01, one_minus_alpha01));
+   colors[2] = spu_add(spu_mul(colors[2], alpha10),
+                       spu_mul(fbc10, one_minus_alpha10));
+   colors[3] = spu_add(spu_mul(colors[3], alpha11),
+                       spu_mul(fbc11, one_minus_alpha11));
+}
+
diff --git a/src/mesa/pipe/cell/spu/spu_blend.h b/src/mesa/pipe/cell/spu/spu_blend.h
new file mode 100644
index 0000000000..2b594b578b
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_blend.h
@@ -0,0 +1,37 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_BLEND_H
+#define SPU_BLEND_H
+
+
+extern void
+blend_quad(uint itx, uint ity, vector float colors[4]);
+
+
+#endif /* SPU_BLEND_H */
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 199afa1aa6..89aaca9a72 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -324,6 +324,11 @@ emit_quad( int x, int y, mask_t mask )
          vector float colors[4];
          eval_coeff(1, (float) x, (float) y, colors);
 
+#if 0
+         if (spu.blend.blend_enable)
+            blend_quad(ix % TILE_SIZE, iy % TILE_SIZE, colors);
+#endif
+
          if (spu_extract(mask, 0))
             spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle);
          if (spu_extract(mask, 1))
-- 
cgit v1.2.3


From 2f8268aa02949828b8b14c252e2cc4e8f61c5f4e Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 16:04:50 -0700
Subject: Cell: fix typo

---
 src/mesa/pipe/cell/spu/spu_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 9d8e6df0e3..b0311db1aa 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -236,7 +236,7 @@ static void
 cmd_state_blend(const struct pipe_blend_state *state)
 {
    if (Debug)
-      printf("SPU %u: BLEND: ztest %d\n",
+      printf("SPU %u: BLEND: enabled %d\n",
              spu.init.id,
              state->blend_enable);
 
-- 
cgit v1.2.3


From 71e6cd0b66be784aa3feb86101b7a62d17735f56 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 16:05:13 -0700
Subject: Cell: added spu_unpack_A8R8G8B8()

---
 src/mesa/pipe/cell/spu/spu_colorpack.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h
index 0c93c06562..57ea3525c2 100644
--- a/src/mesa/pipe/cell/spu/spu_colorpack.h
+++ b/src/mesa/pipe/cell/spu/spu_colorpack.h
@@ -91,4 +91,19 @@ spu_unpack_color(uint color)
 }
 
 
+static INLINE vector float
+spu_unpack_A8R8G8B8(uint color)
+{
+   vector unsigned int color_u4 = spu_splats(color);
+   color_u4 = spu_shuffle(color_u4, color_u4,
+                          VEC_LITERAL(vector unsigned char,
+                                      5, 5, 5, 5,
+                                      10, 10, 10, 10,
+                                      15, 15, 15, 15,
+                                      0, 0, 0, 0));
+
+   return spu_convtf(color_u4, 32);
+}
+
+
 #endif /* SPU_COLORPACK_H */
-- 
cgit v1.2.3


From 790eec9666ae6cc37ce4ba54cceff97e9eeb5ce3 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 16:06:51 -0700
Subject: Cell: texture sampler functions always return vector float now

Texture colors look the same now, regardless of X display/pixel format
---
 src/mesa/pipe/cell/spu/spu_main.h    |  2 +-
 src/mesa/pipe/cell/spu/spu_texture.c | 19 ++++++++-----------
 src/mesa/pipe/cell/spu/spu_texture.h |  4 ++--
 src/mesa/pipe/cell/spu/spu_tri.c     | 36 ++++++++++++++++++------------------
 4 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index b22d563551..cfd4d72729 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -115,7 +115,7 @@ struct spu_global
    vector float tex_size;
    vector unsigned int tex_size_mask; /**< == int(size - 1) */
 
-   uint (*sample_texture)(vector float texcoord);
+   vector float (*sample_texture)(vector float texcoord);
 
 } ALIGN16_ATTRIB;
 
diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index ecacf2ec88..9ee2b45e24 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -130,7 +130,7 @@ get_tex_tile(vector unsigned int ij)
  * Get texture sample at texcoord.
  * XXX this is extremely primitive for now.
  */
-uint
+vector float
 sample_texture_nearest(vector float texcoord)
 {
    vector float tc = spu_mul(texcoord, spu.tex_size);
@@ -139,11 +139,11 @@ sample_texture_nearest(vector float texcoord)
    vector unsigned int ij = spu_and(itc, TILE_SIZE-1); /* intra tile addr */
    uint pos = get_tex_tile(itc);
    uint texel = tex_tiles[pos].ui[spu_extract(ij, 1)][spu_extract(ij, 0)];
-   return texel;
+   return spu_unpack_A8R8G8B8(texel);
 }
 
 
-uint
+vector float
 sample_texture_bilinear(vector float texcoord)
 {
    static const vector unsigned int offset10 = {1, 0, 0, 0};
@@ -183,10 +183,10 @@ sample_texture_bilinear(vector float texcoord)
    }
 
    /* get texels from tiles and convert to float[4] */
-   vector float texel00 = spu_unpack_color(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
-   vector float texel01 = spu_unpack_color(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
-   vector float texel10 = spu_unpack_color(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
-   vector float texel11 = spu_unpack_color(tex_tiles[pos11].ui[spu_extract(ij11, 1)][spu_extract(ij11, 0)]);
+   vector float texel00 = spu_unpack_A8R8G8B8(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
+   vector float texel01 = spu_unpack_A8R8G8B8(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
+   vector float texel10 = spu_unpack_A8R8G8B8(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
+   vector float texel11 = spu_unpack_A8R8G8B8(tex_tiles[pos11].ui[spu_extract(ij11, 1)][spu_extract(ij11, 0)]);
 
    /* Compute weighting factors in [0,1]
     * Multiply texcoord by 1024, AND with 1023, convert back to float.
@@ -213,8 +213,5 @@ sample_texture_bilinear(vector float texcoord)
    texel_sum = spu_add(texel_sum, texel10);
    texel_sum = spu_add(texel_sum, texel11);
 
-   /* convert to uint color */
-   uint texel = spu_pack_R8G8B8A8(texel_sum);
-
-   return texel;
+   return texel_sum;
 }
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
index 0e000bfebf..95eb87080f 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.h
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -36,11 +36,11 @@ extern void
 invalidate_tex_cache(void);
 
 
-extern uint
+extern vector float
 sample_texture_nearest(vector float texcoord);
 
 
-extern uint
+extern vector float
 sample_texture_bilinear(vector float texcoord);
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 89aaca9a72..4c6de56eda 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -301,6 +301,8 @@ emit_quad( int x, int y, mask_t mask )
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
+      const vector unsigned char shuffle = spu.color_shuffle;
+      vector float colors[4];
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 
@@ -310,34 +312,32 @@ emit_quad( int x, int y, mask_t mask )
          eval_coeff(2, (float) x, (float) y, texcoords);
 
          if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = spu.sample_texture(texcoords[0]);
+            colors[0] = spu.sample_texture(texcoords[0]);
          if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = spu.sample_texture(texcoords[1]);
+            colors[1] = spu.sample_texture(texcoords[1]);
          if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = spu.sample_texture(texcoords[2]);
+            colors[2] = spu.sample_texture(texcoords[2]);
          if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = spu.sample_texture(texcoords[3]);
+            colors[3] = spu.sample_texture(texcoords[3]);
       }
       else {
          /* simple shading */
-         const vector unsigned char shuffle = spu.color_shuffle;
-         vector float colors[4];
          eval_coeff(1, (float) x, (float) y, colors);
+      }
 
-#if 0
-         if (spu.blend.blend_enable)
-            blend_quad(ix % TILE_SIZE, iy % TILE_SIZE, colors);
+#if 1
+      if (spu.blend.blend_enable)
+         blend_quad(ix % TILE_SIZE, iy % TILE_SIZE, colors);
 #endif
 
-         if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle);
-         if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1], shuffle);
-         if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2], shuffle);
-         if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3], shuffle);
-      }
+      if (spu_extract(mask, 0))
+         spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle);
+      if (spu_extract(mask, 1))
+         spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1], shuffle);
+      if (spu_extract(mask, 2))
+         spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2], shuffle);
+      if (spu_extract(mask, 3))
+         spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3], shuffle);
 
 #if 0
       /* SIMD_Z with swizzled color buffer (someday) */
-- 
cgit v1.2.3


From d17e3362592c58f0d5d47745fd97b3b31d1a684a Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 16:48:00 -0700
Subject: Cell: fix small sampling error in sample_texture_bilinear()

---
 src/mesa/pipe/cell/spu/spu_texture.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 9ee2b45e24..01ff33a857 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -150,6 +150,8 @@ sample_texture_bilinear(vector float texcoord)
    static const vector unsigned int offset01 = {0, 1, 0, 0};
 
    vector float tc = spu_mul(texcoord, spu.tex_size);
+   tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
+
    /* integer texcoords S,T: */
    vector unsigned int itc00 = spu_convtu(tc, 0);  /* convert to int */
    vector unsigned int itc01 = spu_add(itc00, offset01);
-- 
cgit v1.2.3


From 4540e01978280389ed219aa0a4b4f39db280961f Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 16:48:36 -0700
Subject: Cell: move float4 typedef (temporary datatype)

---
 src/mesa/pipe/cell/spu/spu_main.h | 7 -------
 src/mesa/pipe/cell/spu/spu_tri.c  | 6 ++++++
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index cfd4d72729..1710a17512 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -41,13 +41,6 @@
 #define MAX_HEIGHT 1024
 
 
-typedef union
-{
-   vector float v;
-   float f[4];
-} float4;
-
-
 typedef union {
    ushort us[TILE_SIZE][TILE_SIZE];
    uint   ui[TILE_SIZE][TILE_SIZE];
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 4c6de56eda..688c8646ab 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -45,6 +45,12 @@
 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
 typedef vector unsigned int mask_t;
 
+typedef union
+{
+   vector float v;
+   float f[4];
+} float4;
+
 
 /**
  * Simplified types taken from other parts of Gallium
-- 
cgit v1.2.3


From f603652c30c40f7f7948fbdc79a3479016d8073f Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 18:02:21 -0700
Subject: Cell: don't use VEC_LITERAL macro, doesn't work w/ SDK 3.0

---
 src/mesa/pipe/cell/spu/spu_colorpack.h | 41 +++++++++++++++++-----------------
 src/mesa/pipe/cell/spu/spu_ztest.h     | 24 ++++++++++----------
 2 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h
index 57ea3525c2..e9fee8a3a6 100644
--- a/src/mesa/pipe/cell/spu/spu_colorpack.h
+++ b/src/mesa/pipe/cell/spu/spu_colorpack.h
@@ -31,7 +31,6 @@
 #define SPU_COLORPACK_H
 
 
-#include <vec_literal.h>
 #include <spu_intrinsics.h>
 
 
@@ -39,9 +38,11 @@ static INLINE unsigned int
 spu_pack_R8G8B8A8(vector float rgba)
 {
   vector unsigned int out = spu_convtu(rgba, 32);
-  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
-					  0, 4, 8, 12, 0, 0, 0, 0, 
-                                          0, 0, 0, 0, 0, 0, 0, 0));
+
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  0, 4, 8, 12, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0 }) );
+
   return spu_extract(out, 0);
 }
 
@@ -50,9 +51,9 @@ static INLINE unsigned int
 spu_pack_A8R8G8B8(vector float rgba)
 {
   vector unsigned int out = spu_convtu(rgba, 32);
-  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
-					  12, 0, 4, 8, 0, 0, 0, 0, 
-                                          0, 0, 0, 0, 0, 0, 0, 0));
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  12, 0, 4, 8, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0}) );
   return spu_extract(out, 0);
 }
 
@@ -61,9 +62,9 @@ static INLINE unsigned int
 spu_pack_B8G8R8A8(vector float rgba)
 {
   vector unsigned int out = spu_convtu(rgba, 32);
-  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
-					  8, 4, 0, 12, 0, 0, 0, 0, 
-                                          0, 0, 0, 0, 0, 0, 0, 0));
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  8, 4, 0, 12, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0}) );
   return spu_extract(out, 0);
 }
 
@@ -82,11 +83,11 @@ spu_unpack_color(uint color)
 {
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
-                          VEC_LITERAL(vector unsigned char,
-                                      0, 0, 0, 0,
-                                      5, 5, 5, 5,
-                                      10, 10, 10, 10,
-                                      15, 15, 15, 15));
+                          ((vector unsigned char) {
+                             0, 0, 0, 0,
+                             5, 5, 5, 5,
+                             10, 10, 10, 10,
+                             15, 15, 15, 15}) );
    return spu_convtf(color_u4, 32);
 }
 
@@ -96,11 +97,11 @@ spu_unpack_A8R8G8B8(uint color)
 {
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
-                          VEC_LITERAL(vector unsigned char,
-                                      5, 5, 5, 5,
-                                      10, 10, 10, 10,
-                                      15, 15, 15, 15,
-                                      0, 0, 0, 0));
+                          ((vector unsigned char) {
+                             5, 5, 5, 5,
+                             10, 10, 10, 10,
+                             15, 15, 15, 15,
+                             0, 0, 0, 0}) );
 
    return spu_convtf(color_u4, 32);
 }
diff --git a/src/mesa/pipe/cell/spu/spu_ztest.h b/src/mesa/pipe/cell/spu/spu_ztest.h
index 5fefb15176..ce8ad00339 100644
--- a/src/mesa/pipe/cell/spu/spu_ztest.h
+++ b/src/mesa/pipe/cell/spu/spu_ztest.h
@@ -68,9 +68,9 @@ spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
       /* gather lower four ushorts */
       zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
                              (vector unsigned int) *zbuf,
-                             VEC_LITERAL(vector unsigned char,
-                                      ZERO, ZERO,  8,  9, ZERO, ZERO, 10, 11,
-                                      ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15));
+                             ((vector unsigned char) {
+                                ZERO, ZERO,  8,  9, ZERO, ZERO, 10, 11,
+                                ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15}));
       /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
       mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
       /* mask &= inMask */
@@ -80,18 +80,18 @@ spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
       /* convert zbuffer values from uints back to ushorts, preserve lower 4 */
       *zbuf = (vector unsigned short)
          spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
-                     VEC_LITERAL(vector unsigned char,
-                                 16, 17, 18, 19, 20, 21, 22, 23,
-                                 2, 3, 6, 7, 10, 11, 14, 15));
+                     ((vector unsigned char) {
+                        16, 17, 18, 19, 20, 21, 22, 23,
+                        2, 3, 6, 7, 10, 11, 14, 15}));
    }
    else {
       /* convert zbuffer values from ushorts to uints */
       /* gather upper four ushorts */
       zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
                              (vector unsigned int) *zbuf,
-                             VEC_LITERAL(vector unsigned char,
-                                         ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
-                                         ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7));
+                             ((vector unsigned char) {
+                                ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
+                                ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7}));
       /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
       mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
       /* mask &= inMask */
@@ -101,9 +101,9 @@ spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
       /* convert zbuffer values from uints back to ushorts, preserve upper 4 */
       *zbuf = (vector unsigned short)
          spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
-                     VEC_LITERAL(vector unsigned char,
-                                 2, 3, 6, 7, 10, 11, 14, 15,
-                                 24, 25, 26, 27, 28, 29, 30, 31));
+                     ((vector unsigned char) {
+                        2, 3, 6, 7, 10, 11, 14, 15,
+                        24, 25, 26, 27, 28, 29, 30, 31}));
    }
    return mask;
 #undef ZERO
-- 
cgit v1.2.3


From 6a3f1ea91d3d8e4c47144cda422db9db761be94d Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 18:03:05 -0700
Subject: Cell: don't use VEC_LITERAL macro, doesn't work w/ SDK 3.0

---
 src/mesa/pipe/cell/spu/spu_texture.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 01ff33a857..3962aaa4a9 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -26,8 +26,6 @@
  **************************************************************************/
 
 
-#include <vec_literal.h>
-
 #include "pipe/p_compiler.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -57,7 +55,7 @@ invalidate_tex_cache(void)
    /* XXX memset? */
    uint i;
    for (i = 0; i < CACHE_SIZE; i++) {
-      tex_tile_xy[i] = VEC_LITERAL(vector unsigned int, ~0U, ~0U, ~0U, ~0U);
+      tex_tile_xy[i] = ((vector unsigned int) { ~0U, ~0U, ~0U, ~0U });
    }
 }
 
-- 
cgit v1.2.3


From 535abe4037920960b37a23392142cc556d4cbcc4 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 18:05:37 -0700
Subject: Cell: fix some alignment issues by aligning commands to 8-byte
 boundaries

Contributed by Ian Romanick.
Also, temporarily disable inlined vertex buffers.  They need to be 16-byte
aligned...
---
 src/mesa/pipe/cell/common.h                 | 16 ++++----
 src/mesa/pipe/cell/ppu/cell_batch.c         |  4 +-
 src/mesa/pipe/cell/ppu/cell_flush.c         |  2 +-
 src/mesa/pipe/cell/ppu/cell_state_emit.c    |  3 +-
 src/mesa/pipe/cell/ppu/cell_vbuf.c          |  4 +-
 src/mesa/pipe/cell/ppu/cell_vertex_shader.c | 22 ++++++-----
 src/mesa/pipe/cell/spu/spu_main.c           | 58 +++++++++++++----------------
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c   |  7 ++--
 src/mesa/pipe/cell/spu/spu_vertex_shader.h  |  2 +-
 9 files changed, 57 insertions(+), 61 deletions(-)

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index d861e82d33..cf8fc94ebf 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -57,6 +57,9 @@
 /** round up value to next multiple of 4 */
 #define ROUNDUP4(k)  (((k) + 0x3) & ~0x3)
 
+/** round up value to next multiple of 8 */
+#define ROUNDUP8(k)  (((k) + 0x7) & ~0x7)
+
 /** round up value to next multiple of 16 */
 #define ROUNDUP16(k)  (((k) + 0xf) & ~0xf)
 
@@ -102,7 +105,7 @@
  */
 struct cell_command_framebuffer
 {
-   uint opcode;
+   uint64_t opcode;
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
@@ -114,7 +117,7 @@ struct cell_command_framebuffer
  */
 struct cell_command_clear_surface
 {
-   uint opcode;
+   uint64_t opcode;
    uint surface; /**< Temporary: 0=color, 1=Z */
    uint value;
 };
@@ -125,8 +128,7 @@ struct cell_command_clear_surface
  */
 struct cell_array_info
 {
-    uint opcode;
-    uint base;          /**< Base address of the 0th element. */
+    uint64_t base;          /**< Base address of the 0th element. */
     uint attr;          /**< Attribute that this state if for. */
     uint pitch;         /**< Byte pitch from one entry to the next. */
     uint format;        /**< Pipe format of each entry. */
@@ -150,7 +152,7 @@ struct cell_shader_info
 #define SPU_VERTS_PER_BATCH 64
 struct cell_command_vs
 {
-   uint opcode;       /**< CELL_CMD_VS_EXECUTE */
+   uint64_t opcode;       /**< CELL_CMD_VS_EXECUTE */
    struct cell_shader_info   shader;
    unsigned num_elts;
    unsigned elts[SPU_VERTS_PER_BATCH];
@@ -163,7 +165,7 @@ struct cell_command_vs
 
 struct cell_command_render
 {
-   uint opcode;       /**< CELL_CMD_RENDER */
+   uint64_t opcode;   /**< CELL_CMD_RENDER */
    uint prim_type;    /**< PIPE_PRIM_x */
    uint num_verts;
    uint vertex_size;  /**< bytes per vertex */
@@ -179,7 +181,7 @@ struct cell_command_render
 
 struct cell_command_release_verts
 {
-   int opcode;         /**< CELL_CMD_RELEASE_VERTS */
+   uint64_t opcode;         /**< CELL_CMD_RELEASE_VERTS */
    uint vertex_buf;    /**< in [0, CELL_NUM_BUFFERS-1] */
 };
 
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.c b/src/mesa/pipe/cell/ppu/cell_batch.c
index 2d032fc902..2fb49711b2 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.c
+++ b/src/mesa/pipe/cell/ppu/cell_batch.c
@@ -136,7 +136,7 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
 {
    uint size;
 
-   ASSERT(bytes % 4 == 0);
+   ASSERT(bytes % 8 == 0);
    ASSERT(bytes <= CELL_BUFFER_SIZE);
    ASSERT(cell->cur_batch >= 0);
 
@@ -171,7 +171,7 @@ cell_batch_alloc(struct cell_context *cell, uint bytes)
    void *pos;
    uint size;
 
-   ASSERT(bytes % 4 == 0);
+   ASSERT(bytes % 8 == 0);
    ASSERT(bytes <= CELL_BUFFER_SIZE);
 
    assert(cell->cur_batch >= 0);
diff --git a/src/mesa/pipe/cell/ppu/cell_flush.c b/src/mesa/pipe/cell/ppu/cell_flush.c
index cf4e676645..f62bc4650c 100644
--- a/src/mesa/pipe/cell/ppu/cell_flush.c
+++ b/src/mesa/pipe/cell/ppu/cell_flush.c
@@ -59,7 +59,7 @@ cell_flush_int(struct pipe_context *pipe, unsigned flags)
    flushing = TRUE;
 
    if (flags & PIPE_FLUSH_WAIT) {
-      uint *cmd = (uint *) cell_batch_alloc(cell, sizeof(uint));
+      uint64_t *cmd = (uint64_t *) cell_batch_alloc(cell, sizeof(uint64_t));
       *cmd = CELL_CMD_FINISH;
    }
 
diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index 3b2670f786..5d2a786449 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -37,7 +37,8 @@ static void
 emit_state_cmd(struct cell_context *cell, uint cmd,
                const void *state, uint state_size)
 {
-   uint *dst = (uint *) cell_batch_alloc(cell, sizeof(uint) + state_size);
+   uint64_t *dst = (uint64_t *) 
+       cell_batch_alloc(cell, ROUNDUP8(sizeof(uint64_t) + state_size));
    *dst = cmd;
    memcpy(dst + 1, state, state_size);
 }
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index e63b34cf52..0fee61821a 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -40,7 +40,7 @@
 
 
 /** Allow vertex data to be inlined after RENDER command */
-#define ALLOW_INLINE_VERTS 1
+#define ALLOW_INLINE_VERTS 0
 
 
 /**
@@ -197,7 +197,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
    /* build/insert batch RENDER command */
    {
-      const uint index_bytes = ROUNDUP4(nr_indices * 2);
+      const uint index_bytes = ROUNDUP8(nr_indices * 2);
       const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size;
 
       const uint batch_size = sizeof(struct cell_command_render)
diff --git a/src/mesa/pipe/cell/ppu/cell_vertex_shader.c b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
index aef329a902..80dd500b34 100644
--- a/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
+++ b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
@@ -52,8 +52,8 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
    struct cell_context *const cell =
        (struct cell_context *) draw->driver_private;
    struct cell_command_vs *const vs = &cell_global.command[0].vs;
-   unsigned *batch;
-   struct cell_array_info array_info;
+   uint64_t *batch;
+   struct cell_array_info *array_info;
    unsigned i, j;
 
    assert(draw->vs.queue_nr != 0);
@@ -63,17 +63,19 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
    draw_update_vertex_fetch(draw);
 
    for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
-      array_info.opcode = CELL_CMD_STATE_VS_ARRAY_INFO;
-      assert(draw->vertex_fetch.src_ptr[i] != NULL);
-      array_info.base = (uintptr_t) draw->vertex_fetch.src_ptr[i];
-      array_info.attr = i;
-      array_info.pitch = draw->vertex_fetch.pitch[i];
-      array_info.format = draw->vertex_element[i].src_format;
+      batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*array_info));
+
+      batch[0] = CELL_CMD_STATE_VS_ARRAY_INFO;
 
-      cell_batch_append(cell, & array_info, sizeof(array_info));
+      array_info = (struct cell_array_info *) &batch[1];
+      assert(draw->vertex_fetch.src_ptr[i] != NULL);
+      array_info->base = (uintptr_t) draw->vertex_fetch.src_ptr[i];
+      array_info->attr = i;
+      array_info->pitch = draw->vertex_fetch.pitch[i];
+      array_info->format = draw->vertex_element[i].src_format;
    }
 
-   batch = cell_batch_alloc(cell, sizeof(unsigned)
+   batch = cell_batch_alloc(cell, sizeof(batch[0])
                             + sizeof(struct pipe_viewport_state));
    batch[0] = CELL_CMD_STATE_VIEWPORT;
    (void) memcpy(&batch[1], &draw->viewport,
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index b0311db1aa..4f126d5e5b 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -31,7 +31,6 @@
 
 #include <stdio.h>
 #include <libmisc.h>
-#include <vec_literal.h>
 
 #include "spu_main.h"
 #include "spu_render.h"
@@ -220,13 +219,13 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
       spu.fb.zsize = 0;
 
    if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM)
-      spu.color_shuffle = VEC_LITERAL(vector unsigned char,
-                                      12, 0, 4, 8, 0, 0, 0, 0, 
-                                      0, 0, 0, 0, 0, 0, 0, 0);
+      spu.color_shuffle = ((vector unsigned char) {
+                              12, 0, 4, 8, 0, 0, 0, 0, 
+                              0, 0, 0, 0, 0, 0, 0, 0});
    else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM)
-      spu.color_shuffle = VEC_LITERAL(vector unsigned char,
-                                      8, 4, 0, 12, 0, 0, 0, 0, 
-                                      0, 0, 0, 0, 0, 0, 0, 0);
+      spu.color_shuffle = ((vector unsigned char) {
+                              8, 4, 0, 12, 0, 0, 0, 0, 
+                              0, 0, 0, 0, 0, 0, 0, 0});
    else
       ASSERT(0);
 }
@@ -279,16 +278,10 @@ cmd_state_texture(const struct cell_command_texture *texture)
              spu.init.id, texture->start, texture->width, texture->height);
 
    memcpy(&spu.texture, texture, sizeof(*texture));
-   spu.tex_size = VEC_LITERAL(vector float,
-                              spu.texture.width,
-                              spu.texture.height,
-                              0.0,
-                              0.0);
-   spu.tex_size_mask = VEC_LITERAL(vector unsigned int,
-                                   spu.texture.width - 1,
-                                   spu.texture.height - 1,
-                                   0,
-                                   0);
+   spu.tex_size = (vector float)
+      { spu.texture.width, spu.texture.height, 0.0, 0.0};
+   spu.tex_size_mask = (vector unsigned int)
+      { spu.texture.width - 1, spu.texture.height - 1, 0, 0 };
 }
 
 
@@ -341,8 +334,8 @@ cmd_batch(uint opcode)
 {
    const uint buf = (opcode >> 8) & 0xff;
    uint size = (opcode >> 16);
-   uint buffer[CELL_BUFFER_SIZE / 4] ALIGN16_ATTRIB;
-   const uint usize = size / sizeof(uint);
+   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
+   const unsigned usize = size / sizeof(buffer[0]);
    uint pos;
 
    if (Debug)
@@ -377,7 +370,7 @@ cmd_batch(uint opcode)
             struct cell_command_framebuffer *fb
                = (struct cell_command_framebuffer *) &buffer[pos];
             cmd_state_framebuffer(fb);
-            pos += sizeof(*fb) / 4;
+            pos += sizeof(*fb) / 8;
          }
          break;
       case CELL_CMD_CLEAR_SURFACE:
@@ -385,7 +378,7 @@ cmd_batch(uint opcode)
             struct cell_command_clear_surface *clr
                = (struct cell_command_clear_surface *) &buffer[pos];
             cmd_clear_surface(clr);
-            pos += sizeof(*clr) / 4;
+            pos += sizeof(*clr) / 8;
          }
          break;
       case CELL_CMD_RENDER:
@@ -394,7 +387,7 @@ cmd_batch(uint opcode)
                = (struct cell_command_render *) &buffer[pos];
             uint pos_incr;
             cmd_render(render, &pos_incr);
-            pos += sizeof(*render) / 4 + pos_incr;
+            pos += sizeof(*render) / 8 + ((pos_incr + 1) / 2);
          }
          break;
       case CELL_CMD_RELEASE_VERTS:
@@ -402,8 +395,7 @@ cmd_batch(uint opcode)
             struct cell_command_release_verts *release
                = (struct cell_command_release_verts *) &buffer[pos];
             cmd_release_verts(release);
-            ASSERT(sizeof(*release) == 8);
-            pos += sizeof(*release) / 4;
+            pos += sizeof(*release) / 8;
          }
          break;
       case CELL_CMD_FINISH:
@@ -413,36 +405,36 @@ cmd_batch(uint opcode)
       case CELL_CMD_STATE_BLEND:
          cmd_state_blend((struct pipe_blend_state *)
                                  &buffer[pos+1]);
-         pos += (1 + sizeof(struct pipe_blend_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_blend_state)) / 8);
          break;
       case CELL_CMD_STATE_DEPTH_STENCIL:
          cmd_state_depth_stencil((struct pipe_depth_stencil_alpha_state *)
                                  &buffer[pos+1]);
-         pos += (1 + sizeof(struct pipe_depth_stencil_alpha_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_depth_stencil_alpha_state)) / 8);
          break;
       case CELL_CMD_STATE_SAMPLER:
          cmd_state_sampler((struct pipe_sampler_state *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct pipe_sampler_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_sampler_state)) / 8);
          break;
       case CELL_CMD_STATE_TEXTURE:
          cmd_state_texture((struct cell_command_texture *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct cell_command_texture) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_command_texture)) / 8);
          break;
       case CELL_CMD_STATE_VERTEX_INFO:
          cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct vertex_info) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
          break;
       case CELL_CMD_STATE_VIEWPORT:
          (void) memcpy(& draw.viewport, &buffer[pos+1],
                        sizeof(struct pipe_viewport_state));
-         pos += (1 + sizeof(struct pipe_viewport_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
          break;
       case CELL_CMD_STATE_VS_ARRAY_INFO:
-         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos]);
-         pos += (sizeof(struct cell_array_info) / 4);
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
       default:
-         printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, buffer[pos]);
+         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
          ASSERT(0);
          break;
       }
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 1e846868e3..5b0f2a6470 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -431,9 +431,8 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
    /* loop over vertex attributes (vertex shader inputs)
     */
    for (attr = 0; attr < nr_attrs; attr++) {
-
-      const unsigned pitch   = draw->vertex_fetch.pitch[attr];
-      const ubyte *src = draw->vertex_fetch.src_ptr[attr];
+      const unsigned pitch = draw->vertex_fetch.pitch[attr];
+      const uint64_t src = draw->vertex_fetch.src_ptr[attr];
       const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
       unsigned i;
       float p[4][4];
@@ -447,7 +446,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
        */
       for (i = 0; i < count; i++) {
          uint8_t buffer[32] ALIGN16_ATTRIB;
-         const unsigned long addr = src + (elts[i] * pitch);
+         const uint64_t addr = src + (elts[i] * pitch);
          const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
 
          mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
index c52f38fd02..b261ab44a2 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.h
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -16,7 +16,7 @@ struct spu_vs_context {
    struct pipe_viewport_state viewport;
 
    struct {
-      const ubyte *src_ptr[PIPE_ATTRIB_MAX];
+      uint64_t src_ptr[PIPE_ATTRIB_MAX];
       unsigned pitch[PIPE_ATTRIB_MAX];
       enum pipe_format format[PIPE_ATTRIB_MAX];
       unsigned nr_attrs;
-- 
cgit v1.2.3


From 7a1b2f4078789aedf16158c41682c9d28a531d20 Mon Sep 17 00:00:00 2001
From: Michal Krol <michal@tungstengraphics.com>
Date: Tue, 5 Feb 2008 07:50:56 -0700
Subject: gallium: Use align_free to free aligned memory.

---
 src/mesa/pipe/draw/draw_context.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/pipe/draw/draw_context.c b/src/mesa/pipe/draw/draw_context.c
index 87f4969983..b15f57c824 100644
--- a/src/mesa/pipe/draw/draw_context.c
+++ b/src/mesa/pipe/draw/draw_context.c
@@ -106,7 +106,7 @@ void draw_destroy( struct draw_context *draw )
    if (draw->pipeline.rasterize)
       draw->pipeline.rasterize->destroy( draw->pipeline.rasterize );
    tgsi_exec_machine_free_data(&draw->machine);
-   FREE( draw->vcache.vertex[0] ); /* Frees all the vertices. */
+   align_free( draw->vcache.vertex[0] ); /* Frees all the vertices. */
    FREE( draw );
 }
 
-- 
cgit v1.2.3


From e9147bfab40d26fb8f8c0794e9a3fdcf14ca57dd Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Wed, 6 Feb 2008 00:26:49 +1100
Subject: nv40: cleanup state handling a bit

---
 src/mesa/pipe/nouveau/nouveau_stateobj.h | 139 +++++++++
 src/mesa/pipe/nv40/nv40_context.h        |  18 +-
 src/mesa/pipe/nv40/nv40_fragprog.c       |  18 +-
 src/mesa/pipe/nv40/nv40_fragtex.c        |  38 +--
 src/mesa/pipe/nv40/nv40_state.c          | 464 ++++++++++++++++---------------
 src/mesa/pipe/nv40/nv40_state.h          |  68 +----
 src/mesa/pipe/nv40/nv40_state_emit.c     |  70 +----
 src/mesa/pipe/nv40/nv40_vbo.c            |  81 +++---
 8 files changed, 462 insertions(+), 434 deletions(-)
 create mode 100644 src/mesa/pipe/nouveau/nouveau_stateobj.h

diff --git a/src/mesa/pipe/nouveau/nouveau_stateobj.h b/src/mesa/pipe/nouveau/nouveau_stateobj.h
new file mode 100644
index 0000000000..8dfc0e9e9a
--- /dev/null
+++ b/src/mesa/pipe/nouveau/nouveau_stateobj.h
@@ -0,0 +1,193 @@
+#ifndef __NOUVEAU_STATEOBJ_H__
+#define __NOUVEAU_STATEOBJ_H__
+
+#include <stdlib.h>
+#include <string.h>
+
+/* A relocation recorded against one dword of a state object's push data. */
+struct nouveau_stateobj_reloc {
+	struct pipe_buffer *bo;
+
+	unsigned offset;	/* dword index into push[] of the relocated word */
+	unsigned packet;	/* one-dword method header addressing that word */
+
+	unsigned data;
+	unsigned flags;
+	unsigned vor;
+	unsigned tor;
+};
+
+/* A pre-built sequence of pushbuf dwords plus the relocations they need. */
+struct nouveau_stateobj {
+	int refcount;
+
+	unsigned *push;
+	struct nouveau_stateobj_reloc *reloc;
+
+	unsigned *cur;		/* next free dword in push[] */
+	unsigned cur_packet;	/* method header for the dword at cur */
+	unsigned cur_reloc;	/* number of entries used in reloc[] */
+};
+
+/* Allocate a state object with room for "push" dwords and "reloc" relocations.
+ * The new object has a refcount of zero.  Returns NULL if allocation fails.
+ */
+static inline struct nouveau_stateobj *
+so_new(unsigned push, unsigned reloc)
+{
+	struct nouveau_stateobj *so;
+
+	so = malloc(sizeof(*so));
+	if (!so)
+		return NULL;
+
+	so->refcount = 0;
+	so->push = malloc(sizeof(*so->push) * push);
+	so->reloc = malloc(sizeof(*so->reloc) * reloc);
+	/* malloc(0) may return NULL, so a NULL array is only an error when a
+	 * non-zero number of elements was requested.
+	 */
+	if ((push && !so->push) || (reloc && !so->reloc)) {
+		free(so->push);
+		free(so->reloc);
+		free(so);
+		return NULL;
+	}
+
+	so->cur = so->push;
+	so->cur_reloc = so->cur_packet = 0;
+
+	return so;
+}
+
+/* Make *pso point at "ref" (which may be NULL), dropping the reference *pso
+ * previously held.  The old object is freed once its refcount reaches zero
+ * or below.
+ */
+static inline void
+so_ref(struct nouveau_stateobj *ref, struct nouveau_stateobj **pso)
+{
+	struct nouveau_stateobj *so;
+
+	so = *pso;
+	/* Re-assigning the object already held must be a no-op; dropping the
+	 * old reference first could free "ref" before it is re-referenced.
+	 */
+	if (so == ref)
+		return;
+
+	if (so) {
+		if (--so->refcount <= 0) {
+			free(so->push);
+			free(so->reloc);
+			free(so);
+		}
+		*pso = NULL;
+	}
+
+	if (ref) {
+		ref->refcount++;
+		*pso = ref;
+	}
+}
+
+/* Append one dword to the push data and step cur_packet to the next method. */
+static inline void
+so_data(struct nouveau_stateobj *so, unsigned data)
+{
+	(*so->cur++) = (data);
+	so->cur_packet += 4;
+}
+
+/* Start a packet of "size" dwords at method "mthd" on gr's subchannel. */
+static inline void
+so_method(struct nouveau_stateobj *so, struct nouveau_grobj *gr,
+	  unsigned mthd, unsigned size)
+{
+	/* Seed cur_packet one method early: the so_data() call for the header
+	 * adds 4, leaving a one-dword header for "mthd" itself.
+	 */
+	so->cur_packet = (gr->subc << 13) | (1 << 18) | (mthd - 4);
+	so_data(so, (gr->subc << 13) | (size << 18) | mthd);
+}
+
+/* Append a dword that refers to buffer "bo" and record a relocation for it.
+ * data/flags/vor/tor are stored and handed to push_reloc() at emit time.
+ */
+static inline void
+so_reloc(struct nouveau_stateobj *so, struct pipe_buffer *bo,
+	 unsigned data, unsigned flags, unsigned vor, unsigned tor)
+{
+	struct nouveau_stateobj_reloc *r = &so->reloc[so->cur_reloc++];
+
+	r->bo = bo;
+	r->offset = so->cur - so->push;
+	r->packet = so->cur_packet;
+	r->data = data;
+	r->flags = flags;
+	r->vor = vor;
+	r->tor = tor;
+	so_data(so, data);
+}
+
+/* Copy the state object's dwords into the channel's pushbuf (flushing first
+ * if there is not enough room) and register its relocations.  A NULL "so" is
+ * ignored.
+ */
+static inline void
+so_emit(struct nouveau_winsys *nvws, struct nouveau_stateobj *so)
+{
+	struct nouveau_pushbuf *pb;
+	unsigned nr, i;
+
+	if (!so)
+		return;
+
+	pb = nvws->channel->pushbuf;
+	nr = so->cur - so->push;
+	if (pb->remaining < nr)
+		nvws->push_flush(nvws->channel, nr);
+	pb->remaining -= nr;
+
+	memcpy(pb->cur, so->push, nr * sizeof(*so->push));
+	for (i = 0; i < so->cur_reloc; i++) {
+		struct nouveau_stateobj_reloc *r = &so->reloc[i];
+
+		nvws->push_reloc(nvws->channel, pb->cur + r->offset, r->bo,
+				 r->data, r->flags, r->vor, r->tor);
+	}
+	pb->cur += nr;
+}
+
+/* Emit two NOUVEAU_BO_DUMMY relocations (method header, then data word) for
+ * every relocation recorded in "so".  A NULL "so" is ignored, so slots that
+ * have no state object bound can be passed in directly.
+ */
+static inline void
+so_emit_reloc_markers(struct nouveau_winsys *nvws, struct nouveau_stateobj *so)
+{
+	struct nouveau_pushbuf *pb;
+	unsigned i;
+
+	if (!so)
+		return;
+
+	pb = nvws->channel->pushbuf;
+	i = so->cur_reloc << 1;
+	if (pb->remaining < i)
+		nvws->push_flush(nvws->channel, i);
+	pb->remaining -= i;
+
+	for (i = 0; i < so->cur_reloc; i++) {
+		struct nouveau_stateobj_reloc *r = &so->reloc[i];
+
+		nvws->push_reloc(nvws->channel, pb->cur++, r->bo, r->packet,
+				 (r->flags &
+				  (NOUVEAU_BO_VRAM | NOUVEAU_BO_GART)) |
+				 NOUVEAU_BO_DUMMY, 0, 0);
+		nvws->push_reloc(nvws->channel, pb->cur++, r->bo, r->data,
+				 r->flags | NOUVEAU_BO_DUMMY, r->vor, r->tor);
+	}
+}
+
+#endif
diff --git a/src/mesa/pipe/nv40/nv40_context.h b/src/mesa/pipe/nv40/nv40_context.h
index 2d069619d7..1a31f00ad6 100644
--- a/src/mesa/pipe/nv40/nv40_context.h
+++ b/src/mesa/pipe/nv40/nv40_context.h
@@ -13,6 +13,7 @@
 #define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
 	struct nv40_context *ctx = nv40
 #include "pipe/nouveau/nouveau_push.h"
+#include "pipe/nouveau/nouveau_stateobj.h"
 
 #include "nv40_state.h"
 
@@ -47,20 +48,9 @@ struct nv40_context {
 	unsigned fp_samplers;
 	unsigned vp_samplers;
 
-	uint32_t rt_enable;
-	struct pipe_buffer *rt[4];
-	struct pipe_buffer *zeta;
-
-	struct {
-		struct pipe_buffer *buffer;
-		uint32_t format;
-	} tex[16];
-
-	unsigned vb_enable;
-	struct {
-		struct pipe_buffer *buffer;
-		unsigned delta;
-	} vb[16];
+	struct nouveau_stateobj *so_framebuffer;
+	struct nouveau_stateobj *so_fragtex[16];
+	struct nouveau_stateobj *so_vtxbuf;
 
 	struct {
 		struct nouveau_resource *exec_heap;
diff --git a/src/mesa/pipe/nv40/nv40_fragprog.c b/src/mesa/pipe/nv40/nv40_fragprog.c
index 667eb89cb2..e650c97541 100644
--- a/src/mesa/pipe/nv40/nv40_fragprog.c
+++ b/src/mesa/pipe/nv40/nv40_fragprog.c
@@ -759,6 +759,7 @@ void
 nv40_fragprog_bind(struct nv40_context *nv40, struct nv40_fragment_program *fp)
 {
 	struct pipe_winsys *ws = nv40->pipe.winsys;
+	struct nouveau_stateobj *so;
 	int i;
 
 	if (!fp->translated) {
@@ -815,13 +816,16 @@ nv40_fragprog_bind(struct nv40_context *nv40, struct nv40_fragment_program *fp)
 		fp->on_hw = TRUE;
 	}
 
-	BEGIN_RING(curie, NV40TCL_FP_ADDRESS, 1);
-	OUT_RELOC (fp->buffer, 0, NOUVEAU_BO_VRAM |
-	           NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
-		   NOUVEAU_BO_OR, NV40TCL_FP_ADDRESS_DMA0,
-		   NV40TCL_FP_ADDRESS_DMA1);
-	BEGIN_RING(curie, NV40TCL_FP_CONTROL, 1);
-	OUT_RING  (fp->fp_control);
+	so = so_new(4, 1);
+	so_method(so, nv40->curie, NV40TCL_FP_ADDRESS, 1);
+	so_reloc (so, fp->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+		  NV40TCL_FP_ADDRESS_DMA0, NV40TCL_FP_ADDRESS_DMA1);
+	so_method(so, nv40->curie, NV40TCL_FP_CONTROL, 1);
+	so_data  (so, fp->fp_control);
+
+	so_emit(nv40->nvws, so);
+	so_ref(so, &fp->so);
 
 	nv40->fragprog.active = fp;
 }
diff --git a/src/mesa/pipe/nv40/nv40_fragtex.c b/src/mesa/pipe/nv40/nv40_fragtex.c
index 7c5ecd5c56..283d49704a 100644
--- a/src/mesa/pipe/nv40/nv40_fragtex.c
+++ b/src/mesa/pipe/nv40/nv40_fragtex.c
@@ -59,8 +59,10 @@ nv40_fragtex_build(struct nv40_context *nv40, int unit)
 	struct nv40_miptree *nv40mt = nv40->tex_miptree[unit];
 	struct pipe_texture *pt = &nv40mt->base;
 	struct nv40_texture_format *tf;
+	struct nouveau_stateobj *so;
 	uint32_t txf, txs, txp;
 	int swizzled = 0; /*XXX: implement in region code? */
+	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
 
 	tf = nv40_fragtex_format(pt->format);
 	if (!tf)
@@ -101,25 +103,24 @@ nv40_fragtex_build(struct nv40_context *nv40, int unit)
 
 	txs = tf->swizzle;
 
-	nv40->tex[unit].buffer = nv40mt->buffer;
-	nv40->tex[unit].format = txf;
-
-	BEGIN_RING(curie, NV40TCL_TEX_OFFSET(unit), 8);
-	OUT_RELOCl(nv40->tex[unit].buffer, 0, NOUVEAU_BO_VRAM |
-		   NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-	OUT_RELOCd(nv40->tex[unit].buffer, nv40->tex[unit].format,
-		   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
-		   NOUVEAU_BO_OR, NV40TCL_TEX_FORMAT_DMA0,
-		   NV40TCL_TEX_FORMAT_DMA1);
-	OUT_RING  (ps->wrap);
-	OUT_RING  (NV40TCL_TEX_ENABLE_ENABLE | ps->en |
+	so = so_new(16, 2);
+	so_method(so, nv40->curie, NV40TCL_TEX_OFFSET(unit), 8);
+	so_reloc (so, nv40mt->buffer, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
+	so_reloc (so, nv40mt->buffer, txf, tex_flags | NOUVEAU_BO_OR,
+		  NV40TCL_TEX_FORMAT_DMA0, NV40TCL_TEX_FORMAT_DMA1);
+	so_data  (so, ps->wrap);
+	so_data  (so, NV40TCL_TEX_ENABLE_ENABLE | ps->en |
 		   (0x00078000) /* mipmap related? */);
-	OUT_RING  (txs);
-	OUT_RING  (ps->filt | 0x3fd6 /*voodoo*/);
-	OUT_RING  ((pt->width[0] << NV40TCL_TEX_SIZE0_W_SHIFT) | pt->height[0]);
-	OUT_RING  (ps->bcol);
-	BEGIN_RING(curie, NV40TCL_TEX_SIZE1(unit), 1);
-	OUT_RING  ((pt->depth[0] << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp);
+	so_data  (so, txs);
+	so_data  (so, ps->filt | 0x3fd6 /*voodoo*/);
+	so_data  (so, (pt->width[0] << NV40TCL_TEX_SIZE0_W_SHIFT) |
+		       pt->height[0]);
+	so_data  (so, ps->bcol);
+	so_method(so, nv40->curie, NV40TCL_TEX_SIZE1(unit), 1);
+	so_data  (so, (pt->depth[0] << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp);
+
+	so_emit(nv40->nvws, so);
+	so_ref (so, &nv40->so_fragtex[unit]);
 }
 
 void
@@ -133,6 +134,7 @@ nv40_fragtex_bind(struct nv40_context *nv40)
 		unit = ffs(samplers) - 1;
 		samplers &= ~(1 << unit);
 
+		so_ref(NULL, &nv40->so_fragtex[unit]);
 		BEGIN_RING(curie, NV40TCL_TEX_ENABLE(unit), 1);
 		OUT_RING  (0);
 	}
diff --git a/src/mesa/pipe/nv40/nv40_state.c b/src/mesa/pipe/nv40/nv40_state.c
index bb435b106b..125134afdc 100644
--- a/src/mesa/pipe/nv40/nv40_state.c
+++ b/src/mesa/pipe/nv40/nv40_state.c
@@ -9,59 +9,59 @@ static void *
 nv40_blend_state_create(struct pipe_context *pipe,
 			const struct pipe_blend_state *cso)
 {
-	struct nv40_blend_state *cb;
-
-	cb = malloc(sizeof(struct nv40_blend_state));
-
-	cb->b_enable = cso->blend_enable ? 1 : 0;
-	cb->b_srcfunc = ((nvgl_blend_func(cso->alpha_src_factor)<<16) |
-			 (nvgl_blend_func(cso->rgb_src_factor)));
-	cb->b_dstfunc = ((nvgl_blend_func(cso->alpha_dst_factor)<<16) |
-			 (nvgl_blend_func(cso->rgb_dst_factor)));
-	cb->b_eqn = ((nvgl_blend_eqn(cso->alpha_func) << 16) |
-		     (nvgl_blend_eqn(cso->rgb_func)));
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nouveau_stateobj *so = so_new(16, 0);
+
+	if (cso->blend_enable) {
+		so_method(so, nv40->curie, NV40TCL_BLEND_ENABLE, 3);
+		so_data  (so, 1);
+		so_data  (so, (nvgl_blend_func(cso->alpha_src_factor) << 16) |
+			       nvgl_blend_func(cso->rgb_src_factor));
+		so_data  (so, nvgl_blend_func(cso->alpha_dst_factor) << 16 |
+			      nvgl_blend_func(cso->rgb_dst_factor));
+		so_method(so, nv40->curie, NV40TCL_BLEND_EQUATION, 1);
+		so_data  (so, nvgl_blend_eqn(cso->alpha_func) << 16 |
+			      nvgl_blend_eqn(cso->rgb_func));
+	} else {
+		so_method(so, nv40->curie, NV40TCL_BLEND_ENABLE, 1);
+		so_data  (so, 0);
+	}
 
-	cb->l_enable = cso->logicop_enable ? 1 : 0;
-	cb->l_op = nvgl_logicop_func(cso->logicop_func);
+	so_method(so, nv40->curie, NV40TCL_COLOR_MASK, 1);
+	so_data  (so, (((cso->colormask & PIPE_MASK_A) ? (0x01 << 24) : 0) |
+		       ((cso->colormask & PIPE_MASK_R) ? (0x01 << 16) : 0) |
+		       ((cso->colormask & PIPE_MASK_G) ? (0x01 <<  8) : 0) |
+		       ((cso->colormask & PIPE_MASK_B) ? (0x01 <<  0) : 0)));
 
-	cb->c_mask = (((cso->colormask & PIPE_MASK_A) ? (0x01<<24) : 0) |
-		      ((cso->colormask & PIPE_MASK_R) ? (0x01<<16) : 0) |
-		      ((cso->colormask & PIPE_MASK_G) ? (0x01<< 8) : 0) |
-		      ((cso->colormask & PIPE_MASK_B) ? (0x01<< 0) : 0));
+	if (cso->logicop_enable) {
+		so_method(so, nv40->curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 2);
+		so_data  (so, 1);
+		so_data  (so, nvgl_logicop_func(cso->logicop_func));
+	} else {
+		so_method(so, nv40->curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 1);
+		so_data  (so, 0);
+	}
 
-	cb->d_enable = cso->dither ? 1 : 0;
+	so_method(so, nv40->curie, NV40TCL_DITHER_ENABLE, 1);
+	so_data  (so, cso->dither ? 1 : 0);
 
-	return (void *)cb;
+	return (void *)so;
 }
 
 static void
 nv40_blend_state_bind(struct pipe_context *pipe, void *hwcso)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
-	struct nv40_blend_state *cb = hwcso;
 
-	BEGIN_RING(curie, NV40TCL_DITHER_ENABLE, 1);
-	OUT_RING  (cb->d_enable);
-
-	BEGIN_RING(curie, NV40TCL_BLEND_ENABLE, 3);
-	OUT_RING  (cb->b_enable);
-	OUT_RING  (cb->b_srcfunc);
-	OUT_RING  (cb->b_dstfunc);
-	BEGIN_RING(curie, NV40TCL_BLEND_EQUATION, 1);
-	OUT_RING  (cb->b_eqn);
-
-	BEGIN_RING(curie, NV40TCL_COLOR_MASK, 1);
-	OUT_RING  (cb->c_mask);
-
-	BEGIN_RING(curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 2);
-	OUT_RING  (cb->l_enable);
-	OUT_RING  (cb->l_op);
+	so_emit(nv40->nvws, hwcso);
 }
 
 static void
 nv40_blend_state_delete(struct pipe_context *pipe, void *hwcso)
 {
-	free(hwcso);
+	struct nouveau_stateobj *so = hwcso;
+
+	so_ref(NULL, &so);
 }
 
 
@@ -261,8 +261,8 @@ static void *
 nv40_rasterizer_state_create(struct pipe_context *pipe,
 			     const struct pipe_rasterizer_state *cso)
 {
-	struct nv40_rasterizer_state *rs;
-	int i;
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nouveau_stateobj *so = so_new(32, 0);
 
 	/*XXX: ignored:
 	 * 	light_twoside
@@ -272,165 +272,163 @@ nv40_rasterizer_state_create(struct pipe_context *pipe,
 	 * 	multisample
 	 * 	offset_units / offset_scale
 	 */
-	rs = malloc(sizeof(struct nv40_rasterizer_state));
-
-	rs->shade_model = cso->flatshade ? 0x1d00 : 0x1d01;
 
-	rs->line_width = (unsigned char)(cso->line_width * 8.0) & 0xff;
-	rs->line_smooth_en = cso->line_smooth ? 1 : 0;
-	rs->line_stipple_en = cso->line_stipple_enable ? 1 : 0;
-	rs->line_stipple = (cso->line_stipple_pattern << 16) |
-			    cso->line_stipple_factor;
+	so_method(so, nv40->curie, NV40TCL_SHADE_MODEL, 1);
+	so_data  (so, cso->flatshade ? NV40TCL_SHADE_MODEL_FLAT :
+				       NV40TCL_SHADE_MODEL_SMOOTH);
 
-	rs->point_size = *(uint32_t*)&cso->point_size;
+	so_method(so, nv40->curie, NV40TCL_LINE_WIDTH, 2);
+	so_data  (so, (unsigned char)(cso->line_width * 8.0) & 0xff);
+	so_data  (so, cso->line_smooth ? 1 : 0);
+	so_method(so, nv40->curie, NV40TCL_LINE_STIPPLE_ENABLE, 2);
+	so_data  (so, cso->line_stipple_enable ? 1 : 0);
+	so_data  (so, (cso->line_stipple_pattern << 16) |
+		       cso->line_stipple_factor);
 
-	rs->poly_smooth_en = cso->poly_smooth ? 1 : 0;
-	rs->poly_stipple_en = cso->poly_stipple_enable ? 1 : 0;
+	so_method(so, nv40->curie, NV40TCL_POINT_SIZE, 1);
+	so_data  (so, fui(cso->point_size));
 
+	so_method(so, nv40->curie, NV40TCL_POLYGON_MODE_FRONT, 6);
 	if (cso->front_winding == PIPE_WINDING_CCW) {
-		rs->front_face = NV40TCL_FRONT_FACE_CCW;
-		rs->poly_mode_front = nvgl_polygon_mode(cso->fill_ccw);
-		rs->poly_mode_back  = nvgl_polygon_mode(cso->fill_cw);
+		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
+		so_data(so, nvgl_polygon_mode(cso->fill_cw));
+		switch (cso->cull_mode) {
+		case PIPE_WINDING_CCW:
+			so_data(so, NV40TCL_CULL_FACE_FRONT);
+			break;
+		case PIPE_WINDING_CW:
+			so_data(so, NV40TCL_CULL_FACE_BACK);
+			break;
+		case PIPE_WINDING_BOTH:
+			so_data(so, NV40TCL_CULL_FACE_FRONT_AND_BACK);
+			break;
+		default:
+			so_data(so, 0);
+			break;
+		}
+		so_data(so, NV40TCL_FRONT_FACE_CCW);
 	} else {
-		rs->front_face = NV40TCL_FRONT_FACE_CW;
-		rs->poly_mode_front = nvgl_polygon_mode(cso->fill_cw);
-		rs->poly_mode_back  = nvgl_polygon_mode(cso->fill_ccw);
+		so_data(so, nvgl_polygon_mode(cso->fill_cw));
+		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
+		switch (cso->cull_mode) {
+		case PIPE_WINDING_CCW:
+			so_data(so, NV40TCL_CULL_FACE_BACK);
+			break;
+		case PIPE_WINDING_CW:
+			so_data(so, NV40TCL_CULL_FACE_FRONT);
+			break;
+		case PIPE_WINDING_BOTH:
+			so_data(so, NV40TCL_CULL_FACE_FRONT_AND_BACK);
+			break;
+		default:
+			so_data(so, 0);
+			break;
+		}
+		so_data(so, NV40TCL_FRONT_FACE_CW);
 	}
+	so_data(so, cso->poly_smooth ? 1 : 0);
+	so_data(so, cso->cull_mode != PIPE_WINDING_NONE ? 1 : 0);
 
-	switch (cso->cull_mode) {
-	case PIPE_WINDING_CCW:
-		rs->cull_face_en = 1;
-		if (cso->front_winding == PIPE_WINDING_CCW)
-			rs->cull_face    = NV40TCL_CULL_FACE_FRONT;
-		else
-			rs->cull_face    = NV40TCL_CULL_FACE_BACK;
-		break;
-	case PIPE_WINDING_CW:
-		rs->cull_face_en = 1;
-		if (cso->front_winding == PIPE_WINDING_CW)
-			rs->cull_face    = NV40TCL_CULL_FACE_FRONT;
-		else
-			rs->cull_face    = NV40TCL_CULL_FACE_BACK;
-		break;
-	case PIPE_WINDING_BOTH:
-		rs->cull_face_en = 1;
-		rs->cull_face    = NV40TCL_CULL_FACE_FRONT_AND_BACK;
-		break;
-	case PIPE_WINDING_NONE:
-	default:
-		rs->cull_face_en = 0;
-		rs->cull_face    = 0;
-		break;
-	}
+	so_method(so, nv40->curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
+	so_data  (so, cso->poly_stipple_enable ? 1 : 0);
 
+	so_method(so, nv40->curie, NV40TCL_POINT_SPRITE, 1);
 	if (cso->point_sprite) {
-		rs->point_sprite = (1 << 0);
+		unsigned psctl = (1 << 0), i;
+
 		for (i = 0; i < 8; i++) {
 			if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE)
-				rs->point_sprite |= (1 << (8 + i));
+				psctl |= (1 << (8 + i));
 		}
+
+		so_data(so, psctl);
 	} else {
-		rs->point_sprite = 0;
+		so_data(so, 0);
 	}
 
-	return (void *)rs;
+	return (void *)so;
 }
 
 static void
 nv40_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
-	struct nv40_rasterizer_state *rs = hwcso;
-
-	BEGIN_RING(curie, NV40TCL_SHADE_MODEL, 1);
-	OUT_RING  (rs->shade_model);
-
-	BEGIN_RING(curie, NV40TCL_LINE_WIDTH, 2);
-	OUT_RING  (rs->line_width);
-	OUT_RING  (rs->line_smooth_en);
-	BEGIN_RING(curie, NV40TCL_LINE_STIPPLE_ENABLE, 2);
-	OUT_RING  (rs->line_stipple_en);
-	OUT_RING  (rs->line_stipple);
-
-	BEGIN_RING(curie, NV40TCL_POINT_SIZE, 1);
-	OUT_RING  (rs->point_size);
-
-	BEGIN_RING(curie, NV40TCL_POLYGON_MODE_FRONT, 6);
-	OUT_RING  (rs->poly_mode_front);
-	OUT_RING  (rs->poly_mode_back);
-	OUT_RING  (rs->cull_face);
-	OUT_RING  (rs->front_face);
-	OUT_RING  (rs->poly_smooth_en);
-	OUT_RING  (rs->cull_face_en);
-
-	BEGIN_RING(curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
-	OUT_RING  (rs->poly_stipple_en);
-
-	BEGIN_RING(curie, NV40TCL_POINT_SPRITE, 1);
-	OUT_RING  (rs->point_sprite);
+
+	so_emit(nv40->nvws, hwcso);
 }
 
 static void
 nv40_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
 {
-	free(hwcso);
-}
+	struct nouveau_stateobj *so = hwcso;
 
-static void
-nv40_translate_stencil(const struct pipe_depth_stencil_alpha_state *cso,
-		       unsigned idx, struct nv40_stencil_push *hw)
-{
-	hw->enable = cso->stencil[idx].enabled ? 1 : 0;
-	hw->wmask = cso->stencil[idx].write_mask;
-	hw->func = nvgl_comparison_op(cso->stencil[idx].func);
-	hw->ref	= cso->stencil[idx].ref_value;
-	hw->vmask = cso->stencil[idx].value_mask;
-	hw->fail = nvgl_stencil_op(cso->stencil[idx].fail_op);
-	hw->zfail = nvgl_stencil_op(cso->stencil[idx].zfail_op);
-	hw->zpass = nvgl_stencil_op(cso->stencil[idx].zpass_op);
+	so_ref(NULL, &so);
 }
 
 static void *
 nv40_depth_stencil_alpha_state_create(struct pipe_context *pipe,
 			const struct pipe_depth_stencil_alpha_state *cso)
 {
-	struct nv40_depth_stencil_alpha_state *hw;
-
-	hw = malloc(sizeof(struct nv40_depth_stencil_alpha_state));
-
-	hw->depth.func		= nvgl_comparison_op(cso->depth.func);
-	hw->depth.write_enable	= cso->depth.writemask ? 1 : 0;
-	hw->depth.test_enable	= cso->depth.enabled ? 1 : 0;
-
-	nv40_translate_stencil(cso, 0, &hw->stencil.front);
-	nv40_translate_stencil(cso, 1, &hw->stencil.back);
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nouveau_stateobj *so = so_new(32, 0);
+
+	so_method(so, nv40->curie, NV40TCL_DEPTH_FUNC, 3);
+	so_data  (so, nvgl_comparison_op(cso->depth.func));
+	so_data  (so, cso->depth.writemask ? 1 : 0);
+	so_data  (so, cso->depth.enabled ? 1 : 0);
+
+	so_method(so, nv40->curie, NV40TCL_ALPHA_TEST_ENABLE, 3);
+	so_data  (so, cso->alpha.enabled ? 1 : 0);
+	so_data  (so, nvgl_comparison_op(cso->alpha.func));
+	so_data  (so, float_to_ubyte(cso->alpha.ref));
+
+	if (cso->stencil[0].enabled) {
+		so_method(so, nv40->curie, NV40TCL_STENCIL_FRONT_ENABLE, 8);
+		so_data  (so, cso->stencil[0].enabled ? 1 : 0);
+		so_data  (so, cso->stencil[0].write_mask);
+		so_data  (so, nvgl_comparison_op(cso->stencil[0].func));
+		so_data  (so, cso->stencil[0].ref_value);
+		so_data  (so, cso->stencil[0].value_mask);
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].fail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].zfail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].zpass_op));
+	} else {
+		so_method(so, nv40->curie, NV40TCL_STENCIL_FRONT_ENABLE, 1);
+		so_data  (so, 0);
+	}
 
-	hw->alpha.enabled = cso->alpha.enabled ? 1 : 0;
-	hw->alpha.func = nvgl_comparison_op(cso->alpha.func);
-	hw->alpha.ref  = float_to_ubyte(cso->alpha.ref);
+	if (cso->stencil[1].enabled) {
+		so_method(so, nv40->curie, NV40TCL_STENCIL_BACK_ENABLE, 8);
+		so_data  (so, cso->stencil[1].enabled ? 1 : 0);
+		so_data  (so, cso->stencil[1].write_mask);
+		so_data  (so, nvgl_comparison_op(cso->stencil[1].func));
+		so_data  (so, cso->stencil[1].ref_value);
+		so_data  (so, cso->stencil[1].value_mask);
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].fail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].zfail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].zpass_op));
+	} else {
+		so_method(so, nv40->curie, NV40TCL_STENCIL_BACK_ENABLE, 1);
+		so_data  (so, 0);
+	}
 
-	return (void *)hw;
+	return (void *)so;
 }
 
 static void
 nv40_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
-	struct nv40_depth_stencil_alpha_state *hw = hwcso;
-
-	BEGIN_RING(curie, NV40TCL_DEPTH_FUNC, 3);
-	OUT_RINGp ((uint32_t *)&hw->depth, 3);
-	BEGIN_RING(curie, NV40TCL_STENCIL_FRONT_ENABLE, 16);
-	OUT_RINGp ((uint32_t *)&hw->stencil.front, 8);
-	OUT_RINGp ((uint32_t *)&hw->stencil.back, 8);
-	BEGIN_RING(curie, NV40TCL_ALPHA_TEST_ENABLE, 3);
-	OUT_RINGp ((uint32_t *)&hw->alpha.enabled, 3);
+
+	so_emit(nv40->nvws, hwcso);
 }
 
 static void
 nv40_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso)
 {
-	free(hwcso);
+	struct nouveau_stateobj *so = hwcso;
+
+	so_ref(NULL, &so);
 }
 
 static void *
@@ -502,12 +500,16 @@ nv40_set_blend_color(struct pipe_context *pipe,
 		     const struct pipe_blend_color *bcol)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nouveau_stateobj *so = so_new(2, 0);
+
+	so_method(so, nv40->curie, NV40TCL_BLEND_COLOR, 1);
+	so_data  (so, ((float_to_ubyte(bcol->color[3]) << 24) |
+		       (float_to_ubyte(bcol->color[0]) << 16) |
+		       (float_to_ubyte(bcol->color[1]) <<  8) |
+		       (float_to_ubyte(bcol->color[2]) <<  0)));
 
-	BEGIN_RING(curie, NV40TCL_BLEND_COLOR, 1);
-	OUT_RING  ((float_to_ubyte(bcol->color[3]) << 24) |
-		   (float_to_ubyte(bcol->color[0]) << 16) |
-		   (float_to_ubyte(bcol->color[1]) <<  8) |
-		   (float_to_ubyte(bcol->color[2]) <<  0));
+	so_emit(nv40->nvws, so);
+	so_ref(NULL, &so);
 }
 
 static void
@@ -540,6 +542,8 @@ nv40_set_framebuffer_state(struct pipe_context *pipe,
 	struct pipe_surface *rt[4], *zeta;
 	uint32_t rt_enable, rt_format, w, h;
 	int i, colour_format = 0, zeta_format = 0;
+	struct nouveau_stateobj *so = so_new(64, 10);
+	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
 
 	rt_enable = 0;
 	for (i = 0; i < 4; i++) {
@@ -603,66 +607,78 @@ nv40_set_framebuffer_state(struct pipe_context *pipe,
 	}
 
 	if (rt_enable & NV40TCL_RT_ENABLE_COLOR0) {
-		nv40->rt[0] = rt[0]->buffer;
-		BEGIN_RING(curie, NV40TCL_DMA_COLOR0, 1);
-		OUT_RELOCo(nv40->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(curie, NV40TCL_COLOR0_PITCH, 2);
-		OUT_RING  (rt[0]->pitch * rt[0]->cpp);
-		OUT_RELOCl(nv40->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		so_method(so, nv40->curie, NV40TCL_DMA_COLOR0, 1);
+		so_reloc (so, rt[0]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv40->nvws->channel->vram->handle,
+			  nv40->nvws->channel->gart->handle);
+		so_method(so, nv40->curie, NV40TCL_COLOR0_PITCH, 2);
+		so_data  (so, rt[0]->pitch * rt[0]->cpp);
+		so_reloc (so, rt[0]->buffer, rt[0]->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
 	}
 
 	if (rt_enable & NV40TCL_RT_ENABLE_COLOR1) {
-		nv40->rt[1] = rt[1]->buffer;
-		BEGIN_RING(curie, NV40TCL_DMA_COLOR1, 1);
-		OUT_RELOCo(nv40->rt[1], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(curie, NV40TCL_COLOR1_OFFSET, 2);
-		OUT_RELOCl(nv40->rt[1], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		OUT_RING  (rt[1]->pitch * rt[1]->cpp);
+		so_method(so, nv40->curie, NV40TCL_DMA_COLOR1, 1);
+		so_reloc (so, rt[1]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv40->nvws->channel->vram->handle,
+			  nv40->nvws->channel->gart->handle);
+		so_method(so, nv40->curie, NV40TCL_COLOR1_OFFSET, 2);
+		so_reloc (so, rt[1]->buffer, rt[1]->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+		so_data  (so, rt[1]->pitch * rt[1]->cpp);
 	}
 
 	if (rt_enable & NV40TCL_RT_ENABLE_COLOR2) {
-		nv40->rt[2] = rt[2]->buffer;
-		BEGIN_RING(curie, NV40TCL_DMA_COLOR2, 1);
-		OUT_RELOCo(nv40->rt[2], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(curie, NV40TCL_COLOR2_OFFSET, 1);
-		OUT_RELOCl(nv40->rt[2], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(curie, NV40TCL_COLOR2_PITCH, 1);
-		OUT_RING  (rt[2]->pitch * rt[2]->cpp);
+		so_method(so, nv40->curie, NV40TCL_DMA_COLOR2, 1);
+		so_reloc (so, rt[2]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv40->nvws->channel->vram->handle,
+			  nv40->nvws->channel->gart->handle);
+		so_method(so, nv40->curie, NV40TCL_COLOR2_OFFSET, 1);
+		so_reloc (so, rt[2]->buffer, rt[2]->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+		so_method(so, nv40->curie, NV40TCL_COLOR2_PITCH, 1);
+		so_data  (so, rt[2]->pitch * rt[2]->cpp);
 	}
 
 	if (rt_enable & NV40TCL_RT_ENABLE_COLOR3) {
-		nv40->rt[3] = rt[3]->buffer;
-		BEGIN_RING(curie, NV40TCL_DMA_COLOR3, 1);
-		OUT_RELOCo(nv40->rt[3], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(curie, NV40TCL_COLOR3_OFFSET, 1);
-		OUT_RELOCl(nv40->rt[3], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(curie, NV40TCL_COLOR3_PITCH, 1);
-		OUT_RING  (rt[3]->pitch * rt[3]->cpp);
+		so_method(so, nv40->curie, NV40TCL_DMA_COLOR3, 1);
+		so_reloc (so, rt[3]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv40->nvws->channel->vram->handle,
+			  nv40->nvws->channel->gart->handle);
+		so_method(so, nv40->curie, NV40TCL_COLOR3_OFFSET, 1);
+		so_reloc (so, rt[3]->buffer, rt[3]->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+		so_method(so, nv40->curie, NV40TCL_COLOR3_PITCH, 1);
+		so_data  (so, rt[3]->pitch * rt[3]->cpp);
 	}
 
 	if (zeta_format) {
-		nv40->zeta = zeta->buffer;
-		BEGIN_RING(curie, NV40TCL_DMA_ZETA, 1);
-		OUT_RELOCo(nv40->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(curie, NV40TCL_ZETA_OFFSET, 1);
-		OUT_RELOCl(nv40->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(curie, NV40TCL_ZETA_PITCH, 1);
-		OUT_RING  (zeta->pitch * zeta->cpp);
+		so_method(so, nv40->curie, NV40TCL_DMA_ZETA, 1);
+		so_reloc (so, zeta->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv40->nvws->channel->vram->handle,
+			  nv40->nvws->channel->gart->handle);
+		so_method(so, nv40->curie, NV40TCL_ZETA_OFFSET, 1);
+		so_reloc (so, zeta->buffer, zeta->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+		so_method(so, nv40->curie, NV40TCL_ZETA_PITCH, 1);
+		so_data  (so, zeta->pitch * zeta->cpp);
 	}
 
-	nv40->rt_enable = rt_enable;
-	BEGIN_RING(curie, NV40TCL_RT_ENABLE, 1);
-	OUT_RING  (rt_enable);
-	BEGIN_RING(curie, NV40TCL_RT_HORIZ, 3);
-	OUT_RING  ((w << 16) | 0);
-	OUT_RING  ((h << 16) | 0);
-	OUT_RING  (rt_format);
-	BEGIN_RING(curie, NV40TCL_VIEWPORT_HORIZ, 2);
-	OUT_RING  ((w << 16) | 0);
-	OUT_RING  ((h << 16) | 0);
-	BEGIN_RING(curie, NV40TCL_VIEWPORT_CLIP_HORIZ(0), 2);
-	OUT_RING  (((w - 1) << 16) | 0);
-	OUT_RING  (((h - 1) << 16) | 0);
+	so_method(so, nv40->curie, NV40TCL_RT_ENABLE, 1);
+	so_data  (so, rt_enable);
+	so_method(so, nv40->curie, NV40TCL_RT_HORIZ, 3);
+	so_data  (so, (w << 16) | 0);
+	so_data  (so, (h << 16) | 0);
+	so_data  (so, rt_format);
+	so_method(so, nv40->curie, NV40TCL_VIEWPORT_HORIZ, 2);
+	so_data  (so, (w << 16) | 0);
+	so_data  (so, (h << 16) | 0);
+	so_method(so, nv40->curie, NV40TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	so_data  (so, ((w - 1) << 16) | 0);
+	so_data  (so, ((h - 1) << 16) | 0);
+
+	so_emit(nv40->nvws, so);
+	so_ref (so, &nv40->so_framebuffer);
 }
 
 static void
@@ -670,9 +686,15 @@ nv40_set_polygon_stipple(struct pipe_context *pipe,
 			 const struct pipe_poly_stipple *stipple)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nouveau_stateobj *so = so_new(33, 0);
+	unsigned i;
 
-	BEGIN_RING(curie, NV40TCL_POLYGON_STIPPLE_PATTERN(0), 32);
-	OUT_RINGp ((uint32_t *)stipple->stipple, 32);
+	so_method(so, nv40->curie, NV40TCL_POLYGON_STIPPLE_PATTERN(0), 32);
+	for (i = 0; i < 32; i++)
+		so_data(so, stipple->stipple[i]);
+
+	so_emit(nv40->nvws, so);
+	so_ref(NULL, &so);
 }
 
 static void
@@ -680,10 +702,14 @@ nv40_set_scissor_state(struct pipe_context *pipe,
 		       const struct pipe_scissor_state *s)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nouveau_stateobj *so = so_new(3, 0);
+
+	so_method(so, nv40->curie, NV40TCL_SCISSOR_HORIZ, 2);
+	so_data  (so, ((s->maxx - s->minx) << 16) | s->minx);
+	so_data  (so, ((s->maxy - s->miny) << 16) | s->miny);
 
-	BEGIN_RING(curie, NV40TCL_SCISSOR_HORIZ, 2);
-	OUT_RING  (((s->maxx - s->minx) << 16) | s->minx);
-	OUT_RING  (((s->maxy - s->miny) << 16) | s->miny);
+	so_emit(nv40->nvws, so);
+	so_ref(NULL, &so);
 }
 
 static void
@@ -691,16 +717,20 @@ nv40_set_viewport_state(struct pipe_context *pipe,
 			const struct pipe_viewport_state *vpt)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
-
-	BEGIN_RING(curie, NV40TCL_VIEWPORT_TRANSLATE_X, 8);
-	OUT_RINGf (vpt->translate[0]);
-	OUT_RINGf (vpt->translate[1]);
-	OUT_RINGf (vpt->translate[2]);
-	OUT_RINGf (vpt->translate[3]);
-	OUT_RINGf (vpt->scale[0]);
-	OUT_RINGf (vpt->scale[1]);
-	OUT_RINGf (vpt->scale[2]);
-	OUT_RINGf (vpt->scale[3]);
+	struct nouveau_stateobj *so = so_new(9, 0);
+
+	so_method(so, nv40->curie, NV40TCL_VIEWPORT_TRANSLATE_X, 8);
+	so_data  (so, fui(vpt->translate[0]));
+	so_data  (so, fui(vpt->translate[1]));
+	so_data  (so, fui(vpt->translate[2]));
+	so_data  (so, fui(vpt->translate[3]));
+	so_data  (so, fui(vpt->scale[0]));
+	so_data  (so, fui(vpt->scale[1]));
+	so_data  (so, fui(vpt->scale[2]));
+	so_data  (so, fui(vpt->scale[3]));
+
+	so_emit(nv40->nvws, so);
+	so_ref(NULL, &so);
 }
 
 static void
diff --git a/src/mesa/pipe/nv40/nv40_state.h b/src/mesa/pipe/nv40/nv40_state.h
index 24eea36db3..e82ab9de98 100644
--- a/src/mesa/pipe/nv40/nv40_state.h
+++ b/src/mesa/pipe/nv40/nv40_state.h
@@ -3,20 +3,6 @@
 
 #include "pipe/p_state.h"
 
-struct nv40_blend_state {
-	uint32_t b_enable;
-	uint32_t b_srcfunc;
-	uint32_t b_dstfunc;
-	uint32_t b_eqn;
-
-	uint32_t l_enable;
-	uint32_t l_op;
-
-	uint32_t c_mask;
-
-	uint32_t d_enable;
-};
-
 struct nv40_sampler_state {
 	uint32_t fmt;
 	uint32_t wrap;
@@ -25,29 +11,6 @@ struct nv40_sampler_state {
 	uint32_t bcol;
 };
 
-struct nv40_rasterizer_state {
-	uint32_t shade_model;
-
-	uint32_t line_width;
-	uint32_t line_smooth_en;
-	uint32_t line_stipple_en;
-	uint32_t line_stipple;
-
-	uint32_t point_size;
-
-	uint32_t poly_smooth_en;
-	uint32_t poly_stipple_en;
-	
-	uint32_t poly_mode_front;
-	uint32_t poly_mode_back;
-
-	uint32_t front_face;
-	uint32_t cull_face;
-	uint32_t cull_face_en;
-
-	uint32_t point_sprite;
-};
-
 struct nv40_vertex_program_exec {
 	uint32_t data[4];
 	boolean has_branch_offset;
@@ -99,36 +62,7 @@ struct nv40_fragment_program {
 	struct pipe_buffer *buffer;
 
 	uint32_t fp_control;
-};
-
-struct nv40_stencil_push {
-	uint32_t enable;
-	uint32_t wmask;
-	uint32_t func;
-	uint32_t ref;
-	uint32_t vmask;
-	uint32_t fail;
-	uint32_t zfail;
-	uint32_t zpass;
-};
-
-struct nv40_depth_stencil_alpha_state {
-	struct {
-		uint32_t func;
-		uint32_t write_enable;
-		uint32_t test_enable;
-	} depth;
-
-	struct {
-		struct nv40_stencil_push back;
-		struct nv40_stencil_push front;
-	} stencil;
-
-	struct {
-		uint32_t enabled;
-		uint32_t func;
-		uint32_t ref;
-	} alpha;
+	struct nouveau_stateobj *so;
 };
 
 struct nv40_miptree {
diff --git a/src/mesa/pipe/nv40/nv40_state_emit.c b/src/mesa/pipe/nv40/nv40_state_emit.c
index 66b98d5fab..3a22cd4bd5 100644
--- a/src/mesa/pipe/nv40/nv40_state_emit.c
+++ b/src/mesa/pipe/nv40/nv40_state_emit.c
@@ -11,77 +11,15 @@
 static void
 nv40_state_emit_dummy_relocs(struct nv40_context *nv40)
 {
-	unsigned rt_flags, tx_flags, fp_flags;
-	int i;	
+	unsigned i;	
 	
-	rt_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR | NOUVEAU_BO_DUMMY;
-	tx_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
-		   NOUVEAU_BO_DUMMY;
-	fp_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
-		   NOUVEAU_BO_DUMMY;
-
-	/* Render targets */
-	if (nv40->rt_enable & NV40TCL_RT_ENABLE_COLOR0) {
-		OUT_RELOCm(nv40->rt[0], rt_flags,
-			   curie, NV40TCL_DMA_COLOR0, 1);
-		OUT_RELOCo(nv40->rt[0], rt_flags);
-		OUT_RELOCm(nv40->rt[0], rt_flags,
-			   curie, NV40TCL_COLOR0_OFFSET, 1);
-		OUT_RELOCl(nv40->rt[0], 0, rt_flags);
-	}
-
-	if (nv40->rt_enable & NV40TCL_RT_ENABLE_COLOR1) {
-		OUT_RELOCm(nv40->rt[1], rt_flags,
-			   curie, NV40TCL_DMA_COLOR1, 1);
-		OUT_RELOCo(nv40->rt[1], rt_flags);
-		OUT_RELOCm(nv40->rt[1], rt_flags,
-			   curie, NV40TCL_COLOR1_OFFSET, 1);
-		OUT_RELOCl(nv40->rt[1], 0, rt_flags);
-	}
-
-	if (nv40->rt_enable & NV40TCL_RT_ENABLE_COLOR2) {
-		OUT_RELOCm(nv40->rt[2], rt_flags,
-			   curie, NV40TCL_DMA_COLOR2, 1);
-		OUT_RELOCo(nv40->rt[2], rt_flags);
-		OUT_RELOCm(nv40->rt[2], rt_flags,
-			   curie, NV40TCL_COLOR2_OFFSET, 1);
-		OUT_RELOCl(nv40->rt[2], 0, rt_flags);
-	}
-
-	if (nv40->rt_enable & NV40TCL_RT_ENABLE_COLOR3) {
-		OUT_RELOCm(nv40->rt[3], rt_flags,
-			   curie, NV40TCL_DMA_COLOR3, 1);
-		OUT_RELOCo(nv40->rt[3], rt_flags);
-		OUT_RELOCm(nv40->rt[3], rt_flags,
-			   curie, NV40TCL_COLOR3_OFFSET, 1);
-		OUT_RELOCl(nv40->rt[3], 0, rt_flags);
-	}
-
-	if (nv40->zeta) {
-		OUT_RELOCm(nv40->zeta, rt_flags, curie, NV40TCL_DMA_ZETA, 1);
-		OUT_RELOCo(nv40->zeta, rt_flags);
-		OUT_RELOCm(nv40->zeta, rt_flags, curie, NV40TCL_ZETA_OFFSET, 1);
-		OUT_RELOCl(nv40->zeta, 0, rt_flags);
-	}
-
-	/* Texture images */
+	so_emit_reloc_markers(nv40->nvws, nv40->so_framebuffer);
 	for (i = 0; i < 16; i++) {
 		if (!(nv40->fp_samplers & (1 << i)))
 			continue;
-		OUT_RELOCm(nv40->tex[i].buffer, tx_flags,
-			   curie, NV40TCL_TEX_OFFSET(i), 2);
-		OUT_RELOCl(nv40->tex[i].buffer, 0, tx_flags);
-		OUT_RELOCd(nv40->tex[i].buffer, nv40->tex[i].format,
-			   tx_flags | NOUVEAU_BO_OR, NV40TCL_TEX_FORMAT_DMA0,
-			   NV40TCL_TEX_FORMAT_DMA1);
+		so_emit_reloc_markers(nv40->nvws, nv40->so_fragtex[i]);
 	}
-
-	/* Fragment program */
-	OUT_RELOCm(nv40->fragprog.active->buffer, fp_flags,
-		   curie, NV40TCL_FP_ADDRESS, 1);
-	OUT_RELOC (nv40->fragprog.active->buffer, 0,
-		   fp_flags | NOUVEAU_BO_OR | NOUVEAU_BO_LOW,
-		   NV40TCL_FP_ADDRESS_DMA0, NV40TCL_FP_ADDRESS_DMA1);
+	so_emit_reloc_markers(nv40->nvws, nv40->fragprog.active->so);
 }
 
 void
diff --git a/src/mesa/pipe/nv40/nv40_vbo.c b/src/mesa/pipe/nv40/nv40_vbo.c
index 0bb54c7610..e2cb3fda8f 100644
--- a/src/mesa/pipe/nv40/nv40_vbo.c
+++ b/src/mesa/pipe/nv40/nv40_vbo.c
@@ -97,13 +97,13 @@ nv40_vbo_static_attrib(struct nv40_context *nv40, int attrib,
 }
 
 static void
-nv40_vbo_arrays_update(struct nv40_context *nv40)
+nv40_vbo_arrays_update(struct nv40_context *nv40, struct pipe_buffer *ib,
+		       unsigned ib_format)
 {
 	struct nv40_vertex_program *vp = nv40->vertprog.active;
-	uint32_t inputs, vtxfmt[16];
-	int hw, num_hw;
-
-	nv40->vb_enable = 0;
+	struct nouveau_stateobj *vtxbuf, *vtxfmt;
+	unsigned inputs, hw, num_hw;
+	unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
 
 	inputs = vp->ir;
 	for (hw = 0; hw < 16 && inputs; hw++) {
@@ -114,73 +114,64 @@ nv40_vbo_arrays_update(struct nv40_context *nv40)
 	}
 	num_hw++;
 
+	vtxbuf = so_new(20, 18);
+	so_method(vtxbuf, nv40->curie, NV40TCL_VTXBUF_ADDRESS(0), num_hw);
+	vtxfmt = so_new(17, 0);
+	so_method(vtxfmt, nv40->curie, NV40TCL_VTXFMT(0), num_hw);
+
 	inputs = vp->ir;
 	for (hw = 0; hw < num_hw; hw++) {
 		struct pipe_vertex_element *ve;
 		struct pipe_vertex_buffer *vb;
 
 		if (!(inputs & (1 << hw))) {
-			vtxfmt[hw] = NV40TCL_VTXFMT_TYPE_FLOAT;
+			so_data(vtxbuf, 0);
+			so_data(vtxfmt, NV40TCL_VTXFMT_TYPE_FLOAT);
 			continue;
 		}
 
 		ve = &nv40->vtxelt[hw];
 		vb = &nv40->vtxbuf[ve->vertex_buffer_index];
 
-		if (vb->pitch == 0) {
-			vtxfmt[hw] = NV40TCL_VTXFMT_TYPE_FLOAT;
-			if (nv40_vbo_static_attrib(nv40, hw, ve, vb) == TRUE)
-				continue;
+		if (!vb->pitch && nv40_vbo_static_attrib(nv40, hw, ve, vb)) {
+			so_data(vtxbuf, 0);
+			so_data(vtxfmt, NV40TCL_VTXFMT_TYPE_FLOAT);
+			continue;
 		}
 
-		nv40->vb_enable |= (1 << hw);
-		nv40->vb[hw].delta = vb->buffer_offset + ve->src_offset;
-		nv40->vb[hw].buffer = vb->buffer;
+		so_reloc(vtxbuf, vb->buffer, vb->buffer_offset + ve->src_offset,
+			 vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+			 0, NV40TCL_VTXBUF_ADDRESS_DMA1);
+		so_data (vtxfmt, ((vb->pitch << NV40TCL_VTXFMT_STRIDE_SHIFT) |
+				  (nv40_vbo_ncomp(ve->src_format) <<
+				   NV40TCL_VTXFMT_SIZE_SHIFT) |
+				  nv40_vbo_type(ve->src_format)));
+	}
 
-		vtxfmt[hw] = ((vb->pitch << NV40TCL_VTXFMT_STRIDE_SHIFT) |
-			      (nv40_vbo_ncomp(ve->src_format) <<
-			       NV40TCL_VTXFMT_SIZE_SHIFT) |
-			      nv40_vbo_type(ve->src_format));
+	if (ib) {
+		so_method(vtxbuf, nv40->curie, NV40TCL_IDXBUF_ADDRESS, 2);
+		so_reloc (vtxbuf, ib, 0, vb_flags | NOUVEAU_BO_LOW, 0, 0);
+		so_reloc (vtxbuf, ib, ib_format, vb_flags | NOUVEAU_BO_OR,
+			  0, NV40TCL_IDXBUF_FORMAT_DMA1);
 	}
 
-	BEGIN_RING(curie, NV40TCL_VTXFMT(0), num_hw);
-	OUT_RINGp (vtxfmt, num_hw);
+	so_emit(nv40->nvws, vtxfmt);
+	so_emit(nv40->nvws, vtxbuf);
+	so_ref (vtxbuf, &nv40->so_vtxbuf);
+	so_ref (NULL, &vtxfmt);
 }
 
 static boolean
 nv40_vbo_validate_state(struct nv40_context *nv40,
 			struct pipe_buffer *ib, unsigned ib_format)
 {
-	unsigned inputs;
-
 	nv40_emit_hw_state(nv40);
-
-	if (nv40->dirty & NV40_NEW_ARRAYS) {
-		nv40_vbo_arrays_update(nv40);
+	if (nv40->dirty & NV40_NEW_ARRAYS || ib) {
+		nv40_vbo_arrays_update(nv40, ib, ib_format);
 		nv40->dirty &= ~NV40_NEW_ARRAYS;
 	}
 
-	inputs = nv40->vb_enable;
-	while (inputs) {
-		unsigned a = ffs(inputs) - 1;
-
-		inputs &= ~(1 << a);
-
-		BEGIN_RING(curie, NV40TCL_VTXBUF_ADDRESS(a), 1);
-		OUT_RELOC (nv40->vb[a].buffer, nv40->vb[a].delta,
-			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_LOW |
-			   NOUVEAU_BO_OR | NOUVEAU_BO_RD, 0,
-			   NV40TCL_VTXBUF_ADDRESS_DMA1);
-	}
-
-	if (ib) {
-		BEGIN_RING(curie, NV40TCL_IDXBUF_ADDRESS, 2);
-		OUT_RELOCl(ib, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
-			   NOUVEAU_BO_RD);
-		OUT_RELOCd(ib, ib_format, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
-			   NOUVEAU_BO_RD | NOUVEAU_BO_OR,
-			   0, NV40TCL_IDXBUF_FORMAT_DMA1);
-	}
+	so_emit_reloc_markers(nv40->nvws, nv40->so_vtxbuf);
 
 	BEGIN_RING(curie, 0x1710, 1);
 	OUT_RING  (0); /* vtx cache flush */
-- 
cgit v1.2.3


From d8642b830911825c30531a2e422fcd8d2d487f74 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 1 Feb 2008 14:58:38 -0800
Subject: Vectorize all micro ops

Fold single instruction micro ops inline.  Remove unused micro ops.
---
 src/mesa/pipe/cell/spu/spu_exec.c | 912 ++++++++++----------------------------
 src/mesa/pipe/cell/spu/spu_exec.h |   1 +
 2 files changed, 230 insertions(+), 683 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 168bada3bb..1ac9c031e3 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -52,8 +52,15 @@
 
 #include <libmisc.h>
 #include <spu_mfcio.h>
-#include <simdmath/sqrtf4.h>
+#include <simdmath/ceilf4.h>
+#include <simdmath/cosf4.h>
+#include <simdmath/divf4.h>
+#include <simdmath/floorf4.h>
+#include <simdmath/log2f4.h>
 #include <simdmath/powf4.h>
+#include <simdmath/sinf4.h>
+#include <simdmath/sqrtf4.h>
+#include <simdmath/truncf4.h>
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
@@ -157,643 +164,175 @@ spu_exec_machine_init(struct spu_exec_machine *mach,
 }
 
 
-static void
-micro_abs(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->f[0] = (float) fabs( (double) src->f[0] );
-   dst->f[1] = (float) fabs( (double) src->f[1] );
-   dst->f[2] = (float) fabs( (double) src->f[2] );
-   dst->f[3] = (float) fabs( (double) src->f[3] );
-}
-
-static void
-micro_add(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] + src1->f[0];
-   dst->f[1] = src0->f[1] + src1->f[1];
-   dst->f[2] = src0->f[2] + src1->f[2];
-   dst->f[3] = src0->f[3] + src1->f[3];
-}
-
-static void
-micro_iadd(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] + src1->i[0];
-   dst->i[1] = src0->i[1] + src1->i[1];
-   dst->i[2] = src0->i[2] + src1->i[2];
-   dst->i[3] = src0->i[3] + src1->i[3];
-}
-
-static void
-micro_and(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] & src1->u[0];
-   dst->u[1] = src0->u[1] & src1->u[1];
-   dst->u[2] = src0->u[2] & src1->u[2];
-   dst->u[3] = src0->u[3] & src1->u[3];
-}
-
-static void
-micro_ceil(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) ceil( (double) src->f[0] );
-   dst->f[1] = (float) ceil( (double) src->f[1] );
-   dst->f[2] = (float) ceil( (double) src->f[2] );
-   dst->f[3] = (float) ceil( (double) src->f[3] );
-#endif
-}
-
-static void
-micro_cos(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) cos( (double) src->f[0] );
-   dst->f[1] = (float) cos( (double) src->f[1] );
-   dst->f[2] = (float) cos( (double) src->f[2] );
-   dst->f[3] = (float) cos( (double) src->f[3] );
-#endif
-}
-
-static void
-micro_ddx(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->f[0] =
-   dst->f[1] =
-   dst->f[2] =
-   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
-}
-
-static void
-micro_ddy(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->f[0] =
-   dst->f[1] =
-   dst->f[2] =
-   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
-}
-
-static void
-micro_div(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] / src1->f[0];
-   dst->f[1] = src0->f[1] / src1->f[1];
-   dst->f[2] = src0->f[2] / src1->f[2];
-   dst->f[3] = src0->f[3] / src1->f[3];
-}
-
-static void
-micro_udiv(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] / src1->u[0];
-   dst->u[1] = src0->u[1] / src1->u[1];
-   dst->u[2] = src0->u[2] / src1->u[2];
-   dst->u[3] = src0->u[3] / src1->u[3];
-}
-
-static void
-micro_eq(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
-{
-   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
-}
-
-static void
-micro_ieq(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
-{
-   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
-   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
-   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
-   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
-}
-
-static void
-micro_exp2(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src)
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) pow( 2.0, (double) src->f[0] );
-   dst->f[1] = (float) pow( 2.0, (double) src->f[1] );
-   dst->f[2] = (float) pow( 2.0, (double) src->f[2] );
-   dst->f[3] = (float) pow( 2.0, (double) src->f[3] );
-#endif
-}
-
-static void
-micro_f2it(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->i[0] = (int) src->f[0];
-   dst->i[1] = (int) src->f[1];
-   dst->i[2] = (int) src->f[2];
-   dst->i[3] = (int) src->f[3];
-}
-
-static void
-micro_f2ut(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->u[0] = (uint) src->f[0];
-   dst->u[1] = (uint) src->f[1];
-   dst->u[2] = (uint) src->f[2];
-   dst->u[3] = (uint) src->f[3];
-}
-
-static void
-micro_flr(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) floor( (double) src->f[0] );
-   dst->f[1] = (float) floor( (double) src->f[1] );
-   dst->f[2] = (float) floor( (double) src->f[2] );
-   dst->f[3] = (float) floor( (double) src->f[3] );
-#endif
-}
-
-static void
-micro_frc(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] );
-   dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] );
-   dst->f[2] = src->f[2] - (float) floor( (double) src->f[2] );
-   dst->f[3] = src->f[3] - (float) floor( (double) src->f[3] );
-#endif
-}
-
-static void
-micro_ge(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
-{
-   dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] >= src1->f[3] ? src2->f[3] : src3->f[3];
-}
-
-static void
-micro_i2f(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->f[0] = (float) src->i[0];
-   dst->f[1] = (float) src->i[1];
-   dst->f[2] = (float) src->i[2];
-   dst->f[3] = (float) src->i[3];
-}
-
-static void
-micro_lg2(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f;
-   dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f;
-   dst->f[2] = (float) log( (double) src->f[2] ) * 1.442695f;
-   dst->f[3] = (float) log( (double) src->f[3] ) * 1.442695f;
-#endif
-}
-
-static void
-micro_lt(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
+static INLINE qword
+micro_abs(qword src)
 {
-   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
+   return si_rotmi(si_shli(src, 1), -1);
 }
 
-static void
-micro_ilt(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
+static INLINE qword
+micro_ceil(qword src)
 {
-   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
-   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
-   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
-   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
+   return (qword) _ceilf4((vec_float4) src);
 }
 
-static void
-micro_ult(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
+static INLINE qword
+micro_cos(qword src)
 {
-   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
-   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
-   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
-   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
+   return (qword) _cosf4((vec_float4) src);
 }
 
-static void
-micro_max(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static const qword br_shuf = {   /* shufb pattern: splat float TILE_BOTTOM_RIGHT */
+   (TILE_BOTTOM_RIGHT * 4) + 0, (TILE_BOTTOM_RIGHT * 4) + 1,
+   (TILE_BOTTOM_RIGHT * 4) + 2, (TILE_BOTTOM_RIGHT * 4) + 3,
+   (TILE_BOTTOM_RIGHT * 4) + 0, (TILE_BOTTOM_RIGHT * 4) + 1,
+   (TILE_BOTTOM_RIGHT * 4) + 2, (TILE_BOTTOM_RIGHT * 4) + 3,
+   (TILE_BOTTOM_RIGHT * 4) + 0, (TILE_BOTTOM_RIGHT * 4) + 1,
+   (TILE_BOTTOM_RIGHT * 4) + 2, (TILE_BOTTOM_RIGHT * 4) + 3,
+   (TILE_BOTTOM_RIGHT * 4) + 0, (TILE_BOTTOM_RIGHT * 4) + 1,
+   (TILE_BOTTOM_RIGHT * 4) + 2, (TILE_BOTTOM_RIGHT * 4) + 3,   /* byte = 4 * float index + k */
+};
+
+static const qword bl_shuf = {   /* shufb pattern: splat float TILE_BOTTOM_LEFT */
+   (TILE_BOTTOM_LEFT * 4) + 0, (TILE_BOTTOM_LEFT * 4) + 1,
+   (TILE_BOTTOM_LEFT * 4) + 2, (TILE_BOTTOM_LEFT * 4) + 3,
+   (TILE_BOTTOM_LEFT * 4) + 0, (TILE_BOTTOM_LEFT * 4) + 1,
+   (TILE_BOTTOM_LEFT * 4) + 2, (TILE_BOTTOM_LEFT * 4) + 3,
+   (TILE_BOTTOM_LEFT * 4) + 0, (TILE_BOTTOM_LEFT * 4) + 1,
+   (TILE_BOTTOM_LEFT * 4) + 2, (TILE_BOTTOM_LEFT * 4) + 3,
+   (TILE_BOTTOM_LEFT * 4) + 0, (TILE_BOTTOM_LEFT * 4) + 1,
+   (TILE_BOTTOM_LEFT * 4) + 2, (TILE_BOTTOM_LEFT * 4) + 3,   /* byte = 4 * float index + k */
+};
+
+static const qword tl_shuf = {   /* shufb pattern: splat float TILE_TOP_LEFT */
+   (TILE_TOP_LEFT * 4) + 0, (TILE_TOP_LEFT * 4) + 1,
+   (TILE_TOP_LEFT * 4) + 2, (TILE_TOP_LEFT * 4) + 3,
+   (TILE_TOP_LEFT * 4) + 0, (TILE_TOP_LEFT * 4) + 1,
+   (TILE_TOP_LEFT * 4) + 2, (TILE_TOP_LEFT * 4) + 3,
+   (TILE_TOP_LEFT * 4) + 0, (TILE_TOP_LEFT * 4) + 1,
+   (TILE_TOP_LEFT * 4) + 2, (TILE_TOP_LEFT * 4) + 3,
+   (TILE_TOP_LEFT * 4) + 0, (TILE_TOP_LEFT * 4) + 1,
+   (TILE_TOP_LEFT * 4) + 2, (TILE_TOP_LEFT * 4) + 3,   /* byte = 4 * float index + k */
+};
+
+static qword
+micro_ddx(qword src)
 {
-   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
-   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
-   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
-   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
-}
+   qword bottom_right = si_shufb(src, src, br_shuf);
+   qword bottom_left = si_shufb(src, src, bl_shuf);
 
-static void
-micro_imax(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
-   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
-   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
-   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
+   return si_fs(bottom_right, bottom_left);
 }
 
-static void
-micro_umax(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_ddy(qword src)
 {
-   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
-   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
-   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
-   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
-}
+   qword top_left = si_shufb(src, src, tl_shuf);
+   qword bottom_left = si_shufb(src, src, bl_shuf);
 
-static void
-micro_min(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
-   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
-   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
-   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
+   return si_fs(top_left, bottom_left);
 }
 
-static void
-micro_imin(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static INLINE qword
+micro_div(qword src0, qword src1)
 {
-   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
-   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
-   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
-   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
+   return (qword) _divf4((vec_float4) src0, (vec_float4) src1);
 }
 
-static void
-micro_umin(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_flr(qword src)
 {
-   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
-   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
-   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
-   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
+   return (qword) _floorf4((vec_float4) src);
 }
 
-static void
-micro_umod(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_frc(qword src)
 {
-   dst->u[0] = src0->u[0] % src1->u[0];
-   dst->u[1] = src0->u[1] % src1->u[1];
-   dst->u[2] = src0->u[2] % src1->u[2];
-   dst->u[3] = src0->u[3] % src1->u[3];
+   return si_fs(src, (qword) _floorf4((vec_float4) src));
 }
 
-static void
-micro_mul(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] * src1->f[0];
-   dst->f[1] = src0->f[1] * src1->f[1];
-   dst->f[2] = src0->f[2] * src1->f[2];
-   dst->f[3] = src0->f[3] * src1->f[3];
-}
-
-static void
-micro_imul(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] * src1->i[0];
-   dst->i[1] = src0->i[1] * src1->i[1];
-   dst->i[2] = src0->i[2] * src1->i[2];
-   dst->i[3] = src0->i[3] * src1->i[3];
-}
-
-static void
-micro_imul64(
-   union spu_exec_channel *dst0,
-   union spu_exec_channel *dst1,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst1->i[0] = src0->i[0] * src1->i[0];
-   dst1->i[1] = src0->i[1] * src1->i[1];
-   dst1->i[2] = src0->i[2] * src1->i[2];
-   dst1->i[3] = src0->i[3] * src1->i[3];
-   dst0->i[0] = 0;
-   dst0->i[1] = 0;
-   dst0->i[2] = 0;
-   dst0->i[3] = 0;
-}
-
-static void
-micro_umul64(
-   union spu_exec_channel *dst0,
-   union spu_exec_channel *dst1,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst1->u[0] = src0->u[0] * src1->u[0];
-   dst1->u[1] = src0->u[1] * src1->u[1];
-   dst1->u[2] = src0->u[2] * src1->u[2];
-   dst1->u[3] = src0->u[3] * src1->u[3];
-   dst0->u[0] = 0;
-   dst0->u[1] = 0;
-   dst0->u[2] = 0;
-   dst0->u[3] = 0;
-}
-
-static void
-micro_movc(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2 )
-{
-   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
-   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
-   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
-   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
-}
-
-static void
-micro_neg(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
+static INLINE qword
+micro_ge(qword src0, qword src1)
 {
-   dst->f[0] = -src->f[0];
-   dst->f[1] = -src->f[1];
-   dst->f[2] = -src->f[2];
-   dst->f[3] = -src->f[3];
+   return si_or(si_fceq(src0, src1), si_fcgt(src0, src1));
 }
 
-static void
-micro_ineg(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
+static qword
+micro_lg2(qword src)
 {
-   dst->i[0] = -src->i[0];
-   dst->i[1] = -src->i[1];
-   dst->i[2] = -src->i[2];
-   dst->i[3] = -src->i[3];
+   return (qword) _log2f4((vec_float4) src);
 }
 
-static void
-micro_not(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
+static INLINE qword
+micro_lt(qword src0, qword src1)
 {
-   dst->u[0] = ~src->u[0];
-   dst->u[1] = ~src->u[1];
-   dst->u[2] = ~src->u[2];
-   dst->u[3] = ~src->u[3];
-}
+   /* Per-word mask: all ones where src0 < src1 (i.e. src1 > src0), else 0. */
 
-static void
-micro_or(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] | src1->u[0];
-   dst->u[1] = src0->u[1] | src1->u[1];
-   dst->u[2] = src0->u[2] | src1->u[2];
-   dst->u[3] = src0->u[3] | src1->u[3];
+   return si_fcgt(src1, src0);
 }
 
-static void
-micro_pow(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static INLINE qword
+micro_max(qword src0, qword src1)
 {
-   vec_float4 s0 = (vec_float4) {
-      src0->f[0], src0->f[1], src0->f[2], src0->f[3]
-   };
-   vec_float4 s1 = (vec_float4) {
-      src1->f[0], src1->f[1], src1->f[2], src1->f[3]
-   };
-   vec_float4 d = _powf4(s0, s1);
-
-   dst->f[0] = spu_extract(d, 0);
-   dst->f[1] = spu_extract(d, 1);
-   dst->f[2] = spu_extract(d, 2);
-   dst->f[3] = spu_extract(d, 3);
+   return si_selb(src1, src0, si_fcgt(src0, src1));
 }
 
-static void
-micro_rnd(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
+static INLINE qword
+micro_min(qword src0, qword src1)
 {
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) );
-   dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) );
-   dst->f[2] = (float) floor( (double) (src->f[2] + 0.5f) );
-   dst->f[3] = (float) floor( (double) (src->f[3] + 0.5f) );
-#endif
+   return si_selb(src0, src1, si_fcgt(src0, src1));
 }
 
-static void
-micro_shl(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_neg(qword src)
 {
-   dst->i[0] = src0->i[0] << src1->i[0];
-   dst->i[1] = src0->i[1] << src1->i[1];
-   dst->i[2] = src0->i[2] << src1->i[2];
-   dst->i[3] = src0->i[3] << src1->i[3];
+   return si_xor(src, (qword) spu_splats(0x80000000));
 }
 
-static void
-micro_ishr(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_set_sign(qword src)
 {
-   dst->i[0] = src0->i[0] >> src1->i[0];
-   dst->i[1] = src0->i[1] >> src1->i[1];
-   dst->i[2] = src0->i[2] >> src1->i[2];
-   dst->i[3] = src0->i[3] >> src1->i[3];
+   return si_or(src, (qword) spu_splats(0x80000000));
 }
 
-static void
-micro_trunc(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0 )
+static qword
+micro_pow(qword src0, qword src1)
 {
-   dst->f[0] = (float) (int) src0->f[0];
-   dst->f[1] = (float) (int) src0->f[1];
-   dst->f[2] = (float) (int) src0->f[2];
-   dst->f[3] = (float) (int) src0->f[3];
+   return (qword) _powf4((vec_float4) src0, (vec_float4) src1);
 }
 
-static void
-micro_ushr(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_rnd(qword src)
 {
-   dst->u[0] = src0->u[0] >> src1->u[0];
-   dst->u[1] = src0->u[1] >> src1->u[1];
-   dst->u[2] = src0->u[2] >> src1->u[2];
-   dst->u[3] = src0->u[3] >> src1->u[3];
-}
+   const qword half = (qword) spu_splats(0.5f);
 
-static void
-micro_sin(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) sin( (double) src->f[0] );
-   dst->f[1] = (float) sin( (double) src->f[1] );
-   dst->f[2] = (float) sin( (double) src->f[2] );
-   dst->f[3] = (float) sin( (double) src->f[3] );
-#endif
+   /* May be able to use _roundf4.  There may be some difference, though.
+    */
+   return (qword) _floorf4((vec_float4) si_fa(src, half));
 }
 
-static void
-micro_sqrt( union spu_exec_channel *dst,
-            const union spu_exec_channel *src )
+static INLINE qword
+micro_ishr(qword src0, qword src1)
 {
-   vec_float4 s = (vec_float4) {
-      src->f[0], src->f[1], src->f[2], src->f[3]
-   };
-   vec_float4 d = _sqrtf4(s);
-
-   dst->f[0] = spu_extract(d, 0);
-   dst->f[1] = spu_extract(d, 1);
-   dst->f[2] = spu_extract(d, 2);
-   dst->f[3] = spu_extract(d, 3);
+   return si_rotma(src0, si_sfi(src1, 0));
 }
 
-static void
-micro_sub(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_trunc(qword src)
 {
-   dst->f[0] = src0->f[0] - src1->f[0];
-   dst->f[1] = src0->f[1] - src1->f[1];
-   dst->f[2] = src0->f[2] - src1->f[2];
-   dst->f[3] = src0->f[3] - src1->f[3];
+   return (qword) _truncf4((vec_float4) src);
 }
 
-static void
-micro_u2f(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
+static qword
+micro_sin(qword src)
 {
-   dst->f[0] = (float) src->u[0];
-   dst->f[1] = (float) src->u[1];
-   dst->f[2] = (float) src->u[2];
-   dst->f[3] = (float) src->u[3];
+   return (qword) _sinf4((vec_float4) src);
 }
 
-static void
-micro_xor(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static INLINE qword
+micro_sqrt(qword src)
 {
-   dst->u[0] = src0->u[0] ^ src1->u[0];
-   dst->u[1] = src0->u[1] ^ src1->u[1];
-   dst->u[2] = src0->u[2] ^ src1->u[2];
-   dst->u[3] = src0->u[3] ^ src1->u[3];
+   return (qword) _sqrtf4((vec_float4) src);
 }
 
 static void
@@ -983,16 +522,15 @@ fetch_source(
 
    switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
    case TGSI_UTIL_SIGN_CLEAR:
-      micro_abs( chan, chan );
+      chan->q = micro_abs(chan->q);
       break;
 
    case TGSI_UTIL_SIGN_SET:
-      micro_abs( chan, chan );
-      micro_neg( chan, chan );
+      chan->q = micro_set_sign(chan->q);
       break;
 
    case TGSI_UTIL_SIGN_TOGGLE:
-      micro_neg( chan, chan );
+      chan->q = micro_neg(chan->q);
       break;
 
    case TGSI_UTIL_SIGN_KEEP:
@@ -1000,7 +538,7 @@ fetch_source(
    }
 
    if (reg->SrcRegisterExtMod.Complement) {
-      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
+      chan->q = si_fs(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, chan->q);
    }
 }
 
@@ -1051,8 +589,8 @@ store_dest(
 
    case TGSI_SAT_ZERO_ONE:
       /* XXX need to obey ExecMask here */
-      micro_max(dst, chan, &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
-      micro_min(dst, dst, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
+      dst->q = micro_max(chan->q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+      dst->q = micro_min(dst->q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q);
       break;
 
    case TGSI_SAT_MINUS_PLUS_ONE:
@@ -1162,7 +700,7 @@ exec_tex(struct spu_exec_machine *mach,
       switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
       case TGSI_EXTSWIZZLE_W:
          FETCH(&r[1], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[1] );
+         r[0].q = micro_div(r[0].q, r[1].q);
          break;
 
       case TGSI_EXTSWIZZLE_ONE:
@@ -1194,9 +732,9 @@ exec_tex(struct spu_exec_machine *mach,
       switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
       case TGSI_EXTSWIZZLE_W:
          FETCH(&r[3], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[3] );
-         micro_div( &r[1], &r[1], &r[3] );
-         micro_div( &r[2], &r[2], &r[3] );
+         r[0].q = micro_div(r[0].q, r[3].q);
+         r[1].q = micro_div(r[1].q, r[3].q);
+         r[2].q = micro_div(r[2].q, r[3].q);
          break;
 
       case TGSI_EXTSWIZZLE_ONE:
@@ -1228,9 +766,9 @@ exec_tex(struct spu_exec_machine *mach,
       switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
       case TGSI_EXTSWIZZLE_W:
          FETCH(&r[3], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[3] );
-         micro_div( &r[1], &r[1], &r[3] );
-         micro_div( &r[2], &r[2], &r[3] );
+         r[0].q = micro_div(r[0].q, r[3].q);
+         r[1].q = micro_div(r[1].q, r[3].q);
+         r[2].q = micro_div(r[2].q, r[3].q);
          break;
 
       case TGSI_EXTSWIZZLE_ONE:
@@ -1389,7 +927,7 @@ exec_instruction(
    case TGSI_OPCODE_ARL:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 FETCH( &r[0], 0, chan_index );
-	 micro_f2it( &r[0], &r[0] );
+         r[0].q = si_cflts(r[0].q, 0);
 	 STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1409,22 +947,27 @@ exec_instruction(
 
       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
 	 FETCH( &r[0], 0, CHAN_X );
-	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-	    micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+            r[0].q = micro_max(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
 	    STORE( &r[0], 0, CHAN_Y );
 	 }
 
-	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-	    FETCH( &r[1], 0, CHAN_Y );
-	    micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
-
-	    FETCH( &r[2], 0, CHAN_W );
-	    micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
-	    micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
-	    micro_pow( &r[1], &r[1], &r[2] );
-	    micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
-	    STORE( &r[0], 0, CHAN_Z );
-	 }
+         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+            FETCH( &r[1], 0, CHAN_Y );
+            r[1].q = micro_max(r[1].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+
+            FETCH( &r[2], 0, CHAN_W );
+            r[2].q = micro_min(r[2].q, mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q);
+            r[2].q = micro_max(r[2].q, mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q);
+            r[1].q = micro_pow(r[1].q, r[2].q);
+
+            /* r0 = (r0 > 0.0) ? r1 : 0.0
+             */
+            r[0].q = si_fcgt(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+            r[0].q = si_selb(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q, r[1].q,
+                             r[0].q);
+            STORE( &r[0], 0, CHAN_Z );
+         }
       }
 
       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
@@ -1435,7 +978,7 @@ exec_instruction(
    case TGSI_OPCODE_RCP:
    /* TGSI_OPCODE_RECIP */
       FETCH( &r[0], 0, CHAN_X );
-      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
+      r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
       }
@@ -1444,8 +987,8 @@ exec_instruction(
    case TGSI_OPCODE_RSQ:
    /* TGSI_OPCODE_RECIPSQRT */
       FETCH( &r[0], 0, CHAN_X );
-      micro_sqrt( &r[0], &r[0] );
-      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
+      r[0].q = micro_sqrt(r[0].q);
+      r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
       }
@@ -1465,7 +1008,7 @@ exec_instruction(
          FETCH(&r[0], 0, chan_index);
          FETCH(&r[1], 1, chan_index);
 
-         micro_mul( &r[0], &r[0], &r[1] );
+         r[0].q = si_fm(r[0].q, r[1].q);
 
          STORE(&r[0], 0, chan_index);
       }
@@ -1475,7 +1018,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_add( &r[0], &r[0], &r[1] );
+         r[0].q = si_fa(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1484,17 +1027,16 @@ exec_instruction(
    /* TGSI_OPCODE_DOT3 */
       FETCH( &r[0], 0, CHAN_X );
       FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
+      r[0].q = si_fm(r[0].q, r[1].q);
 
       FETCH( &r[1], 0, CHAN_Y );
       FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
 
       FETCH( &r[1], 0, CHAN_Z );
       FETCH( &r[2], 1, CHAN_Z );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( &r[0], 0, chan_index );
@@ -1506,25 +1048,22 @@ exec_instruction(
        FETCH(&r[0], 0, CHAN_X);
        FETCH(&r[1], 1, CHAN_X);
 
-       micro_mul( &r[0], &r[0], &r[1] );
+      r[0].q = si_fm(r[0].q, r[1].q);
 
        FETCH(&r[1], 0, CHAN_Y);
        FETCH(&r[2], 1, CHAN_Y);
 
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
        FETCH(&r[1], 0, CHAN_Z);
        FETCH(&r[2], 1, CHAN_Z);
 
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
        FETCH(&r[1], 0, CHAN_W);
        FETCH(&r[2], 1, CHAN_W);
 
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
@@ -1539,7 +1078,7 @@ exec_instruction(
       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
 	 FETCH( &r[0], 0, CHAN_Y );
 	 FETCH( &r[1], 1, CHAN_Y);
-	 micro_mul( &r[0], &r[0], &r[1] );
+      r[0].q = si_fm(r[0].q, r[1].q);
 	 STORE( &r[0], 0, CHAN_Y );
       }
 
@@ -1559,8 +1098,7 @@ exec_instruction(
          FETCH(&r[0], 0, chan_index);
          FETCH(&r[1], 1, chan_index);
 
-         /* XXX use micro_min()?? */
-         micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
+         r[0].q = micro_min(r[0].q, r[1].q);
 
          STORE(&r[0], 0, chan_index);
       }
@@ -1571,8 +1109,7 @@ exec_instruction(
          FETCH(&r[0], 0, chan_index);
          FETCH(&r[1], 1, chan_index);
 
-         /* XXX use micro_max()?? */
-         micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
+         r[0].q = micro_max(r[0].q, r[1].q);
 
          STORE(&r[0], 0, chan_index );
       }
@@ -1583,7 +1120,10 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+
+         r[0].q = micro_ge(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, -1);   /* -1 flips all 32 bits; 0xff only the low 8 */
+
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1593,7 +1133,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         r[0].q = micro_ge(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1603,9 +1143,8 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_mul( &r[0], &r[0], &r[1] );
-         FETCH( &r[1], 2, chan_index );
-         micro_add( &r[0], &r[0], &r[1] );
+         FETCH( &r[2], 2, chan_index );
+         r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1615,7 +1154,7 @@ exec_instruction(
          FETCH(&r[0], 0, chan_index);
          FETCH(&r[1], 1, chan_index);
 
-         micro_sub( &r[0], &r[0], &r[1] );
+         r[0].q = si_fs(r[0].q, r[1].q);
 
          STORE(&r[0], 0, chan_index);
       }
@@ -1628,9 +1167,8 @@ exec_instruction(
          FETCH(&r[1], 1, chan_index);
          FETCH(&r[2], 2, chan_index);
 
-         micro_sub( &r[1], &r[1], &r[2] );
-         micro_mul( &r[0], &r[0], &r[1] );
-         micro_add( &r[0], &r[0], &r[2] );
+         r[1].q = si_fs(r[1].q, r[2].q);
+         r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
 
          STORE(&r[0], 0, chan_index);
       }
@@ -1661,7 +1199,7 @@ exec_instruction(
    /* TGSI_OPCODE_FRC */
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_frc( &r[0], &r[0] );
+         r[0].q = micro_frc(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1674,7 +1212,7 @@ exec_instruction(
    /* TGSI_OPCODE_FLR */
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_flr( &r[0], &r[0] );
+         r[0].q = micro_flr(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1682,7 +1220,7 @@ exec_instruction(
    case TGSI_OPCODE_ROUND:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_rnd( &r[0], &r[0] );
+         r[0].q = micro_rnd(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1691,7 +1229,7 @@ exec_instruction(
     /* TGSI_OPCODE_EX2 */
       FETCH(&r[0], 0, CHAN_X);
 
-      micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
+      r[0].q = micro_pow(mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q, r[0].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
@@ -1701,7 +1239,7 @@ exec_instruction(
    case TGSI_OPCODE_LOGBASE2:
    /* TGSI_OPCODE_LG2 */
       FETCH( &r[0], 0, CHAN_X );
-      micro_lg2( &r[0], &r[0] );
+      r[0].q = micro_lg2(r[0].q);
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( &r[0], 0, chan_index );
       }
@@ -1712,7 +1250,7 @@ exec_instruction(
       FETCH(&r[0], 0, CHAN_X);
       FETCH(&r[1], 1, CHAN_X);
 
-      micro_pow( &r[0], &r[0], &r[1] );
+      r[0].q = micro_pow(r[0].q, r[1].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
@@ -1723,35 +1261,34 @@ exec_instruction(
       /* TGSI_OPCODE_XPD */
       FETCH(&r[0], 0, CHAN_Y);
       FETCH(&r[1], 1, CHAN_Z);
-
-      micro_mul( &r[2], &r[0], &r[1] );
-
       FETCH(&r[3], 0, CHAN_Z);
       FETCH(&r[4], 1, CHAN_Y);
 
-      micro_mul( &r[5], &r[3], &r[4] );
-      micro_sub( &r[2], &r[2], &r[5] );
+      /* r2 = (r0 * r1) - (r3 * r4)
+       * (r4 holds src1.y; r5 is not fetched until the Y component below) */
+      r[2].q = si_fm(r[3].q, r[4].q);
+      r[2].q = si_fms(r[0].q, r[1].q, r[2].q);
 
       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
          STORE( &r[2], 0, CHAN_X );
       }
 
       FETCH(&r[2], 1, CHAN_X);
-
-      micro_mul( &r[3], &r[3], &r[2] );
-
       FETCH(&r[5], 0, CHAN_X);
 
-      micro_mul( &r[1], &r[1], &r[5] );
-      micro_sub( &r[3], &r[3], &r[1] );
+      /* r3 = (r3 * r2) - (r1 * r5)
+       */
+      r[1].q = si_fm(r[1].q, r[5].q);
+      r[3].q = si_fms(r[3].q, r[2].q, r[1].q);
 
       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
          STORE( &r[3], 0, CHAN_Y );
       }
 
-      micro_mul( &r[5], &r[5], &r[4] );
-      micro_mul( &r[0], &r[0], &r[2] );
-      micro_sub( &r[5], &r[5], &r[0] );
+      /* r5 = (r5 * r4) - (r0 * r2)
+       */
+      r[0].q = si_fm(r[0].q, r[2].q);
+      r[5].q = si_fms(r[5].q, r[4].q, r[0].q);
 
       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
          STORE( &r[5], 0, CHAN_Z );
@@ -1770,7 +1307,7 @@ exec_instruction(
        FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
           FETCH(&r[0], 0, chan_index);
 
-          micro_abs( &r[0], &r[0] );
+          r[0].q = micro_abs(r[0].q);
 
           STORE(&r[0], 0, chan_index);
        }
@@ -1784,23 +1321,21 @@ exec_instruction(
       FETCH(&r[0], 0, CHAN_X);
       FETCH(&r[1], 1, CHAN_X);
 
-      micro_mul( &r[0], &r[0], &r[1] );
+      r[0].q = si_fm(r[0].q, r[1].q);
 
       FETCH(&r[1], 0, CHAN_Y);
       FETCH(&r[2], 1, CHAN_Y);
 
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
       FETCH(&r[1], 0, CHAN_Z);
       FETCH(&r[2], 1, CHAN_Z);
 
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
       FETCH(&r[1], 1, CHAN_W);
 
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fa(r[0].q, r[1].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
@@ -1810,7 +1345,7 @@ exec_instruction(
    case TGSI_OPCODE_COS:
       FETCH(&r[0], 0, CHAN_X);
 
-      micro_cos( &r[0], &r[0] );
+      r[0].q = micro_cos(r[0].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
@@ -1820,7 +1355,7 @@ exec_instruction(
    case TGSI_OPCODE_DDX:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_ddx( &r[0], &r[0] );
+         r[0].q = micro_ddx(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1828,7 +1363,7 @@ exec_instruction(
    case TGSI_OPCODE_DDY:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_ddy( &r[0], &r[0] );
+         r[0].q = micro_ddy(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1866,9 +1401,9 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_eq( &r[0], &r[0], &r[1],
-                   &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
-                   &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+
+         r[0].q = si_fceq(r[0].q, r[1].q);
+
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1881,14 +1416,14 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         r[0].q = si_fcgt(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
 
    case TGSI_OPCODE_SIN:
       FETCH( &r[0], 0, CHAN_X );
-      micro_sin( &r[0], &r[0] );
+      r[0].q = micro_sin(r[0].q);
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( &r[0], 0, chan_index );
       }
@@ -1898,7 +1433,10 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+
+         r[0].q = si_fcgt(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, -1);   /* -1 flips all 32 bits; 0xff only the low 8 */
+
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1907,7 +1445,10 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+
+         r[0].q = si_fceq(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, -1);   /* -1 flips all 32 bits; 0xff only the low 8 */
+
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2038,7 +1579,11 @@ exec_instruction(
          FETCH(&r[1], 1, chan_index);
          FETCH(&r[2], 2, chan_index);
 
-         micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
+         /* r0 = (r0 < 0.0) ? r1 : r2
+          * selb picks its 2nd operand where the mask is set (assumes micro_lt yields a mask) */
+         r[3].q = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q;
+         r[0].q = micro_lt(r[0].q, r[3].q);
+         r[0].q = si_selb(r[2].q, r[1].q, r[0].q);
 
          STORE(&r[0], 0, chan_index);
       }
@@ -2049,11 +1594,11 @@ exec_instruction(
          FETCH( &r[0], 0, CHAN_X );
       }
       if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
-         micro_cos( &r[1], &r[0] );
+         r[1].q = micro_cos(r[0].q);
          STORE( &r[1], 0, CHAN_X );
       }
       if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-         micro_sin( &r[1], &r[0] );
+         r[1].q = micro_sin(r[0].q);
          STORE( &r[1], 0, CHAN_Y );
       }
       if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
@@ -2075,12 +1620,11 @@ exec_instruction(
    case TGSI_OPCODE_DP2:
       FETCH( &r[0], 0, CHAN_X );
       FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
+      r[0].q = si_fm(r[0].q, r[1].q);
 
       FETCH( &r[1], 0, CHAN_Y );
       FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( &r[0], 0, chan_index );
@@ -2152,7 +1696,7 @@ exec_instruction(
    case TGSI_OPCODE_CEIL:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_ceil( &r[0], &r[0] );
+         r[0].q = micro_ceil(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2160,7 +1704,7 @@ exec_instruction(
    case TGSI_OPCODE_I2F:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_i2f( &r[0], &r[0] );
+         r[0].q = si_csflt(r[0].q, 0);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2168,7 +1712,7 @@ exec_instruction(
    case TGSI_OPCODE_NOT:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_not( &r[0], &r[0] );
+         r[0].q = si_xorbi(r[0].q, 0xff);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2176,7 +1720,7 @@ exec_instruction(
    case TGSI_OPCODE_TRUNC:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_trunc( &r[0], &r[0] );
+         r[0].q = micro_trunc(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2185,7 +1729,9 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_shl( &r[0], &r[0], &r[1] );
+
+         r[0].q = si_shl(r[0].q, r[1].q);
+
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2194,7 +1740,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_ishr( &r[0], &r[0], &r[1] );
+         r[0].q = micro_ishr(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2203,7 +1749,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_and( &r[0], &r[0], &r[1] );
+         r[0].q = si_and(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2212,7 +1758,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_or( &r[0], &r[0], &r[1] );
+         r[0].q = si_or(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2225,7 +1771,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_xor( &r[0], &r[0], &r[1] );
+         r[0].q = si_xor(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
diff --git a/src/mesa/pipe/cell/spu/spu_exec.h b/src/mesa/pipe/cell/spu/spu_exec.h
index 89e422ba48..b4c7661ef6 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.h
+++ b/src/mesa/pipe/cell/spu/spu_exec.h
@@ -43,6 +43,7 @@ union spu_exec_channel
    float    f[QUAD_SIZE];
    int      i[QUAD_SIZE];
    unsigned u[QUAD_SIZE];
+   qword    q;
 };
 
 /**
-- 
cgit v1.2.3


From 40147bd83556a41916892892193cc72d7977927a Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 1 Feb 2008 17:12:20 -0800
Subject: Vectorize vertex puller

---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c  | 186 +++++++++--------------------
 src/mesa/pipe/cell/spu/spu_vertex_shader.h |   4 +-
 2 files changed, 61 insertions(+), 129 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 5b0f2a6470..4133fbba17 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -42,6 +42,8 @@
 #define DRAW_DBG 0
 
 
+static const vec_float4 defaults = { 0.0, 0.0, 0.0, 1.0 };
+
 /**
  * Fetch a float[4] vertex attribute from memory, doing format/type
  * conversion as needed.
@@ -50,19 +52,16 @@
  * conversion, texture sampling etc.
  */
 #define FETCH_ATTRIB( NAME, SZ, CVT )			\
-static void						\
-fetch_##NAME(const void *ptr, float *attrib)		\
+static qword						\
+fetch_##NAME(const void *ptr)				\
 {							\
-   static const float defaults[4] = { 0,0,0,1 };	\
+   vec_float4 attrib = defaults;			\
    int i;						\
 							\
    for (i = 0; i < SZ; i++) {				\
-      attrib[i] = CVT;					\
-   }							\
-							\
-   for (; i < 4; i++) {					\
-      attrib[i] = defaults[i];				\
+      attrib = spu_insert(CVT, attrib, i);		\
    }							\
+   return (qword) attrib;				\
 }
 
 #define CVT_64_FLOAT   (float) ((double *) ptr)[i]
@@ -309,106 +308,59 @@ static spu_fetch_func get_fetch_func( enum pipe_format format )
 }
 
 
-static void 
-transpose_4x4( float *out, const float *in )
-{
-   /* This can be achieved in 12 sse instructions, plus the final
-    * stores I guess.  This is probably a bit more than that - maybe
-    * 32 or so?
-    */
-   out[0] = in[0];  out[1] = in[4];  out[2] = in[8];   out[3] = in[12];
-   out[4] = in[1];  out[5] = in[5];  out[6] = in[9];   out[7] = in[13];
-   out[8] = in[2];  out[9] = in[6];  out[10] = in[10]; out[11] = in[14];
-   out[12] = in[3]; out[13] = in[7]; out[14] = in[11]; out[15] = in[15];
-}
-
-
-
-static void fetch_xyz_rgb( struct spu_vs_context *draw,
-			   struct spu_exec_machine *machine,
-			   const unsigned *elts,
-			   unsigned count )
+void
+spu_transpose_4x4(qword *out, const qword *in)
 {
-   assert(count <= 4);
-
-//   _mesa_printf("%s\n", __FUNCTION__);
-
-   /* loop over vertex attributes (vertex shader inputs)
-    */
-
-   const unsigned *pitch   = draw->vertex_fetch.pitch;
-   const ubyte **src       = draw->vertex_fetch.src_ptr;
-   int i;
-
-   for (i = 0; i < 4; i++) {
+   static const qword masks[8] = {
       {
-	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
-	 float *out = &machine->Inputs[0].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
-
+         0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+      },
       {
-	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
-	 float *out = &machine->Inputs[1].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
-   }
-}
-
-
-
-
-static void fetch_xyz_rgb_st( struct spu_vs_context *draw,
-			      struct spu_exec_machine *machine,
-			      const unsigned *elts,
-			      unsigned count )
-{
-   assert(count <= 4);
-
-   /* loop over vertex attributes (vertex shader inputs)
-    */
-
-   const unsigned *pitch   = draw->vertex_fetch.pitch;
-   const ubyte **src       = draw->vertex_fetch.src_ptr;
-   int i;
-
-   for (i = 0; i < 4; i++) {
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+         0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+      },
+
+      { 
+         0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+      },
+      { 
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+         0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+      },
+
+      { 
+         0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+      },
+      { 
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+         0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
+      },
+
+      { 
+         0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+      },
       {
-	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
-	 float *out = &machine->Inputs[0].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+         0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
+      },
+   };
 
-      {
-	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
-	 float *out = &machine->Inputs[1].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
+   out[0] = si_shufb(in[0], in[1], masks[0]);
+   out[0] = si_or(out[0], si_shufb(in[2], in[3], masks[1]));
 
-      {
-	 const float *in = (const float *)(src[2] + elts[i] * pitch[2]);
-	 float *out = &machine->Inputs[1].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = 0.0f;
- 	 out[12] = 1.0f;
-      }
-   }
-}
+   out[1] = si_shufb(in[0], in[1], masks[2]);
+   out[1] = si_or(out[1], si_shufb(in[2], in[3], masks[3]));
 
+   out[2] = si_shufb(in[0], in[1], masks[4]);
+   out[2] = si_or(out[2], si_shufb(in[2], in[3], masks[5]));
 
+   out[3] = si_shufb(in[0], in[1], masks[6]);
+   out[3] = si_or(out[3], si_shufb(in[2], in[3], masks[7]));
+}
 
 
 /**
@@ -435,7 +387,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
       const uint64_t src = draw->vertex_fetch.src_ptr[attr];
       const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
       unsigned i;
-      float p[4][4];
+      qword p[4];
 
 
       /* Fetch four attributes for four vertices.  
@@ -452,17 +404,15 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
          mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
          wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
-         memmove(& buffer, buffer + (addr & 0x0f), 16);
-
-         fetch(buffer, p[i]);
+         p[i] = (*fetch)(buffer + (addr & 0x0f));
       }
 
       /* Be nice and zero out any missing vertices: 
        */
       for (/* empty */; i < 4; i++) 
-          p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
-      
-      /* Transpose/swizzle into sse-friendly format.  Currently
+          p[i] = si_xor(p[i], p[i]);
+
+      /* Transpose/swizzle into vector-friendly format.  Currently
        * assuming that all vertex shader inputs are float[4], but this
        * isn't true -- if the vertex shader only wants tex0.xy, we
        * could optimize for that.
@@ -471,7 +421,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
        * excessive number of fetch functions, but we could at least
        * minimize the transpose step:
        */
-      transpose_4x4( (float *)&machine->Inputs[attr].xyzw[0].f[0], (float *)p );
+      spu_transpose_4x4(&machine->Inputs[attr].xyzw[0].q, p);
    }
 }
 
@@ -487,24 +437,4 @@ void spu_update_vertex_fetch( struct spu_vs_context *draw )
    }
 
    draw->vertex_fetch.fetch_func = generic_vertex_fetch;
-
-   /* Disable the fast path because they don't use mfc_get yet.
-    */
-#if 0
-   switch (draw->vertex_fetch.nr_attrs) {
-   case 2:
-      if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
-          draw->vertex_fetch.format[1] == PIPE_FORMAT_R32G32B32_FLOAT)
-          draw->vertex_fetch.fetch_func = fetch_xyz_rgb;
-      break;
-   case 3:
-      if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
-          draw->vertex_fetch.format[1] == PIPE_FORMAT_R32G32B32_FLOAT &&
-          draw->vertex_fetch.format[2] == PIPE_FORMAT_R32G32_FLOAT)
-          draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
-      break;
-   default:
-      break;
-   }
-#endif
 }
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
index b261ab44a2..2435b7ddae 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.h
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -6,7 +6,7 @@
 
 struct spu_vs_context;
 
-typedef void (*spu_fetch_func)(const void *ptr, float *attrib);
+typedef qword (*spu_fetch_func)(const void *ptr);
 typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
 				     struct spu_exec_machine *machine,
 				     const unsigned *elts,
@@ -39,6 +39,8 @@ struct spu_vs_context {
 
 extern void spu_update_vertex_fetch(struct spu_vs_context *draw);
 
+extern void spu_transpose_4x4(qword *out, const qword *in);
+
 static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
 				    struct spu_exec_machine *machine,
 				    const unsigned *elts,
-- 
cgit v1.2.3


From f33c8119abbe5980b793961298978dddc5b0563f Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 1 Feb 2008 17:14:09 -0800
Subject: More semi-trivial vectorization in the shader VM

---
 src/mesa/pipe/cell/spu/spu_exec.c | 62 +++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 35 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 1ac9c031e3..1bd8687d41 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -70,6 +70,7 @@
 #include "pipe/tgsi/util/tgsi_util.h"
 #include "spu_exec.h"
 #include "spu_main.h"
+#include "spu_vertex_shader.h"
 
 #define TILE_TOP_LEFT     0
 #define TILE_TOP_RIGHT    1
@@ -144,23 +145,27 @@ spu_exec_machine_init(struct spu_exec_machine *mach,
                       struct spu_sampler *samplers,
                       unsigned processor)
 {
+   qword zero;
+   qword not_zero;
    uint i;
 
    mach->Samplers = samplers;
    mach->Processor = processor;
    mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS];
 
+   zero = si_il(0);                /* load 0 without reading an uninitialised qword */
+   not_zero = si_xori(zero, -1);   /* immediate is sign-extended: 0xFFFFFFFF per word */
+
    /* Setup constants. */
-   for( i = 0; i < 4; i++ ) {
-      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
-      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
-      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
-      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
-      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
-      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
-      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
-      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
-   }
+   mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero;
+   mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero;
+   mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].q = si_rotmi(not_zero, -1);  /* logical >> 1 */
+   mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].q = si_shli(not_zero, 31);
+
+   mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q = (qword) spu_splats(1.0f);
+   mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q = (qword) spu_splats(2.0f);
+   mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q = (qword) spu_splats(128.0f);
+   mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q = (qword) spu_splats(-128.0f);
 }
 
 
@@ -459,25 +464,16 @@ fetch_source(
          &index2,
          &indir_index );
 
-      index.i[0] += indir_index.i[0];
-      index.i[1] += indir_index.i[1];
-      index.i[2] += indir_index.i[2];
-      index.i[3] += indir_index.i[3];
+      index.q = si_a(index.q, indir_index.q);
    }
 
    if( reg->SrcRegister.Dimension ) {
       switch( reg->SrcRegister.File ) {
       case TGSI_FILE_INPUT:
-         index.i[0] *= 17;
-         index.i[1] *= 17;
-         index.i[2] *= 17;
-         index.i[3] *= 17;
+         index.q = si_mpyi(index.q, 17);
          break;
       case TGSI_FILE_CONSTANT:
-         index.i[0] *= 4096;
-         index.i[1] *= 4096;
-         index.i[2] *= 4096;
-         index.i[3] *= 4096;
+         index.q = si_shli(index.q, 12);
          break;
       default:
          assert( 0 );
@@ -505,10 +501,7 @@ fetch_source(
             &index2,
             &indir_index );
 
-         index.i[0] += indir_index.i[0];
-         index.i[1] += indir_index.i[1];
-         index.i[2] += indir_index.i[2];
-         index.i[3] += indir_index.i[3];
+         index.q = si_a(index.q, indir_index.q);
       }
    }
 
@@ -666,17 +659,16 @@ fetch_texel( struct spu_sampler *sampler,
              union spu_exec_channel *b,
              union spu_exec_channel *a )
 {
-   uint j;
-   float rgba[NUM_CHANNELS][QUAD_SIZE];
+   qword rgba[4];
+   qword out[4];
 
-   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
+   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, (float *) rgba);
 
-   for (j = 0; j < 4; j++) {
-      r->f[j] = rgba[0][j];
-      g->f[j] = rgba[1][j];
-      b->f[j] = rgba[2][j];
-      a->f[j] = rgba[3][j];
-   }
+   spu_transpose_4x4(out, rgba);
+   r->q = out[0];
+   g->q = out[1];
+   b->q = out[2];
+   a->q = out[3];
 }
 
 
-- 
cgit v1.2.3


From 76702d5fcd1d8341a099adfabef94a0f847ca06f Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Mon, 4 Feb 2008 16:03:55 -0800
Subject: Add some debug messages

---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 4133fbba17..cfa449e813 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -378,7 +378,10 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
 
    wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
-//   _mesa_printf("%s %d\n", __FUNCTION__, count);
+#if DRAW_DBG
+   printf("SPU: %s count = %u, nr_attrs = %u\n", 
+          __FUNCTION__, count, nr_attrs);
+#endif
 
    /* loop over vertex attributes (vertex shader inputs)
     */
@@ -401,6 +404,9 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
          const uint64_t addr = src + (elts[i] * pitch);
          const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
 
+#if DRAW_DBG
+         printf("SPU: fetching = 0x%llx\n", addr);
+#endif
          mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
          wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
-- 
cgit v1.2.3


From ac07631d85d0f1d30b1feba23f0f2f2c6549466d Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 5 Feb 2008 09:43:52 -0800
Subject: Use _transpose_matrix4x4 from Cell SDK instead of my own version

---
 src/mesa/pipe/cell/spu/spu_exec.c          |  3 +-
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c  | 59 ++----------------------------
 src/mesa/pipe/cell/spu/spu_vertex_shader.h |  2 -
 3 files changed, 5 insertions(+), 59 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 1bd8687d41..e51008b9b3 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -52,6 +52,7 @@
 
 #include <libmisc.h>
 #include <spu_mfcio.h>
+#include <transpose_matrix4x4.h>
 #include <simdmath/ceilf4.h>
 #include <simdmath/cosf4.h>
 #include <simdmath/divf4.h>
@@ -664,7 +665,7 @@ fetch_texel( struct spu_sampler *sampler,
 
    sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, (float *) rgba);
 
-   spu_transpose_4x4(out, rgba);
+   _transpose_matrix4x4(out, rgba);
    r->q = out[0];
    g->q = out[1];
    b->q = out[2];
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index cfa449e813..6e86a919ce 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -31,6 +31,8 @@
   */
 
 #include <spu_mfcio.h>
+#include <transpose_matrix4x4.h>
+
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
@@ -308,61 +310,6 @@ static spu_fetch_func get_fetch_func( enum pipe_format format )
 }
 
 
-void
-spu_transpose_4x4(qword *out, const qword *in)
-{
-   static const qword masks[8] = {
-      {
-         0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-      },
-      {
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-         0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-      },
-
-      { 
-         0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-      },
-      { 
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-         0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
-      },
-
-      { 
-         0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-      },
-      { 
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-         0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
-      },
-
-      { 
-         0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-      },
-      {
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-         0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
-      },
-   };
-
-   out[0] = si_shufb(in[0], in[1], masks[0]);
-   out[0] = si_or(out[0], si_shufb(in[2], in[3], masks[1]));
-
-   out[1] = si_shufb(in[0], in[1], masks[2]);
-   out[1] = si_or(out[1], si_shufb(in[2], in[3], masks[3]));
-
-   out[2] = si_shufb(in[0], in[1], masks[4]);
-   out[2] = si_or(out[2], si_shufb(in[2], in[3], masks[5]));
-
-   out[3] = si_shufb(in[0], in[1], masks[6]);
-   out[3] = si_or(out[3], si_shufb(in[2], in[3], masks[7]));
-}
-
-
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
@@ -427,7 +374,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
        * excessive number of fetch functions, but we could at least
        * minimize the transpose step:
        */
-      spu_transpose_4x4(&machine->Inputs[attr].xyzw[0].q, p);
+      _transpose_matrix4x4(&machine->Inputs[attr].xyzw[0].q, p);
    }
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
index 2435b7ddae..c96b93ff0a 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.h
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -39,8 +39,6 @@ struct spu_vs_context {
 
 extern void spu_update_vertex_fetch(struct spu_vs_context *draw);
 
-extern void spu_transpose_4x4(qword *out, const qword *in);
-
 static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
 				    struct spu_exec_machine *machine,
 				    const unsigned *elts,
-- 
cgit v1.2.3


From 48aad039398df0024126ff5892a62aca77b65547 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 14:21:01 -0700
Subject: Cell: added cell_batch_alloc_aligned()

---
 src/mesa/pipe/cell/ppu/cell_batch.c | 26 ++++++++++++++++++++------
 src/mesa/pipe/cell/ppu/cell_batch.h |  4 ++++
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/src/mesa/pipe/cell/ppu/cell_batch.c b/src/mesa/pipe/cell/ppu/cell_batch.c
index 2fb49711b2..f45e5f25b6 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.c
+++ b/src/mesa/pipe/cell/ppu/cell_batch.c
@@ -157,7 +157,7 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
       size = 0;
    }
 
-   assert(size + bytes <= CELL_BUFFER_SIZE);
+   ASSERT(size + bytes <= CELL_BUFFER_SIZE);
 
    memcpy(cell->buffer[cell->cur_batch] + size, data, bytes);
 
@@ -167,14 +167,22 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
 
 void *
 cell_batch_alloc(struct cell_context *cell, uint bytes)
+{
+   return cell_batch_alloc_aligned(cell, bytes, 1);
+}
+
+
+void *
+cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
+                         uint alignment)
 {
    void *pos;
-   uint size;
+   uint size, padbytes;
 
    ASSERT(bytes % 8 == 0);
    ASSERT(bytes <= CELL_BUFFER_SIZE);
-
-   assert(cell->cur_batch >= 0);
+   ASSERT(alignment > 0);
+   ASSERT(cell->cur_batch >= 0);
 
 #ifdef ASSERT
    {
@@ -188,12 +196,18 @@ cell_batch_alloc(struct cell_context *cell, uint bytes)
 
    size = cell->buffer_size[cell->cur_batch];
 
-   if (size + bytes > CELL_BUFFER_SIZE) {
+   padbytes = (alignment - (size % alignment)) % alignment;
+
+   if (padbytes + size + bytes > CELL_BUFFER_SIZE) {
       cell_batch_flush(cell);
       size = 0;
    }
+   else {
+      size += padbytes;
+   }
 
-   assert(size + bytes <= CELL_BUFFER_SIZE);
+   ASSERT(size % alignment == 0);
+   ASSERT(size + bytes <= CELL_BUFFER_SIZE);
 
    pos = (void *) (cell->buffer[cell->cur_batch] + size);
 
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.h b/src/mesa/pipe/cell/ppu/cell_batch.h
index f4f37314a4..a6eee0a8b1 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.h
+++ b/src/mesa/pipe/cell/ppu/cell_batch.h
@@ -50,5 +50,9 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes);
 extern void *
 cell_batch_alloc(struct cell_context *cell, uint bytes);
 
+extern void *
+cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
+                         uint alignment);
+
 
 #endif /* CELL_BATCH_H */
-- 
cgit v1.2.3


From 2ec5ae5e504d8bf82cdecae8569dc12b8d62c055 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 14:21:48 -0700
Subject: Cell: remove dummy fields, update/add some comments

---
 src/mesa/pipe/cell/common.h | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index cf8fc94ebf..4de514c358 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -51,7 +51,7 @@
 
 /** for sanity checking */
 #define ASSERT_ALIGN16(ptr) \
-   assert((((unsigned long) (ptr)) & 0xf) == 0);
+  ASSERT((((unsigned long) (ptr)) & 0xf) == 0);
 
 
 /** round up value to next multiple of 4 */
@@ -105,7 +105,7 @@
  */
 struct cell_command_framebuffer
 {
-   uint64_t opcode;
+   uint64_t opcode;     /**< CELL_CMD_FRAMEBUFFER */
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
@@ -117,7 +117,7 @@ struct cell_command_framebuffer
  */
 struct cell_command_clear_surface
 {
-   uint64_t opcode;
+   uint64_t opcode;     /**< CELL_CMD_CLEAR_SURFACE */
    uint surface; /**< Temporary: 0=color, 1=Z */
    uint value;
 };
@@ -128,8 +128,8 @@ struct cell_command_clear_surface
  */
 struct cell_array_info
 {
-    uint64_t base;          /**< Base address of the 0th element. */
-    uint attr;          /**< Attribute that this state if for. */
+    uint64_t base;      /**< Base address of the 0th element. */
+    uint attr;          /**< Attribute that this state is for. */
     uint pitch;         /**< Byte pitch from one entry to the next. */
     uint format;        /**< Pipe format of each entry. */
 } ALIGN16_ATTRIB;
@@ -169,11 +169,9 @@ struct cell_command_render
    uint prim_type;    /**< PIPE_PRIM_x */
    uint num_verts;
    uint vertex_size;  /**< bytes per vertex */
-   uint dummy;        /* XXX this dummy field works around a compiler bug */
    uint num_indexes;
    uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
-   float xmin, dummy2, ymin, xmax, ymax;  /* XXX another dummy field */
-   uint dummy3;
+   float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
    uint min_index;
    boolean inline_verts;
 };
-- 
cgit v1.2.3


From df2ab198eb49333f01c8f10bea2033bea732d755 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 14:23:34 -0700
Subject: Cell: re-enable inlined vertex buffers

Vertex data must be on a 16-byte address/offset so SIMD operations will work
properly in the SPU code.
---
 src/mesa/pipe/cell/ppu/cell_vbuf.c  | 12 +++++-------
 src/mesa/pipe/cell/spu/spu_main.c   |  3 ++-
 src/mesa/pipe/cell/spu/spu_render.c | 12 ++++++++----
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index 0fee61821a..e9fafe492e 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -40,7 +40,7 @@
 
 
 /** Allow vertex data to be inlined after RENDER command */
-#define ALLOW_INLINE_VERTS 0
+#define ALLOW_INLINE_VERTS 1
 
 
 /**
@@ -199,9 +199,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
    {
       const uint index_bytes = ROUNDUP8(nr_indices * 2);
       const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size;
-
-      const uint batch_size = sizeof(struct cell_command_render)
-         + index_bytes;
+      const uint batch_size = sizeof(struct cell_command_render) + index_bytes;
 
       struct cell_command_render *render
          = (struct cell_command_render *)
@@ -223,9 +221,9 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       render->num_verts = nr_vertices;
       if (ALLOW_INLINE_VERTS &&
           min_index == 0 &&
-          vertex_bytes <= cell_batch_free_space(cell)) {
-         /* vertex data inlined, after indices */
-         void *dst = cell_batch_alloc(cell, vertex_bytes);
+          vertex_bytes + 16 <= cell_batch_free_space(cell)) {
+         /* vertex data inlined, after indices, at 16-byte boundary */
+         void *dst = cell_batch_alloc_aligned(cell, vertex_bytes, 16);
          memcpy(dst, vertices, vertex_bytes);
          render->inline_verts = TRUE;
          render->vertex_buf = ~0;
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 4f126d5e5b..e375197fe6 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -387,7 +387,7 @@ cmd_batch(uint opcode)
                = (struct cell_command_render *) &buffer[pos];
             uint pos_incr;
             cmd_render(render, &pos_incr);
-            pos += sizeof(*render) / 8 + ((pos_incr + 1) / 2);
+            pos += pos_incr;
          }
          break;
       case CELL_CMD_RELEASE_VERTS:
@@ -541,6 +541,7 @@ main(main_param_t speid, main_param_t argp)
    (void) speid;
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
+   ASSERT(sizeof(struct cell_command_render) % 8 == 0);
 
    one_time_init();
 
diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
index e8705eeeba..932fb500b3 100644
--- a/src/mesa/pipe/cell/spu/spu_render.c
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -171,6 +171,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
    const uint vertex_size = render->vertex_size; /* in bytes */
    /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
+   uint index_bytes;
    const ubyte *vertices;
    const ushort *indexes;
    uint i, j;
@@ -199,13 +200,16 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 
    /* indexes are right after the render command in the batch buffer */
    indexes = (const ushort *) (render + 1);
-   *pos_incr = (render->num_indexes * 2 + 3) / 4;
+   index_bytes = ROUNDUP8(render->num_indexes * 2);
+   *pos_incr = index_bytes / 8 + sizeof(*render) / 8;
 
 
    if (render->inline_verts) {
-      /* Vertices are right after indexes in batch buffer */
-      vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
-      *pos_incr = *pos_incr + total_vertex_bytes / 4;
+      /* Vertices are after indexes in batch buffer at next 16-byte addr */
+      vertices = (const ubyte *) render + (*pos_incr * 8);
+      vertices = (const ubyte *) align_pointer((void *) vertices, 16);
+      ASSERT_ALIGN16(vertices);
+      *pos_incr = ((vertices + total_vertex_bytes) - (ubyte *) render) / 8;
    }
    else {
       /* Begin DMA fetch of vertex buffer */
-- 
cgit v1.2.3


From 14f1f2523b0186fe65f13b1981a782768d4f8376 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 15:07:36 -0700
Subject: Cell: SIMD-ize tri_linear_coeff(), use vector float for vertex
 attributes in struct vertex_header

---
 src/mesa/pipe/cell/spu/spu_tri.c | 112 ++++++++++++++++++++++++++-------------
 1 file changed, 75 insertions(+), 37 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 688c8646ab..be9624cf7d 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -56,7 +56,7 @@ typedef union
  * Simplified types taken from other parts of Gallium
  */
 struct vertex_header {
-   float data[0][4];
+   vector float data[1];
 };
 
 
@@ -476,6 +476,7 @@ static void print_vertex(const struct vertex_header *v)
 }
 #endif
 
+
 static boolean setup_sort_vertices(const struct vertex_header *v0,
                                    const struct vertex_header *v1,
                                    const struct vertex_header *v2)
@@ -492,9 +493,9 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
 
    /* determine bottom to top order of vertices */
    {
-      float y0 = v0->data[0][1];
-      float y1 = v1->data[0][1];
-      float y2 = v2->data[0][1];
+      float y0 = spu_extract(v0->data[0], 1);
+      float y1 = spu_extract(v1->data[0], 1);
+      float y2 = spu_extract(v2->data[0], 1);
       if (y0 <= y1) {
 	 if (y1 <= y2) {
 	    /* y0<=y1<=y2 */
@@ -538,25 +539,25 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
    }
 
    /* Check if triangle is completely outside the tile bounds */
-   if (setup.vmin->data[0][1] > setup.cliprect_maxy)
+   if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy)
       return FALSE;
-   if (setup.vmax->data[0][1] < setup.cliprect_miny)
+   if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny)
       return FALSE;
-   if (setup.vmin->data[0][0] < setup.cliprect_minx &&
-       setup.vmid->data[0][0] < setup.cliprect_minx &&
-       setup.vmax->data[0][0] < setup.cliprect_minx)
+   if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx &&
+       spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx &&
+       spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx)
       return FALSE;
-   if (setup.vmin->data[0][0] > setup.cliprect_maxx &&
-       setup.vmid->data[0][0] > setup.cliprect_maxx &&
-       setup.vmax->data[0][0] > setup.cliprect_maxx)
+   if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx &&
+       spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx &&
+       spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
       return FALSE;
 
-   setup.ebot.dx = setup.vmid->data[0][0] - setup.vmin->data[0][0];
-   setup.ebot.dy = setup.vmid->data[0][1] - setup.vmin->data[0][1];
-   setup.emaj.dx = setup.vmax->data[0][0] - setup.vmin->data[0][0];
-   setup.emaj.dy = setup.vmax->data[0][1] - setup.vmin->data[0][1];
-   setup.etop.dx = setup.vmax->data[0][0] - setup.vmid->data[0][0];
-   setup.etop.dy = setup.vmax->data[0][1] - setup.vmid->data[0][1];
+   setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
+   setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
+   setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
+   setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
+   setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0);
+   setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1);
 
    /*
     * Compute triangle's area.  Use 1/area to compute partial
@@ -597,14 +598,12 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
  * The result will be put into setup.coef[slot].a0.
  * \param slot  which attribute slot 
  */
-static INLINE void const_coeff(uint slot)
+static INLINE void
+const_coeff(uint slot)
 {
    setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
    setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
-   setup.coef[slot].a0.f[0] = setup.vprovoke->data[slot][0];
-   setup.coef[slot].a0.f[1] = setup.vprovoke->data[slot][1];
-   setup.coef[slot].a0.f[2] = setup.vprovoke->data[slot][2];
-   setup.coef[slot].a0.f[3] = setup.vprovoke->data[slot][3];
+   setup.coef[slot].a0.v = setup.vprovoke->data[slot];
 }
 
 
@@ -612,12 +611,19 @@ static INLINE void const_coeff(uint slot)
  * Compute a0, dadx and dady for a linearly interpolated coefficient,
  * for a triangle.
  */
-static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
+static INLINE void
+tri_linear_coeff(uint slot, uint firstComp, uint lastComp)
 {
    uint i;
+   const float *vmin_d = (float *) &setup.vmin->data[slot];
+   const float *vmid_d = (float *) &setup.vmid->data[slot];
+   const float *vmax_d = (float *) &setup.vmax->data[slot];
+   const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f;
+   const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
+
    for (i = firstComp; i < lastComp; i++) {
-      float botda = setup.vmid->data[slot][i] - setup.vmin->data[slot][i];
-      float majda = setup.vmax->data[slot][i] - setup.vmin->data[slot][i];
+      float botda = vmid_d[i] - vmin_d[i];
+      float majda = vmax_d[i] - vmin_d[i];
       float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
       float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
    
@@ -638,9 +644,9 @@ static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
        * to define a0 as the sample at a pixel center somewhere near vmin
        * instead - i'll switch to this later.
        */
-      setup.coef[slot].a0.f[i] = (setup.vmin->data[slot][i] - 
-                                 (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
-                                  setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
+      setup.coef[slot].a0.f[i] = (vmin_d[i] - 
+                                 (setup.coef[slot].dadx.f[i] * x + 
+                                  setup.coef[slot].dady.f[i] * y));
    }
 
    /*
@@ -653,6 +659,37 @@ static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
 }
 
 
+/**
+ * As above, but interp setup all four vector components.
+ */
+static INLINE void
+tri_linear_coeff4(uint slot)
+{
+   const vector float vmin_d = setup.vmin->data[slot];
+   const vector float vmid_d = setup.vmid->data[slot];
+   const vector float vmax_d = setup.vmax->data[slot];
+   const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
+   const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
+
+   vector float botda = vmid_d - vmin_d;
+   vector float majda = vmax_d - vmin_d;
+
+   vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
+                            spu_mul(botda, spu_splats(setup.emaj.dy)));
+   vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
+                            spu_mul(majda, spu_splats(setup.ebot.dx)));
+
+   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
+   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
+
+   vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+                         
+   setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
+}
+
+
+
 #if 0
 /**
  * Compute a0, dadx and dady for a perspective-corrected interpolant,
@@ -710,17 +747,18 @@ static void setup_tri_coefficients(void)
       case INTERP_NONE:
          break;
       case INTERP_POS:
-         tri_linear_coeff(i, 2, 3);
+         /*tri_linear_coeff(i, 2, 3);*/
          /* XXX interp W if PERSPECTIVE... */
+         tri_linear_coeff4(i);
          break;
       case INTERP_CONSTANT:
          const_coeff(i);
          break;
       case INTERP_LINEAR:
-         tri_linear_coeff(i, 0, 4);
+         tri_linear_coeff4(i);
          break;
       case INTERP_PERSPECTIVE:
-         tri_linear_coeff(i, 0, 4); /* XXX temporary */
+         tri_linear_coeff4(i);  /* temporary */
          break;
       default:
          ASSERT(0);
@@ -738,12 +776,12 @@ static void setup_tri_coefficients(void)
 
 static void setup_tri_edges(void)
 {
-   float vmin_x = setup.vmin->data[0][0] + 0.5f;
-   float vmid_x = setup.vmid->data[0][0] + 0.5f;
+   float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
+   float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
 
-   float vmin_y = setup.vmin->data[0][1] - 0.5f;
-   float vmid_y = setup.vmid->data[0][1] - 0.5f;
-   float vmax_y = setup.vmax->data[0][1] - 0.5f;
+   float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
+   float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f;
+   float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f;
 
    setup.emaj.sy = CEILF(vmin_y);
    setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
-- 
cgit v1.2.3


From d154f6a24b7c7265306d43fcb3b43dc759ad9bd2 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 15:12:18 -0700
Subject: Cell: remove accidentally added OPT_FLAGS lines

---
 src/mesa/pipe/cell/spu/Makefile | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index 66f16cde9b..f202971d73 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -8,8 +8,6 @@ TOP = ../../../../..
 include $(TOP)/configs/linux-cell
 
 
-OPT_FLAGS=-g
-OPT_FLAGS=-O3
 PROG = g3d
 
 PROG_SPU = $(PROG)_spu
-- 
cgit v1.2.3


From 152ea0b42484c4173eb5eb4d8ecd6a79207b2310 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Mon, 4 Feb 2008 17:02:08 +0900
Subject: gallium: Portability guidelines.

---
 src/mesa/pipe/README.portability | 43 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 src/mesa/pipe/README.portability

diff --git a/src/mesa/pipe/README.portability b/src/mesa/pipe/README.portability
new file mode 100644
index 0000000000..c70ca774da
--- /dev/null
+++ b/src/mesa/pipe/README.portability
@@ -0,0 +1,43 @@
+	      CROSS-PLATFORM PORTABILITY GUIDELINES FOR GALLIUM3D 
+
+
+= General Considerations =
+
+The state tracker and winsys driver support a rather limited number of
+platforms. However, the pipe drivers are meant to run on a wide range of
+platforms. Hence the pipe drivers, the auxiliary modules, and all public
+headers in general, should strictly follow these portability guidelines.
+
+
+= Compiler Support =
+
+* Include the p_compiler.h.
+
+* Don't use the 'inline' keyword, use the INLINE macro in p_compiler.h instead.
+
+* Cast explicitly when converting to integer types of smaller sizes.
+
+* Cast explicitly when converting between float, double and integral types.
+
+* Don't use named struct initializers.
+
+* Don't use variable number of macro arguments. Use static inline functions
+instead.
+
+
+= Standard Library =
+
+* Avoid including standard library headers. Most standard library functions are
+not available in Windows Kernel Mode. Use the appropriate p_*.h include.
+
+== Memory Allocation ==
+
+* Use MALLOC, CALLOC, FREE instead of the malloc, calloc, free functions.
+
+* Use align_pointer() function defined in p_util.h for aligning pointers in a
+portable way.
+
+== Debugging ==
+
+TODO
+
-- 
cgit v1.2.3


From 25c29080f772ea175b0582031d483ca79f70f8ac Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Wed, 6 Feb 2008 13:27:49 +0900
Subject: gallium: Cross-platform debugging helpers.

---
 src/mesa/SConscript          |  1 +
 src/mesa/pipe/p_debug.h      | 79 ++++++++++++++++++++++++++++++++++++++++++++
 src/mesa/pipe/util/p_debug.c | 70 +++++++++++++++++++++++++++++++++++++++
 src/mesa/sources             |  1 +
 4 files changed, 151 insertions(+)
 create mode 100644 src/mesa/pipe/p_debug.h
 create mode 100644 src/mesa/pipe/util/p_debug.c

diff --git a/src/mesa/SConscript b/src/mesa/SConscript
index 70a98f3129..faf8c84872 100644
--- a/src/mesa/SConscript
+++ b/src/mesa/SConscript
@@ -158,6 +158,7 @@ STATECACHE_SOURCES = [
 ]
 
 PIPEUTIL_SOURCES = [
+	'pipe/util/p_debug.c',
 	'pipe/util/p_tile.c',
 	'pipe/util/p_util.c',
 ]
diff --git a/src/mesa/pipe/p_debug.h b/src/mesa/pipe/p_debug.h
new file mode 100644
index 0000000000..b037eba2a3
--- /dev/null
+++ b/src/mesa/pipe/p_debug.h
@@ -0,0 +1,79 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * @file
+ * Cross-platform debugging helpers.
+ * 
+ * For now it just has assert and printf replacements, but it might be extended 
+ * with stack trace reports and more advanced logging in the near future. 
+ * 
+ * @author Jose Fonseca <jrfonseca@tungstengraphics.com>
+ */
+
+#ifndef P_DEBUG_H_
+#define P_DEBUG_H_
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+
+#ifdef DBG
+#ifndef DEBUG
+#define DEBUG 1
+#endif
+#else
+#ifndef NDEBUG
+#define NDEBUG 1
+#endif
+#endif
+
+
+void debug_printf(const char *format, ...);
+void debug_assert_fail(const char *expr, const char *file, unsigned line);
+
+/** Assert macro */
+#ifdef DEBUG
+#define debug_assert(expr) ((expr) ? (void)0 : debug_assert_fail(#expr, __FILE__, __LINE__))
+#else
+#define debug_assert(expr) ((void)0)
+#endif
+
+
+#ifdef assert
+#warning Standard C Library assert macro usage detected. 
+#undef assert
+#endif
+#define assert(expr) debug_assert(expr)
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* P_DEBUG_H_ */
diff --git a/src/mesa/pipe/util/p_debug.c b/src/mesa/pipe/util/p_debug.c
new file mode 100644
index 0000000000..faa093e57c
--- /dev/null
+++ b/src/mesa/pipe/util/p_debug.c
@@ -0,0 +1,70 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include <stdarg.h>
+
+#ifdef WIN32
+#include <windows.h>
+#include <winddi.h>
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#endif
+
+#include "pipe/p_debug.h" 
+#include "pipe/p_compiler.h" 
+
+/** Print a printf-style debug message: EngDebugPrint on WIN32, stderr elsewhere. */
+void debug_printf(const char *format, ...)
+{
+   va_list ap;
+   va_start( ap, format );  
+#ifdef WIN32
+   EngDebugPrint("Gallium3D: ", (PCHAR)format, ap);
+#else
+   vfprintf(stderr, format, ap);
+#endif
+   va_end( ap );
+}
+
+
+static INLINE debug_abort(void) 
+{
+#ifdef WIN32
+   EngDebugBreak();
+#else /* !WIN32 */
+   abort(); /* non-WIN32 builds stop the process through the C library */
+#endif /* WIN32 */
+}
+
+/** Report a failed debug_assert() as "file:line: Assertion `expr' failed." and then call debug_abort(). */
+void debug_assert_fail(const char *expr, const char *file, unsigned line) 
+{
+   debug_printf("%s:%u: Assertion `%s' failed.\n", file, line, expr);
+   debug_abort();
+}
diff --git a/src/mesa/sources b/src/mesa/sources
index e31d8cc466..c0087f76e6 100644
--- a/src/mesa/sources
+++ b/src/mesa/sources
@@ -195,6 +195,7 @@ STATECACHE_SOURCES = \
 	pipe/cso_cache/cso_cache.c
 
 PIPEUTIL_SOURCES = \
+	pipe/util/p_debug.c \
 	pipe/util/p_tile.c \
 	pipe/util/p_util.c
 
-- 
cgit v1.2.3


From d432583d69fbd68bef79f1bf2ab0976ea67ed0bc Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Wed, 6 Feb 2008 14:36:50 +0900
Subject: gallium: Update scons instructions. Propagate user environment.

---
 SConstruct | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/SConstruct b/SConstruct
index 01732b0c52..47f9b5389b 100644
--- a/SConstruct
+++ b/SConstruct
@@ -11,7 +11,7 @@ import sys
 #
 # For example, invoke scons as 
 #
-#   scons debug=1 dri=0 x86=1
+#   scons debug=1 dri=0 machine=x86
 #
 # to set configuration variables. Or you can write those options to a file
 # named config.py:
@@ -19,7 +19,7 @@ import sys
 #   # config.py
 #   debug=1
 #   dri=0
-#   x86=1
+#   machine='x86'
 # 
 # Invoke
 #
@@ -35,7 +35,9 @@ opts.Add(BoolOption('dri', 'build dri drivers', False))
 opts.Add(EnumOption('machine', 'use machine-specific assembly code', 'x86',
                      allowed_values=('generic', 'x86', 'x86-64')))
 
-env = Environment(options = opts)
+env = Environment(
+	options = opts, 
+	ENV = os.environ)
 Help(opts.GenerateHelpText(env))
 
 # for debugging
-- 
cgit v1.2.3


From 7ff0b6782a8fc24c4d8df2535fa070b10c416dfa Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Wed, 6 Feb 2008 14:37:24 +0900
Subject: gallium: Add forgotten return type.

---
 src/mesa/pipe/util/p_debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/pipe/util/p_debug.c b/src/mesa/pipe/util/p_debug.c
index faa093e57c..9303c970cc 100644
--- a/src/mesa/pipe/util/p_debug.c
+++ b/src/mesa/pipe/util/p_debug.c
@@ -53,7 +53,7 @@ void debug_printf(const char *format, ...)
 }
 
 
-static INLINE debug_abort(void) 
+static INLINE void debug_abort(void) 
 {
 #ifdef WIN32
    EngDebugBreak();
-- 
cgit v1.2.3


From 560416b263d10dae5d235b4cdaf44699181da74a Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Wed, 6 Feb 2008 14:37:49 +0900
Subject: gallium: Use p_debug.h instead of non-portable stdio.h/assert.h
 functions.

---
 src/mesa/pipe/draw/draw_prim.c                  | 18 +++----
 src/mesa/pipe/draw/draw_vbuf.c                  | 63 ++++++++++++-------------
 src/mesa/pipe/draw/draw_vertex_fetch.c          |  8 ++--
 src/mesa/pipe/draw/draw_vertex_shader.c         |  6 +--
 src/mesa/pipe/draw/draw_vertex_shader_llvm.c    |  4 +-
 src/mesa/pipe/draw/draw_vf.c                    |  4 +-
 src/mesa/pipe/draw/draw_vf_generic.c            |  3 +-
 src/mesa/pipe/draw/draw_vf_sse.c                |  6 +--
 src/mesa/pipe/i915simple/i915_fpc_translate.c   | 14 +++---
 src/mesa/pipe/i915simple/i915_prim_vbuf.c       |  3 +-
 src/mesa/pipe/i915simple/i915_state_derived.c   |  2 +-
 src/mesa/pipe/i915simple/i915_state_emit.c      |  2 +-
 src/mesa/pipe/i915simple/i915_state_immediate.c |  2 +-
 src/mesa/pipe/i915simple/i915_state_sampler.c   |  2 +-
 src/mesa/pipe/i965simple/brw_cc.c               |  2 +-
 src/mesa/pipe/i965simple/brw_curbe.c            |  6 +--
 src/mesa/pipe/i965simple/brw_eu_debug.c         | 10 ++--
 src/mesa/pipe/i965simple/brw_eu_emit.c          |  4 +-
 src/mesa/pipe/i965simple/brw_sf.c               |  8 ++--
 src/mesa/pipe/i965simple/brw_sf_emit.c          | 14 +++---
 src/mesa/pipe/i965simple/brw_state.c            |  2 +-
 src/mesa/pipe/i965simple/brw_state_cache.c      |  6 +--
 src/mesa/pipe/i965simple/brw_state_pool.c       |  4 +-
 src/mesa/pipe/i965simple/brw_urb.c              |  6 +--
 src/mesa/pipe/i965simple/brw_vs_emit.c          |  2 +-
 src/mesa/pipe/i965simple/brw_wm.c               |  2 +-
 src/mesa/pipe/i965simple/brw_wm_glsl.c          |  2 +-
 src/mesa/pipe/i965simple/brw_wm_sampler_state.c |  2 +-
 src/mesa/pipe/p_compiler.h                      |  3 +-
 src/mesa/pipe/p_format.h                        |  3 ++
 src/mesa/pipe/p_util.h                          |  4 --
 src/mesa/pipe/pipebuffer/pb_buffer.h            |  4 +-
 src/mesa/pipe/pipebuffer/pb_buffer_fenced.c     |  4 +-
 src/mesa/pipe/pipebuffer/pb_buffer_fenced.h     |  2 +-
 src/mesa/pipe/pipebuffer/pb_buffer_malloc.c     |  4 +-
 src/mesa/pipe/pipebuffer/pb_bufmgr_fenced.c     |  4 +-
 src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c         | 21 ++++-----
 src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c       |  8 ++--
 src/mesa/pipe/softpipe/sp_prim_setup.c          | 24 +++++-----
 src/mesa/pipe/softpipe/sp_quad_fs.c             | 10 ++--
 src/mesa/pipe/softpipe/sp_tile_cache.c          |  4 +-
 src/mesa/pipe/tgsi/exec/tgsi_exec.c             |  6 +--
 src/mesa/pipe/tgsi/exec/tgsi_sse2.c             | 44 ++++++++---------
 src/mesa/pipe/tgsi/util/tgsi_build.c            |  1 +
 src/mesa/pipe/tgsi/util/tgsi_dump.c             |  3 ++
 src/mesa/pipe/tgsi/util/tgsi_parse.c            |  1 +
 src/mesa/pipe/tgsi/util/tgsi_util.c             |  1 +
 47 files changed, 176 insertions(+), 182 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_prim.c b/src/mesa/pipe/draw/draw_prim.c
index 58400213d7..51e2242719 100644
--- a/src/mesa/pipe/draw/draw_prim.c
+++ b/src/mesa/pipe/draw/draw_prim.c
@@ -30,6 +30,8 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
+#include "pipe/p_debug.h"
+
 #include "draw_private.h"
 #include "draw_context.h"
 
@@ -60,8 +62,8 @@ static void draw_prim_queue_flush( struct draw_context *draw )
    unsigned i;
 
    if (0)
-      fprintf(stdout,"Flushing with %d prims, %d verts\n",
-             draw->pq.queue_nr, draw->vs.queue_nr);
+      debug_printf("Flushing with %d prims, %d verts\n",
+                   draw->pq.queue_nr, draw->vs.queue_nr);
 
    assert (draw->pq.queue_nr != 0);
 
@@ -120,9 +122,9 @@ static void draw_prim_queue_flush( struct draw_context *draw )
 void draw_do_flush( struct draw_context *draw, unsigned flags )
 {
    if (0)
-      fprintf(stdout,"Flushing with %d verts, %d prims\n",
-	      draw->vs.queue_nr,
-	      draw->pq.queue_nr );
+      debug_printf("Flushing with %d verts, %d prims\n",
+                   draw->vs.queue_nr,
+                   draw->pq.queue_nr );
 
 
    if (flags >= DRAW_FLUSH_SHADER_QUEUE) {
@@ -157,11 +159,11 @@ static struct prim_header *get_queued_prim( struct draw_context *draw,
 					    unsigned nr_verts )
 {
    if (!draw_vertex_cache_check_space( draw, nr_verts )) {
-//      fprintf(stderr, "v");
+//      debug_printf("v");
       draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE );
    }
    else if (draw->pq.queue_nr == PRIM_QUEUE_LENGTH) {
-//      fprintf(stderr, "p");
+//      debug_printf("p");
       draw_do_flush( draw, DRAW_FLUSH_PRIM_QUEUE );
    }
 
@@ -283,7 +285,7 @@ draw_prim( struct draw_context *draw,
    boolean unfilled = (draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
 		       draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL);
 
-//   _mesa_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count );
+//   debug_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count );
 
    switch (prim) {
    case PIPE_PRIM_POINTS:
diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c
index ac03001d8f..be96c8fdeb 100644
--- a/src/mesa/pipe/draw/draw_vbuf.c
+++ b/src/mesa/pipe/draw/draw_vbuf.c
@@ -34,8 +34,7 @@
  */
 
 
-#include <assert.h>
-
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 
 #include "draw_vbuf.h"
@@ -125,55 +124,55 @@ dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
       j = vinfo->src_index[i];
       switch (vinfo->emit[i]) {
       case EMIT_OMIT:
-         fprintf(stderr, "EMIT_OMIT:");
+         debug_printf("EMIT_OMIT:");
          break;
       case EMIT_ALL:
          assert(i == 0);
          assert(j == 0);
-         fprintf(stderr, "EMIT_ALL:\t");
+         debug_printf("EMIT_ALL:\t");
          for(k = 0; k < vinfo->size*4; ++k)
-            fprintf(stderr, "%02x ", *data++);
+            debug_printf("%02x ", *data++);
          break;
       case EMIT_1F:
-         fprintf(stderr, "EMIT_1F:\t");
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         debug_printf("EMIT_1F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          break;
       case EMIT_1F_PSIZE:
-         fprintf(stderr, "EMIT_1F_PSIZE:\t");
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         debug_printf("EMIT_1F_PSIZE:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          break;
       case EMIT_2F:
-         fprintf(stderr, "EMIT_2F:\t");
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         debug_printf("EMIT_2F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          break;
       case EMIT_3F:
-         fprintf(stderr, "EMIT_3F:\t");
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         debug_printf("EMIT_3F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          data += sizeof(float);
          break;
       case EMIT_4F:
-         fprintf(stderr, "EMIT_4F:\t");
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         debug_printf("EMIT_4F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          break;
       case EMIT_4UB:
-         fprintf(stderr, "EMIT_4UB:\t");
-         fprintf(stderr, "%u ", *data++);
-         fprintf(stderr, "%u ", *data++);
-         fprintf(stderr, "%u ", *data++);
-         fprintf(stderr, "%u ", *data++);
+         debug_printf("EMIT_4UB:\t");
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
          break;
       default:
          assert(0);
       }
-      fprintf(stderr, "\n");
+      debug_printf("\n");
    }
-   fprintf(stderr, "\n");
+   debug_printf("\n");
 }
 #endif
 
@@ -190,7 +189,7 @@ emit_vertex( struct vbuf_stage *vbuf,
              struct vertex_header *vertex )
 {
 #if 0
-   fprintf(stderr, "emit vertex %d to %p\n", 
+   debug_printf("emit vertex %d to %p\n", 
            vbuf->nr_vertices, vbuf->vertex_ptr);
 #endif
 
@@ -198,7 +197,7 @@ emit_vertex( struct vbuf_stage *vbuf,
       if(vertex->vertex_id < vbuf->nr_vertices)
 	 return;
       else
-	 fprintf(stderr, "Bad vertex id 0x%04x (>= 0x%04x)\n", 
+	 debug_printf("Bad vertex id 0x%04x (>= 0x%04x)\n", 
 	         vertex->vertex_id, vbuf->nr_vertices);
       return;
    }
@@ -269,9 +268,9 @@ emit_vertex( struct vbuf_stage *vbuf,
 	 static float data[256]; 
 	 draw_vf_emit_vertex(vbuf->vf, vertex, data);
 	 if(memcmp((uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size, data, vbuf->vertex_size)) {
-            fprintf(stderr, "With VF:\n");
+            debug_printf("With VF:\n");
             dump_emitted_vertex(vbuf->vinfo, (uint8_t *)data);
-	    fprintf(stderr, "Without VF:\n");
+	    debug_printf("Without VF:\n");
 	    dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size);
 	    assert(0);
 	 }
diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index b23f487e74..e13df04605 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -162,7 +162,7 @@ static fetch_func get_fetch_func( enum pipe_format format )
    {
       char tmp[80];
       pf_sprint_name(tmp, format);
-      _mesa_printf("%s: %s\n", __FUNCTION__, tmp);
+      debug_printf("%s: %s\n", __FUNCTION__, tmp);
    }
 #endif
 
@@ -332,7 +332,7 @@ static void fetch_xyz_rgb( struct draw_context *draw,
 
    assert(count <= 4);
 
-//   _mesa_printf("%s\n", __FUNCTION__);
+//   debug_printf("%s\n", __FUNCTION__);
 
    /* loop over vertex attributes (vertex shader inputs)
     */
@@ -421,7 +421,7 @@ static void generic_vertex_fetch( struct draw_context *draw,
 
    assert(count <= 4);
 
-//   _mesa_printf("%s %d\n", __FUNCTION__, count);
+//   debug_printf("%s %d\n", __FUNCTION__, count);
 
    /* loop over vertex attributes (vertex shader inputs)
     */
@@ -467,7 +467,7 @@ void draw_update_vertex_fetch( struct draw_context *draw )
 {
    unsigned nr_attrs, i;
 
-//   _mesa_printf("%s\n", __FUNCTION__);
+//   debug_printf("%s\n", __FUNCTION__);
    
    /* this may happend during context init */
    if (!draw->vertex_shader)
diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index b851da845f..e6590eafcc 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -166,7 +166,7 @@ run_vertex_program(struct draw_context *draw,
       vOut[j]->data[0][3] = w;
 
 #if DBG_VS
-      printf("output[%d]win: %f %f %f %f\n", j,
+      debug_printf("output[%d]win: %f %f %f %f\n", j,
              vOut[j]->data[0][0],
              vOut[j]->data[0][1],
              vOut[j]->data[0][2],
@@ -181,7 +181,7 @@ run_vertex_program(struct draw_context *draw,
          vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
          vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
 #if DBG_VS
-         printf("output[%d][%d]: %f %f %f %f\n", j, slot,
+         debug_printf("output[%d][%d]: %f %f %f %f\n", j, slot,
                 vOut[j]->data[slot][0],
                 vOut[j]->data[slot][1],
                 vOut[j]->data[slot][2],
@@ -207,7 +207,7 @@ draw_vertex_shader_queue_flush(struct draw_context *draw)
     */
    draw_update_vertex_fetch( draw );
 
-//   fprintf(stderr, " q(%d) ", draw->vs.queue_nr );
+//   debug_printf( " q(%d) ", draw->vs.queue_nr );
 #ifdef MESA_LLVM
    if (draw->vertex_shader->llvm_prog) {
       draw_vertex_shader_queue_flush_llvm(draw);
diff --git a/src/mesa/pipe/draw/draw_vertex_shader_llvm.c b/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
index 4228c4f388..63551c993e 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
@@ -152,7 +152,7 @@ void draw_vertex_shader_queue_flush_llvm(struct draw_context *draw)
       z = vOut->clip[2] = dests[0][2];
       w = vOut->clip[3] = dests[0][3];
 #if DBG
-      printf("output %d: %f %f %f %f\n", 0, x, y, z, w);
+      debug_printf("output %d: %f %f %f %f\n", 0, x, y, z, w);
 #endif
 
       vOut->clipmask = compute_clipmask(vOut->clip, draw->plane, draw->nr_planes);
@@ -179,7 +179,7 @@ void draw_vertex_shader_queue_flush_llvm(struct draw_context *draw)
          vOut->data[slot][3] = dests[slot][3];
 
 #if DBG
-         printf("output %d: %f %f %f %f\n", slot,
+         debug_printf("output %d: %f %f %f %f\n", slot,
                 vOut->data[slot][0],
                 vOut->data[slot][1],
                 vOut->data[slot][2],
diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index 0da8e59ad6..f23d7fcec5 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -168,7 +168,7 @@ draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
       const unsigned format = map[i].format;
       if (format == DRAW_EMIT_PAD) {
 #if (DRAW_VF_DBG)
-	    _mesa_printf("%d: pad %d, offset %d\n", i,  
+	    debug_printf("%d: pad %d, offset %d\n", i,  
 			 map[i].offset, offset);  
 #endif
 
@@ -186,7 +186,7 @@ draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 	    memcpy(vf->attr[j].data, &map[i].data, vf->attr[j].vertattrsize);
 	 
 #if (DRAW_VF_DBG)
-	    _mesa_printf("%d: %s, offset %d\n", i,  
+	    debug_printf("%d: %s, offset %d\n", i,  
 			 draw_vf_format_info[format].name,
 			 vf->attr[j].vertoffset);   
 #endif
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
index 7f5f56ef9c..7a60a9db9c 100644
--- a/src/mesa/pipe/draw/draw_vf_generic.c
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -27,9 +27,8 @@
  */
 
 
-#include <assert.h>
-
 #include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 
 #include "draw_vf.h"
diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
index 1389e6cfb9..1ad2ae756d 100644
--- a/src/mesa/pipe/draw/draw_vf_sse.c
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -453,7 +453,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	    update_src_ptr(p, srcECX, vfESI, a);
 	 }
 	 else {
-	    fprintf(stderr, "Can't emit 1ub %x %x %d\n", 
+	    debug_printf("Can't emit 1ub %x %x %d\n", 
 	            a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
 	    return FALSE;
 	 }
@@ -499,7 +499,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	    j++;		/* NOTE: two attrs consumed */
 	 }
 	 else {
-	    fprintf(stderr, "Can't emit 3ub\n");
+	    debug_printf("Can't emit 3ub\n");
 	 }
 	 return FALSE;	/* add this later */
 	 break;
@@ -532,7 +532,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
       default:
-	 fprintf(stderr, "unknown a[%d].format %d\n", j, a->format);
+	 debug_printf("unknown a[%d].format %d\n", j, a->format);
 	 return FALSE;	/* catch any new opcodes */
       }
       
diff --git a/src/mesa/pipe/i915simple/i915_fpc_translate.c b/src/mesa/pipe/i915simple/i915_fpc_translate.c
index 0185512aeb..868f0c7e04 100644
--- a/src/mesa/pipe/i915simple/i915_fpc_translate.c
+++ b/src/mesa/pipe/i915simple/i915_fpc_translate.c
@@ -100,7 +100,7 @@ negate(int reg, int x, int y, int z, int w)
 static void
 i915_use_passthrough_shader(struct i915_context *i915)
 {
-   fprintf(stderr, "**** Using i915 pass-through fragment shader\n");
+   debug_printf("**** Using i915 pass-through fragment shader\n");
 
    i915->current.program = (uint *) MALLOC(sizeof(passthrough));
    if (i915->current.program) {
@@ -119,12 +119,12 @@ i915_program_error(struct i915_fp_compile *p, const char *msg, ...)
    va_list args;
    char buffer[1024];
 
-   fprintf(stderr, "i915_program_error: ");
+   debug_printf("i915_program_error: ");
    va_start( args, msg );  
    vsprintf( buffer, msg, args );
    va_end( args );
-   fprintf(stderr, buffer);
-   fprintf(stderr, "\n");
+   debug_printf(buffer);
+   debug_printf("\n");
 
    p->error = 1;
 }
@@ -169,7 +169,7 @@ src_vector(struct i915_fp_compile *p,
 
       switch (sem_name) {
       case TGSI_SEMANTIC_POSITION:
-         fprintf(stderr, "SKIP SEM POS\n");
+         debug_printf("SKIP SEM POS\n");
          /*
          assert(p->wpos_tex != -1);
          src = i915_emit_decl(p, REG_TYPE_T, p->wpos_tex, D0_CHANNEL_ALL);
@@ -913,7 +913,7 @@ i915_translate_instructions(struct i915_fp_compile *p,
             ind = parse.FullToken.FullDeclaration.u.DeclarationRange.First;
             sem = parse.FullToken.FullDeclaration.Semantic.SemanticName;
             semi = parse.FullToken.FullDeclaration.Semantic.SemanticIndex;
-            /*printf("FS Input DECL [%u] sem %u\n", ind, sem);*/
+            /*debug_printf("FS Input DECL [%u] sem %u\n", ind, sem);*/
             p->input_semantic_name[ind] = sem;
             p->input_semantic_index[ind] = semi;
          }
@@ -924,7 +924,7 @@ i915_translate_instructions(struct i915_fp_compile *p,
             ind = parse.FullToken.FullDeclaration.u.DeclarationRange.First;
             sem = parse.FullToken.FullDeclaration.Semantic.SemanticName;
             semi = parse.FullToken.FullDeclaration.Semantic.SemanticIndex;
-            /*printf("FS Output DECL [%u] sem %u\n", ind, sem);*/
+            /*debug_printf("FS Output DECL [%u] sem %u\n", ind, sem);*/
             p->output_semantic_name[ind] = sem;
             p->output_semantic_index[ind] = semi;
          }
diff --git a/src/mesa/pipe/i915simple/i915_prim_vbuf.c b/src/mesa/pipe/i915simple/i915_prim_vbuf.c
index 39154b2488..e069773fd4 100644
--- a/src/mesa/pipe/i915simple/i915_prim_vbuf.c
+++ b/src/mesa/pipe/i915simple/i915_prim_vbuf.c
@@ -38,9 +38,8 @@
  */
 
 
-#include <assert.h>
-
 #include "pipe/draw/draw_vbuf.h"
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
diff --git a/src/mesa/pipe/i915simple/i915_state_derived.c b/src/mesa/pipe/i915simple/i915_state_derived.c
index 62741e30f8..653983e4a9 100644
--- a/src/mesa/pipe/i915simple/i915_state_derived.c
+++ b/src/mesa/pipe/i915simple/i915_state_derived.c
@@ -87,7 +87,7 @@ static void calculate_vertex_layout( struct i915_context *i915 )
          }
          break;
       case TGSI_SEMANTIC_FOG:
-         fprintf(stderr, "i915 fogcoord not implemented yet\n");
+         debug_printf("i915 fogcoord not implemented yet\n");
          draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_PERSPECTIVE, src++);
          break;
       default:
diff --git a/src/mesa/pipe/i915simple/i915_state_emit.c b/src/mesa/pipe/i915simple/i915_state_emit.c
index 657f523893..3339287f49 100644
--- a/src/mesa/pipe/i915simple/i915_state_emit.c
+++ b/src/mesa/pipe/i915simple/i915_state_emit.c
@@ -107,7 +107,7 @@ i915_emit_hardware_state(struct i915_context *i915 )
                            ) * 3/2; /* plus 50% margin */
 
 #if 0
-   fprintf (stderr, "i915_emit_hardware_state: %d dwords, %d relocs\n", dwords, relocs);
+   debug_printf("i915_emit_hardware_state: %d dwords, %d relocs\n", dwords, relocs);
 #endif
    
    if(!BEGIN_BATCH(dwords, relocs)) {
diff --git a/src/mesa/pipe/i915simple/i915_state_immediate.c b/src/mesa/pipe/i915simple/i915_state_immediate.c
index 752d25f233..07031fc6c5 100644
--- a/src/mesa/pipe/i915simple/i915_state_immediate.c
+++ b/src/mesa/pipe/i915simple/i915_state_immediate.c
@@ -97,7 +97,7 @@ static void upload_S2S4(struct i915_context *i915)
       LIS2 = i915->current.vertex_info.hwfmt[1];
       LIS4 = i915->current.vertex_info.hwfmt[0];
       /*
-      printf("LIS2: 0x%x  LIS4: 0x%x\n", LIS2, LIS4);
+      debug_printf("LIS2: 0x%x  LIS4: 0x%x\n", LIS2, LIS4);
       */
       assert(LIS4); /* should never be zero? */
    }
diff --git a/src/mesa/pipe/i915simple/i915_state_sampler.c b/src/mesa/pipe/i915simple/i915_state_sampler.c
index 59408b6ba0..0dbbc5241d 100644
--- a/src/mesa/pipe/i915simple/i915_state_sampler.c
+++ b/src/mesa/pipe/i915simple/i915_state_sampler.c
@@ -169,7 +169,7 @@ translate_texture_format(enum pipe_format pipeFormat)
    case PIPE_FORMAT_S8Z24_UNORM:
       return (MAPSURF_32BIT | MT_32BIT_xL824);
    default:
-      fprintf(stderr, "i915: translate_texture_format() bad image format %x\n",
+      debug_printf("i915: translate_texture_format() bad image format %x\n",
               pipeFormat);
       assert(0);
       return 0;
diff --git a/src/mesa/pipe/i965simple/brw_cc.c b/src/mesa/pipe/i965simple/brw_cc.c
index dcee731895..337e4f95f6 100644
--- a/src/mesa/pipe/i965simple/brw_cc.c
+++ b/src/mesa/pipe/i965simple/brw_cc.c
@@ -58,7 +58,7 @@ static int brw_translate_compare_func(int func)
       return BRW_COMPAREFUNCTION_ALWAYS;
    }
 
-   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, func);
+   debug_printf("Unknown value in %s: %x\n", __FUNCTION__, func);
    return BRW_COMPAREFUNCTION_ALWAYS;
 }
 
diff --git a/src/mesa/pipe/i965simple/brw_curbe.c b/src/mesa/pipe/i965simple/brw_curbe.c
index 2733eb4e75..52bbd525c1 100644
--- a/src/mesa/pipe/i965simple/brw_curbe.c
+++ b/src/mesa/pipe/i965simple/brw_curbe.c
@@ -273,10 +273,10 @@ static void upload_constant_buffer(struct brw_context *brw)
 
    if (1) {
       for (i = 0; i < sz; i+=4)
-	 _mesa_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
+	 debug_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
 		      buf[i+0], buf[i+1], buf[i+2], buf[i+3]);
 
-      _mesa_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
+      debug_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
 		   brw->curbe.last_buf, buf,
 		   bufsz, brw->curbe.last_bufsz,
 		   brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1);
@@ -299,7 +299,7 @@ static void upload_constant_buffer(struct brw_context *brw)
 			  bufsz,
 			  1 << 6,
 			  &brw->curbe.gs_offset)) {
-	 _mesa_printf("out of GS memory for curbe\n");
+	 debug_printf("out of GS memory for curbe\n");
 	 assert(0);
 	 return;
       }
diff --git a/src/mesa/pipe/i965simple/brw_eu_debug.c b/src/mesa/pipe/i965simple/brw_eu_debug.c
index be692f6502..4a94ddefa6 100644
--- a/src/mesa/pipe/i965simple/brw_eu_debug.c
+++ b/src/mesa/pipe/i965simple/brw_eu_debug.c
@@ -30,6 +30,8 @@
   */
     
 
+#include "pipe/p_debug.h"
+
 #include "brw_eu.h"
 
 void brw_print_reg( struct brw_reg hwreg )
@@ -52,7 +54,7 @@ void brw_print_reg( struct brw_reg hwreg )
       "f"
    };
 
-   _mesa_printf("%s%s", 
+   debug_printf("%s%s", 
 		hwreg.abs ? "abs/" : "",
 		hwreg.negate ? "-" : "");
      
@@ -63,17 +65,17 @@ void brw_print_reg( struct brw_reg hwreg )
        hwreg.width == BRW_WIDTH_8 &&
        hwreg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
        hwreg.type == BRW_REGISTER_TYPE_F) {
-      _mesa_printf("vec%d", hwreg.nr);
+      debug_printf("vec%d", hwreg.nr);
    }
    else if (hwreg.file == BRW_GENERAL_REGISTER_FILE &&
 	    hwreg.vstride == BRW_VERTICAL_STRIDE_0 &&
 	    hwreg.width == BRW_WIDTH_1 &&
 	    hwreg.hstride == BRW_HORIZONTAL_STRIDE_0 &&
 	    hwreg.type == BRW_REGISTER_TYPE_F) {      
-      _mesa_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4);
+      debug_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4);
    }
    else {
-      _mesa_printf("%s%d.%d<%d;%d,%d>:%s", 
+      debug_printf("%s%d.%d<%d;%d,%d>:%s", 
 		   file[hwreg.file],
 		   hwreg.nr,
 		   hwreg.subnr / type_sz(hwreg.type),
diff --git a/src/mesa/pipe/i965simple/brw_eu_emit.c b/src/mesa/pipe/i965simple/brw_eu_emit.c
index 2423536dd1..400a80b6fb 100644
--- a/src/mesa/pipe/i965simple/brw_eu_emit.c
+++ b/src/mesa/pipe/i965simple/brw_eu_emit.c
@@ -953,7 +953,7 @@ void brw_SAMPLE(struct brw_compile *p,
    boolean need_stall = 0;
 
    if(writemask == 0) {
-/*       _mesa_printf("%s: zero writemask??\n", __FUNCTION__); */
+/*       debug_printf("%s: zero writemask??\n", __FUNCTION__); */
       return;
    }
 
@@ -985,7 +985,7 @@ void brw_SAMPLE(struct brw_compile *p,
 
       if (newmask != writemask) {
 	 need_stall = 1;
-/* 	 _mesa_printf("need stall %x %x\n", newmask , writemask); */
+/* 	 debug_printf("need stall %x %x\n", newmask , writemask); */
       }
       else {
 	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
diff --git a/src/mesa/pipe/i965simple/brw_sf.c b/src/mesa/pipe/i965simple/brw_sf.c
index b89b2e4087..7c83b81c85 100644
--- a/src/mesa/pipe/i965simple/brw_sf.c
+++ b/src/mesa/pipe/i965simple/brw_sf.c
@@ -175,7 +175,7 @@ static void upload_sf_prog( struct brw_context *brw )
 	    //int semantic = parse.FullToken.FullDeclaration.Semantic.SemanticName;
 	    //int semantic_index = parse.FullToken.FullDeclaration.Semantic.SemanticIndex;
 
-	    fprintf(stderr, "fs input %d..%d interp mode %d\n", first, last, interp_mode);
+	    debug_printf("fs input %d..%d interp mode %d\n", first, last, interp_mode);
 	    
 	    switch (interp_mode) {
 	    case TGSI_INTERPOLATE_CONSTANT:
@@ -213,9 +213,9 @@ static void upload_sf_prog( struct brw_context *brw )
    key.linear_mask |= 1;
    key.const_mask <<= 1;
 
-   fprintf(stderr, "key.persp_mask: %x\n", key.persp_mask);
-   fprintf(stderr, "key.linear_mask: %x\n", key.linear_mask);
-   fprintf(stderr, "key.const_mask: %x\n", key.const_mask);
+   debug_printf("key.persp_mask: %x\n", key.persp_mask);
+   debug_printf("key.linear_mask: %x\n", key.linear_mask);
+   debug_printf("key.const_mask: %x\n", key.const_mask);
 
 
 //   key.do_point_sprite = brw->attribs.Point->PointSprite;
diff --git a/src/mesa/pipe/i965simple/brw_sf_emit.c b/src/mesa/pipe/i965simple/brw_sf_emit.c
index 6ff5254ff7..78d6fa5e9e 100644
--- a/src/mesa/pipe/i965simple/brw_sf_emit.c
+++ b/src/mesa/pipe/i965simple/brw_sf_emit.c
@@ -137,8 +137,8 @@ static boolean calculate_masks( struct brw_sf_compile *c,
    unsigned persp_mask = c->key.persp_mask;
    unsigned linear_mask = c->key.linear_mask;
 
-   fprintf(stderr, "persp_mask: %x\n", persp_mask);
-   fprintf(stderr, "linear_mask: %x\n", linear_mask);
+   debug_printf("persp_mask: %x\n", persp_mask);
+   debug_printf("linear_mask: %x\n", linear_mask);
 
    *pc_persp = 0;
    *pc_linear = 0;
@@ -162,9 +162,9 @@ static boolean calculate_masks( struct brw_sf_compile *c,
 	 *pc_linear |= 0xf0;
    }
 
-   fprintf(stderr, "pc: %x\n", *pc);
-   fprintf(stderr, "pc_persp: %x\n", *pc_persp);
-   fprintf(stderr, "pc_linear: %x\n", *pc_linear);
+   debug_printf("pc: %x\n", *pc);
+   debug_printf("pc_persp: %x\n", *pc_persp);
+   debug_printf("pc_linear: %x\n", *pc_linear);
    
 
    return is_last_attr;
@@ -177,7 +177,7 @@ void brw_emit_tri_setup( struct brw_sf_compile *c )
    struct brw_compile *p = &c->func;
    unsigned i;
 
-   fprintf(stderr, "%s START ==============\n", __FUNCTION__);
+   debug_printf("%s START ==============\n", __FUNCTION__);
 
    c->nr_verts = 3;
    alloc_regs(c);
@@ -250,7 +250,7 @@ void brw_emit_tri_setup( struct brw_sf_compile *c )
       }
    }
 
-   fprintf(stderr, "%s DONE ==============\n", __FUNCTION__);
+   debug_printf("%s DONE ==============\n", __FUNCTION__);
 
 }
 
diff --git a/src/mesa/pipe/i965simple/brw_state.c b/src/mesa/pipe/i965simple/brw_state.c
index daf14ff4ff..95dfce88e4 100644
--- a/src/mesa/pipe/i965simple/brw_state.c
+++ b/src/mesa/pipe/i965simple/brw_state.c
@@ -225,7 +225,7 @@ static void brw_bind_vs_state(struct pipe_context *pipe, void *vs)
    brw->attribs.VertexProgram = (struct brw_vertex_program *)vs;
    brw->state.dirty.brw |= BRW_NEW_VS;
 
-   printf("YYYYYYYYYYYYY BINDING VERTEX SHADER\n");
+   debug_printf("YYYYYYYYYYYYY BINDING VERTEX SHADER\n");
 }
 
 static void brw_delete_vs_state(struct pipe_context *pipe, void *shader)
diff --git a/src/mesa/pipe/i965simple/brw_state_cache.c b/src/mesa/pipe/i965simple/brw_state_cache.c
index c5738733f4..b3a5124461 100644
--- a/src/mesa/pipe/i965simple/brw_state_cache.c
+++ b/src/mesa/pipe/i965simple/brw_state_cache.c
@@ -149,7 +149,7 @@ unsigned brw_upload_cache( struct brw_cache *cache,
    if (!brw_pool_alloc(cache->pool, data_size, 1 << 6, &offset)) {
       /* Should not be possible:
        */
-      printf("brw_pool_alloc failed\n");
+      debug_printf("brw_pool_alloc failed\n");
       exit(1);
    }
 
@@ -177,7 +177,7 @@ unsigned brw_upload_cache( struct brw_cache *cache,
    }
 
    if (BRW_DEBUG & DEBUG_STATE)
-      printf("upload %s: %d bytes to pool buffer %p offset %x\n",
+      debug_printf("upload %s: %d bytes to pool buffer %p offset %x\n",
              cache->name, 
 	     data_size,
              (void*)cache->pool->buffer,
@@ -416,7 +416,7 @@ void brw_clear_all_caches( struct brw_context *brw )
    int i;
 
    if (BRW_DEBUG & DEBUG_STATE)
-      fprintf(stderr, "%s\n", __FUNCTION__);
+      debug_printf("%s\n", __FUNCTION__);
 
    for (i = 0; i < BRW_MAX_CACHE; i++)
       clear_cache(&brw->cache[i]);
diff --git a/src/mesa/pipe/i965simple/brw_state_pool.c b/src/mesa/pipe/i965simple/brw_state_pool.c
index 7c67f0ee25..f3174bfe0a 100644
--- a/src/mesa/pipe/i965simple/brw_state_pool.c
+++ b/src/mesa/pipe/i965simple/brw_state_pool.c
@@ -58,7 +58,7 @@ boolean brw_pool_alloc( struct brw_mem_pool *pool,
    size = align(size, 4);
 
    if (pool->offset + fixup + size >= pool->size) {
-      printf("%s failed\n", __FUNCTION__);
+      debug_printf("%s failed\n", __FUNCTION__);
       assert(0);
       exit(0);
    }
@@ -74,7 +74,7 @@ static
 void brw_invalidate_pool( struct brw_mem_pool *pool )
 {
    if (BRW_DEBUG & DEBUG_STATE)
-      printf("\n\n\n %s \n\n\n", __FUNCTION__);
+      debug_printf("\n\n\n %s \n\n\n", __FUNCTION__);
 
    pool->offset = 0;
 
diff --git a/src/mesa/pipe/i965simple/brw_urb.c b/src/mesa/pipe/i965simple/brw_urb.c
index b284526aa6..101a4367b9 100644
--- a/src/mesa/pipe/i965simple/brw_urb.c
+++ b/src/mesa/pipe/i965simple/brw_urb.c
@@ -120,18 +120,18 @@ static void recalculate_urb_fence( struct brw_context *brw )
 	     * entries and the values for minimum nr of entries
 	     * provided above.
 	     */
-	    fprintf(stderr, "couldn't calculate URB layout!\n");
+	    debug_printf("couldn't calculate URB layout!\n");
 	    exit(1);
 	 }
 
 	 if (BRW_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS))
-	    printf("URB CONSTRAINED\n");
+	    debug_printf("URB CONSTRAINED\n");
       }
       else
 	 brw->urb.constrained = 0;
 
       if (BRW_DEBUG & DEBUG_URB)
-	 printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
+	 debug_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
 		      brw->urb.vs_start,
 		      brw->urb.gs_start,
 		      brw->urb.clip_start,
diff --git a/src/mesa/pipe/i965simple/brw_vs_emit.c b/src/mesa/pipe/i965simple/brw_vs_emit.c
index b32c233dd2..98915ba101 100644
--- a/src/mesa/pipe/i965simple/brw_vs_emit.c
+++ b/src/mesa/pipe/i965simple/brw_vs_emit.c
@@ -1228,7 +1228,7 @@ static void process_instruction(struct brw_vs_compile *c,
    case TGSI_OPCODE_ENDSUB:
       break;
    default:
-      printf("Unsupport opcode %d in vertex shader\n", inst->Instruction.Opcode);
+      debug_printf("Unsupport opcode %d in vertex shader\n", inst->Instruction.Opcode);
       break;
    }
 
diff --git a/src/mesa/pipe/i965simple/brw_wm.c b/src/mesa/pipe/i965simple/brw_wm.c
index 0ee0fbed51..539b170744 100644
--- a/src/mesa/pipe/i965simple/brw_wm.c
+++ b/src/mesa/pipe/i965simple/brw_wm.c
@@ -57,7 +57,7 @@ static void do_wm_prog( struct brw_context *brw,
    c->pixel_w = brw_null_reg();
 
 
-   fprintf(stderr, "XXXXXXXX FP\n");
+   debug_printf("XXXXXXXX FP\n");
    
    brw_wm_glsl_emit(c);
 
diff --git a/src/mesa/pipe/i965simple/brw_wm_glsl.c b/src/mesa/pipe/i965simple/brw_wm_glsl.c
index f4b5c13c06..d95645d108 100644
--- a/src/mesa/pipe/i965simple/brw_wm_glsl.c
+++ b/src/mesa/pipe/i965simple/brw_wm_glsl.c
@@ -982,7 +982,7 @@ static void brw_wm_emit_instruction( struct brw_wm_compile *c,
       break;
 
    default:
-      _mesa_printf("unsupported IR in fragment shader %d\n",
+      debug_printf("unsupported IR in fragment shader %d\n",
 		   inst->Instruction.Opcode);
    }
 #if 0
diff --git a/src/mesa/pipe/i965simple/brw_wm_sampler_state.c b/src/mesa/pipe/i965simple/brw_wm_sampler_state.c
index cfb430eb09..de42ffc5b1 100644
--- a/src/mesa/pipe/i965simple/brw_wm_sampler_state.c
+++ b/src/mesa/pipe/i965simple/brw_wm_sampler_state.c
@@ -71,7 +71,7 @@ static int intel_translate_shadow_compare_func(unsigned func)
        return COMPAREFUNC_NEVER;
    }
 
-   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, func);
+   debug_printf("Unknown value in %s: %x\n", __FUNCTION__, func);
    return COMPAREFUNC_NEVER;
 }
 
diff --git a/src/mesa/pipe/p_compiler.h b/src/mesa/pipe/p_compiler.h
index e939d9cd9b..30cd729c56 100644
--- a/src/mesa/pipe/p_compiler.h
+++ b/src/mesa/pipe/p_compiler.h
@@ -28,10 +28,9 @@
 #ifndef P_COMPILER_H
 #define P_COMPILER_H
 
-#include <assert.h>
+
 #include <stdlib.h>
 #include <string.h>
-#include <stdio.h>
 
 
 #if defined(_WIN32) && !defined(__WIN32__)
diff --git a/src/mesa/pipe/p_format.h b/src/mesa/pipe/p_format.h
index 9f60cdbb04..c9ad324315 100644
--- a/src/mesa/pipe/p_format.h
+++ b/src/mesa/pipe/p_format.h
@@ -28,7 +28,10 @@
 #ifndef PIPE_FORMAT_H
 #define PIPE_FORMAT_H
 
+#include <stdio.h> // for sprintf
+
 #include "p_compiler.h"
+#include "p_debug.h"
 
 /**
  * The PIPE_FORMAT is a 32-bit wide bitfield that encodes all the information
diff --git a/src/mesa/pipe/p_util.h b/src/mesa/pipe/p_util.h
index 059528787d..4780ed7818 100644
--- a/src/mesa/pipe/p_util.h
+++ b/src/mesa/pipe/p_util.h
@@ -381,10 +381,6 @@ static INLINE int align(int value, int alignment)
    return (value + alignment - 1) & ~(alignment - 1);
 }
 
-/* Convenient...
- */
-extern void _mesa_printf(const char *str, ...);
-
 
 /* util/p_util.c
  */
diff --git a/src/mesa/pipe/pipebuffer/pb_buffer.h b/src/mesa/pipe/pipebuffer/pb_buffer.h
index 17551b3b50..97beb5f72a 100644
--- a/src/mesa/pipe/pipebuffer/pb_buffer.h
+++ b/src/mesa/pipe/pipebuffer/pb_buffer.h
@@ -44,10 +44,8 @@
 #define PB_BUFFER_H_
 
 
-#include <assert.h>
-#include <stdlib.h>
-
 #include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
 #include "pipe/p_state.h"
 #include "pipe/p_inlines.h"
 
diff --git a/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c b/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c
index 4cf4222db9..f4fc3f6d71 100644
--- a/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c
+++ b/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c
@@ -34,12 +34,10 @@
  */
 
 
-#include <assert.h>
-#include <stdlib.h>
-
 #include "linked_list.h"
 
 #include "p_compiler.h"
+#include "p_debug.h"
 #include "p_winsys.h"
 #include "p_thread.h"
 #include "p_util.h"
diff --git a/src/mesa/pipe/pipebuffer/pb_buffer_fenced.h b/src/mesa/pipe/pipebuffer/pb_buffer_fenced.h
index 09082a5390..c40b9c75e1 100644
--- a/src/mesa/pipe/pipebuffer/pb_buffer_fenced.h
+++ b/src/mesa/pipe/pipebuffer/pb_buffer_fenced.h
@@ -51,7 +51,7 @@
 #define PB_BUFFER_FENCED_H_
 
 
-#include <assert.h>
+#include "pipe/p_debug.h"
 
 
 struct pipe_winsys;
diff --git a/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c b/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c
index 2151f1d691..c1b7759874 100644
--- a/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c
+++ b/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c
@@ -34,9 +34,7 @@
  */
 
 
-#include <assert.h>
-#include <stdlib.h>
-
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pb_buffer.h"
 
diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_fenced.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_fenced.c
index 3b341c64c2..c535d3276c 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_fenced.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_fenced.c
@@ -34,9 +34,7 @@
  */
 
 
-#include <assert.h>
-#include <stdlib.h>
-
+#include "p_debug.h"
 #include "p_util.h"
 
 #include "pb_buffer.h"
diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
index b6af7cdedc..8b1b51c0e2 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
@@ -34,11 +34,10 @@
  */
 
 
-#include <assert.h>
-
 #include "linked_list.h"
 
 #include "p_defines.h"
+#include "p_debug.h"
 #include "p_thread.h"
 #include "p_util.h"
 #include "pb_buffer.h"
@@ -69,28 +68,28 @@ struct mem_block
 static void
 mmDumpMemInfo(const struct mem_block *heap)
 {
-   fprintf(stderr, "Memory heap %p:\n", (void *)heap);
+   debug_printf("Memory heap %p:\n", (void *)heap);
    if (heap == 0) {
-      fprintf(stderr, "  heap == 0\n");
+      debug_printf("  heap == 0\n");
    } else {
       const struct mem_block *p;
 
       for(p = heap->next; p != heap; p = p->next) {
-	 fprintf(stderr, "  Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
+	 debug_printf("  Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
 		 p->free ? 'F':'.',
 		 p->reserved ? 'R':'.');
       }
 
-      fprintf(stderr, "\nFree list:\n");
+      debug_printf("\nFree list:\n");
 
       for(p = heap->next_free; p != heap; p = p->next_free) {
-	 fprintf(stderr, " FREE Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
+	 debug_printf(" FREE Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
 		 p->free ? 'F':'.',
 		 p->reserved ? 'R':'.');
       }
 
    }
-   fprintf(stderr, "End of memory blocks\n");
+   debug_printf("End of memory blocks\n");
 }
 #endif
 
@@ -308,11 +307,11 @@ mmFreeMem(struct mem_block *b)
       return 0;
 
    if (b->free) {
-      fprintf(stderr, "block already free\n");
+      debug_printf("block already free\n");
       return -1;
    }
    if (b->reserved) {
-      fprintf(stderr, "block is reserved\n");
+      debug_printf("block is reserved\n");
       return -1;
    }
 
@@ -479,7 +478,7 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
    
    mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
    if(!mm_buf->block) {
-      fprintf(stderr, "warning: heap full\n");
+      debug_printf("warning: heap full\n");
 #if 0
       mmDumpMemInfo(mm->heap);
 #endif
diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
index f80c7e34c0..bcd4b3e257 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
@@ -35,12 +35,10 @@
  */
 
 
-#include <assert.h>
-#include <stdlib.h>
-
 #include "linked_list.h"
 
 #include "p_compiler.h"
+#include "p_debug.h"
 #include "p_thread.h"
 #include "p_defines.h"
 #include "p_util.h"
@@ -178,7 +176,7 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr,
 
    if (pool->numFree == 0) {
       _glthread_UNLOCK_MUTEX(pool->mutex);
-      fprintf(stderr, "warning: out of fixed size buffer objects\n");
+      debug_printf("warning: out of fixed size buffer objects\n");
       return NULL;
    }
 
@@ -186,7 +184,7 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr,
 
    if (item == &pool->free) {
       _glthread_UNLOCK_MUTEX(pool->mutex);
-      fprintf(stderr, "error: fixed size buffer pool corruption\n");
+      debug_printf("error: fixed size buffer pool corruption\n");
       return NULL;
    }
 
diff --git a/src/mesa/pipe/softpipe/sp_prim_setup.c b/src/mesa/pipe/softpipe/sp_prim_setup.c
index b17801d13d..7478b2336b 100644
--- a/src/mesa/pipe/softpipe/sp_prim_setup.c
+++ b/src/mesa/pipe/softpipe/sp_prim_setup.c
@@ -251,9 +251,9 @@ static void print_vertex(const struct setup_stage *setup,
                          const struct vertex_header *v)
 {
    int i;
-   fprintf(stderr, "Vertex: (%p)\n", v);
+   debug_printf("Vertex: (%p)\n", v);
    for (i = 0; i < setup->quad.nr_attrs; i++) {
-      fprintf(stderr, "  %d: %f %f %f %f\n",  i, 
+      debug_printf("  %d: %f %f %f %f\n",  i, 
               v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]);
    }
 }
@@ -267,7 +267,7 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
    const struct vertex_header *v2 = prim->v[2];
 
 #if DEBUG_VERTS
-   fprintf(stderr, "Triangle:\n");
+   debug_printf("Triangle:\n");
    print_vertex(setup, v0);
    print_vertex(setup, v1);
    print_vertex(setup, v2);
@@ -345,7 +345,7 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
 
       setup->oneoverarea = 1.0f / area;
       /*
-      _mesa_printf("%s one-over-area %f  area %f  det %f\n",
+      debug_printf("%s one-over-area %f  area %f  det %f\n",
                    __FUNCTION__, setup->oneoverarea, area, prim->det );
       */
    }
@@ -419,7 +419,7 @@ static void tri_linear_coeff( struct setup_stage *setup,
                    dady * (setup->vmin->data[0][1] - 0.5f)));
 
    /*
-   _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
+   debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
 		slot, "xyzw"[i], 
 		setup->coef[slot].a0[i],
 		setup->coef[slot].dadx[i],
@@ -453,10 +453,10 @@ static void tri_persp_coeff( struct setup_stage *setup,
    float dady = b * setup->oneoverarea;
       
    /*
-   printf("tri persp %d,%d: %f %f %f\n", vertSlot, i,
-          setup->vmin->data[vertSlot][i],
-          setup->vmid->data[vertSlot][i],
-          setup->vmax->data[vertSlot][i]
+   debug_printf("tri persp %d,%d: %f %f %f\n", vertSlot, i,
+          	setup->vmin->data[vertSlot][i],
+          	setup->vmid->data[vertSlot][i],
+       		setup->vmax->data[vertSlot][i]
           );
    */
    assert(i <= 3);
@@ -619,7 +619,7 @@ static void subtriangle( struct setup_stage *setup,
    finish_y -= sy;
 
    /*
-   _mesa_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);  
+   debug_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);  
    */
 
    for (y = start_y; y < finish_y; y++) {
@@ -671,7 +671,7 @@ static void setup_tri( struct draw_stage *stage,
    struct setup_stage *setup = setup_stage( stage );
 
    /*
-   _mesa_printf("%s\n", __FUNCTION__ );
+   debug_printf("%s\n", __FUNCTION__ );
    */
 
    setup_sort_vertices( setup, prim );
@@ -1124,7 +1124,7 @@ setup_point(struct draw_stage *stage, struct prim_header *prim)
          int ix, iy;
 
          /*
-         printf("(%f, %f) -> X:%d..%d Y:%d..%d\n", x, y, xmin, xmax,ymin,ymax);
+         debug_printf("(%f, %f) -> X:%d..%d Y:%d..%d\n", x, y, xmin, xmax,ymin,ymax);
          */
          for (iy = iymin; iy <= iymax; iy += 2) {
             uint rowMask = 0xf;
diff --git a/src/mesa/pipe/softpipe/sp_quad_fs.c b/src/mesa/pipe/softpipe/sp_quad_fs.c
index 90691c6065..b5d7dfca1c 100644
--- a/src/mesa/pipe/softpipe/sp_quad_fs.c
+++ b/src/mesa/pipe/softpipe/sp_quad_fs.c
@@ -223,13 +223,13 @@ shade_quad_llvm(struct quad_stage *qs,
    inputs[2][0][1] = fy + 1.0f;
    inputs[3][0][1] = fy + 1.0f;
 #if DLLVM
-   printf("MASK = %d\n", quad->mask);
+   debug_printf("MASK = %d\n", quad->mask);
 #endif
    gallivm_prog_inputs_interpolate(llvm, inputs, quad->coef);
 #if DLLVM
    for (int i = 0; i < 4; ++i) {
       for (int j = 0; j < 2; ++j) {
-         printf("IN(%d,%d) [%f %f %f %f]\n", i, j, 
+         debug_printf("IN(%d,%d) [%f %f %f %f]\n", i, j, 
                 inputs[i][j][0], inputs[i][j][1], inputs[i][j][2], inputs[i][j][3]);
       }
    }
@@ -240,7 +240,7 @@ shade_quad_llvm(struct quad_stage *qs,
                                    softpipe->mapped_constants[PIPE_SHADER_FRAGMENT],
                                    qss->samplers);
 #if DLLVM
-   printf("OUT LLVM = 1[%f %f %f %f], 2[%f %f %f %f]\n",
+   debug_printf("OUT LLVM = 1[%f %f %f %f], 2[%f %f %f %f]\n",
           dests[0][0][0], dests[0][0][1], dests[0][0][2], dests[0][0][3], 
           dests[0][1][0], dests[0][1][1], dests[0][1][2], dests[0][1][3]);
 #endif
@@ -260,7 +260,7 @@ shade_quad_llvm(struct quad_stage *qs,
    }
 #if DLLVM
    for (int i = 0; i < QUAD_SIZE; ++i) {
-      printf("QLLVM%d(%d) [%f, %f, %f, %f]\n", i, qss->colorOutSlot,
+      debug_printf("QLLVM%d(%d) [%f, %f, %f, %f]\n", i, qss->colorOutSlot,
              quad->outputs.color[0][i],
              quad->outputs.color[1][i],
              quad->outputs.color[2][i],
@@ -284,7 +284,7 @@ shade_quad_llvm(struct quad_stage *qs,
       }
    }
 #if DLLVM
-   printf("D [%f, %f, %f, %f] mask = %d\n",
+   debug_printf("D [%f, %f, %f, %f] mask = %d\n",
              quad->outputs.depth[0],
              quad->outputs.depth[1],
              quad->outputs.depth[2],
diff --git a/src/mesa/pipe/softpipe/sp_tile_cache.c b/src/mesa/pipe/softpipe/sp_tile_cache.c
index ccf367a5e4..1597361b82 100644
--- a/src/mesa/pipe/softpipe/sp_tile_cache.c
+++ b/src/mesa/pipe/softpipe/sp_tile_cache.c
@@ -341,7 +341,7 @@ sp_tile_cache_flush_clear(struct pipe_context *pipe,
       }
    }
 #if 0
-   printf("num cleared: %u\n", numCleared);
+   debug_printf("num cleared: %u\n", numCleared);
 #endif
 }
 
@@ -384,7 +384,7 @@ sp_flush_tile_cache(struct softpipe_context *softpipe,
 #endif
 
 #if 0
-   printf("flushed tiles in use: %d\n", inuse);
+   debug_printf("flushed tiles in use: %d\n", inuse);
 #endif
 }
 
diff --git a/src/mesa/pipe/tgsi/exec/tgsi_exec.c b/src/mesa/pipe/tgsi/exec/tgsi_exec.c
index dcc39362a9..463ff0d9da 100644
--- a/src/mesa/pipe/tgsi/exec/tgsi_exec.c
+++ b/src/mesa/pipe/tgsi/exec/tgsi_exec.c
@@ -143,7 +143,7 @@ tgsi_exec_prepare( struct tgsi_exec_machine *mach )
 
    k = tgsi_parse_init( &parse, mach->Tokens );
    if (k != TGSI_PARSE_OK) {
-      fprintf(stderr, "Problem parsing!\n");
+      debug_printf("Problem parsing!\n");
       return;
    }
 
@@ -249,7 +249,7 @@ tgsi_exec_machine_init(
 
    k = tgsi_parse_init (&parse, mach->Tokens);
    if (k != TGSI_PARSE_OK) {
-      fprintf( stderr, "Problem parsing!\n" );
+      debug_printf( "Problem parsing!\n" );
       return;
    }
 
@@ -1236,7 +1236,7 @@ exec_tex(struct tgsi_exec_machine *mach,
    uint chan_index;
    float lodBias;
 
-   /*   printf("Sampler %u unit %u\n", sampler, unit); */
+   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
 
    switch (inst->InstructionExtTexture.Texture) {
    case TGSI_TEXTURE_1D:
diff --git a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
index df0c698301..f2180082f1 100755
--- a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
+++ b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
@@ -48,28 +48,28 @@ _print_reg(
    case file_REG32:
       switch( reg.idx ) {
       case reg_AX:
-         printf( "EAX" );
+         debug_printf( "EAX" );
          break;
       case reg_CX:
-         printf( "ECX" );
+         debug_printf( "ECX" );
          break;
       case reg_DX:
-         printf( "EDX" );
+         debug_printf( "EDX" );
          break;
       case reg_BX:
-         printf( "EBX" );
+         debug_printf( "EBX" );
          break;
       case reg_SP:
-         printf( "ESP" );
+         debug_printf( "ESP" );
          break;
       case reg_BP:
-         printf( "EBP" );
+         debug_printf( "EBP" );
          break;
       case reg_SI:
-         printf( "ESI" );
+         debug_printf( "ESI" );
          break;
       case reg_DI:
-         printf( "EDI" );
+         debug_printf( "EDI" );
          break;
       }
       break;
@@ -77,7 +77,7 @@ _print_reg(
       assert( 0 );
       break;
    case file_XMM:
-      printf( "XMM%u", reg.idx );
+      debug_printf( "XMM%u", reg.idx );
       break;
    case file_x87:
       assert( 0 );
@@ -92,35 +92,35 @@ _fill(
    unsigned count = 10 - strlen( op );
 
    while( count-- ) {
-      printf( " " );
+      debug_printf( " " );
    }
 }
 
-#define DUMP_START() printf( "\nsse-dump start ----------------" )
-#define DUMP_END() printf( "\nsse-dump end ----------------\n" )
-#define DUMP( OP ) printf( "\n%s", OP )
+#define DUMP_START() debug_printf( "\nsse-dump start ----------------" )
+#define DUMP_END() debug_printf( "\nsse-dump end ----------------\n" )
+#define DUMP( OP ) debug_printf( "\n%s", OP )
 #define DUMP_I( OP, I ) do {\
-   printf( "\n%s", OP );\
+   debug_printf( "\n%s", OP );\
    _fill( OP );\
-   printf( "%u", I ); } while( 0 )
+   debug_printf( "%u", I ); } while( 0 )
 #define DUMP_R( OP, R0 ) do {\
-   printf( "\n%s", OP );\
+   debug_printf( "\n%s", OP );\
    _fill( OP );\
    _print_reg( R0 ); } while( 0 )
 #define DUMP_RR( OP, R0, R1 ) do {\
-   printf( "\n%s", OP );\
+   debug_printf( "\n%s", OP );\
    _fill( OP );\
    _print_reg( R0 );\
-   printf( ", " );\
+   debug_printf( ", " );\
    _print_reg( R1 ); } while( 0 )
 #define DUMP_RRI( OP, R0, R1, I ) do {\
-   printf( "\n%s", OP );\
+   debug_printf( "\n%s", OP );\
    _fill( OP );\
    _print_reg( R0 );\
-   printf( ", " );\
+   debug_printf( ", " );\
    _print_reg( R1 );\
-   printf( ", " );\
-   printf( "%u", I ); } while( 0 )
+   debug_printf( ", " );\
+   debug_printf( "%u", I ); } while( 0 )
 
 #else
 
diff --git a/src/mesa/pipe/tgsi/util/tgsi_build.c b/src/mesa/pipe/tgsi/util/tgsi_build.c
index 67f7d2c2c2..a00ff1c2a5 100644
--- a/src/mesa/pipe/tgsi/util/tgsi_build.c
+++ b/src/mesa/pipe/tgsi/util/tgsi_build.c
@@ -1,3 +1,4 @@
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_build.h"
diff --git a/src/mesa/pipe/tgsi/util/tgsi_dump.c b/src/mesa/pipe/tgsi/util/tgsi_dump.c
index cdbc0dbc9c..b5c54847e0 100644
--- a/src/mesa/pipe/tgsi/util/tgsi_dump.c
+++ b/src/mesa/pipe/tgsi/util/tgsi_dump.c
@@ -25,6 +25,9 @@
  * 
  **************************************************************************/
 
+#include <stdio.h> 
+
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_dump.h"
diff --git a/src/mesa/pipe/tgsi/util/tgsi_parse.c b/src/mesa/pipe/tgsi/util/tgsi_parse.c
index f0f8d44ac2..bf6b89ce56 100644
--- a/src/mesa/pipe/tgsi/util/tgsi_parse.c
+++ b/src/mesa/pipe/tgsi/util/tgsi_parse.c
@@ -25,6 +25,7 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_parse.h"
diff --git a/src/mesa/pipe/tgsi/util/tgsi_util.c b/src/mesa/pipe/tgsi/util/tgsi_util.c
index 1e76b0f133..4cdd89182a 100644
--- a/src/mesa/pipe/tgsi/util/tgsi_util.c
+++ b/src/mesa/pipe/tgsi/util/tgsi_util.c
@@ -1,3 +1,4 @@
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_parse.h"
-- 
cgit v1.2.3


From bf3101afdc5d315f1fb42eb74ec1b8b0d4101aae Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Thu, 7 Feb 2008 01:07:49 +0900
Subject: gallium: Bring latest fixes.

---
 src/mesa/pipe/p_debug.h      |  9 ++++++++-
 src/mesa/pipe/util/p_debug.c | 16 +++++++++++-----
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/mesa/pipe/p_debug.h b/src/mesa/pipe/p_debug.h
index b037eba2a3..2a11627b36 100644
--- a/src/mesa/pipe/p_debug.h
+++ b/src/mesa/pipe/p_debug.h
@@ -38,6 +38,10 @@
 #ifndef P_DEBUG_H_
 #define P_DEBUG_H_
 
+
+#include <stdarg.h>
+
+
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -55,8 +59,12 @@ extern "C" {
 
 
 void debug_printf(const char *format, ...);
+
+void debug_vprintf(const char *format, va_list ap);
+
 void debug_assert_fail(const char *expr, const char *file, unsigned line);
 
+
 /** Assert macro */
 #ifdef DEBUG
 #define debug_assert(expr) ((expr) ? (void)0 : debug_assert_fail(#expr, __FILE__, __LINE__))
@@ -66,7 +74,6 @@ void debug_assert_fail(const char *expr, const char *file, unsigned line);
 
 
 #ifdef assert
-#warning Standard C Library assert macro usage detected. 
 #undef assert
 #endif
 #define assert(expr) debug_assert(expr)
diff --git a/src/mesa/pipe/util/p_debug.c b/src/mesa/pipe/util/p_debug.c
index 9303c970cc..b9607a6ba7 100644
--- a/src/mesa/pipe/util/p_debug.c
+++ b/src/mesa/pipe/util/p_debug.c
@@ -40,16 +40,22 @@
 #include "pipe/p_compiler.h" 
 
 
-void debug_printf(const char *format, ...)
+void debug_vprintf(const char *format, va_list ap)
 {
-   va_list ap;
-   va_start( ap, format );  
 #ifdef WIN32
    EngDebugPrint("Gallium3D: ", (PCHAR)format, ap);
 #else
    vfprintf(stderr, format, ap);
 #endif
-   va_end( ap );
+}
+
+
+void debug_printf(const char *format, ...)
+{
+   va_list ap;
+   va_start(ap, format);
+   debug_vprintf(format, ap);
+   va_end(ap);
 }
 
 
@@ -65,6 +71,6 @@ static INLINE void debug_abort(void)
 
 void debug_assert_fail(const char *expr, const char *file, unsigned line) 
 {
-   debug_printf("%s:%i: Assertion `%s' failed.");
+   debug_printf("%s:%i: Assertion `%s' failed.\n", file, line, expr);
    debug_abort();
 }
-- 
cgit v1.2.3


From 61e59234d072ce78770047f9f08e0bb92e2fb1c5 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 16:32:15 -0700
Subject: gallium: add bitmap/drawpixels texcoord bias support

The state tracker will call pipe->get_paramf(PIPE_CAP_BITMAP_TEXCOORD_BIAS)
to get a bias factor for adjusting the texcoords used in bitmap/drawpixels.
This allows us to compensate for small differences in rasterization from
one device to another.
---
 src/mesa/pipe/p_defines.h                 | 2 +-
 src/mesa/state_tracker/st_cb_drawpixels.c | 7 +++++--
 src/mesa/state_tracker/st_context.h       | 2 ++
 src/mesa/state_tracker/st_extensions.c    | 3 +++
 4 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/mesa/pipe/p_defines.h b/src/mesa/pipe/p_defines.h
index 85adf2d61d..0bf53ecb79 100644
--- a/src/mesa/pipe/p_defines.h
+++ b/src/mesa/pipe/p_defines.h
@@ -265,6 +265,6 @@ enum pipe_texture_target {
 #define PIPE_CAP_MAX_POINT_WIDTH_AA      17
 #define PIPE_CAP_MAX_TEXTURE_ANISOTROPY  18
 #define PIPE_CAP_MAX_TEXTURE_LOD_BIAS    19
-
+#define PIPE_CAP_BITMAP_TEXCOORD_BIAS    20
 
 #endif
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 6b44cba2e4..34d420fcff 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -581,10 +581,13 @@ draw_quad_colored(GLcontext *ctx, GLfloat x0, GLfloat y0, GLfloat z,
                   GLfloat x1, GLfloat y1, const GLfloat *color,
                   GLboolean invertTex)
 {
+   GLfloat bias = ctx->st->bitmap_texcoord_bias;
    GLfloat verts[4][3][4]; /* four verts, three attribs, XYZW */
    GLuint i;
-   GLfloat sLeft = 0.0, sRight = 1.0;
-   GLfloat tTop = invertTex, tBot = 1.0 - tTop;
+   GLfloat xBias = bias / (x1-x0);
+   GLfloat yBias = bias / (y1-y0);
+   GLfloat sLeft = 0.0 + xBias, sRight = 1.0 + xBias;
+   GLfloat tTop = invertTex - yBias, tBot = 1.0 - tTop - yBias;
 
    /* upper-left */
    verts[0][0][0] = x0;    /* attr[0].x */
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 2b6f8743f3..a756055898 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -143,6 +143,8 @@ struct st_context
 
    GLfloat polygon_offset_scale; /* ?? */
 
+   GLfloat bitmap_texcoord_bias;
+
    /** Mapping from VERT_RESULT_x to post-transformed vertex slot */
    const GLuint *vertex_result_to_slot;
 
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 0157bdd6b3..97d28d77c4 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -106,6 +106,9 @@ void st_init_limits(struct st_context *st)
 
    c->MaxTextureLodBias
       = pipe->get_paramf(pipe, PIPE_CAP_MAX_TEXTURE_LOD_BIAS);
+
+   st->bitmap_texcoord_bias
+      = pipe->get_paramf(pipe, PIPE_CAP_BITMAP_TEXCOORD_BIAS);
 }
 
 
-- 
cgit v1.2.3


From 35caa43e68a20b39574a740e420bbca7be3b7649 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 17:46:31 -0700
Subject: clean-up

---
 src/mesa/state_tracker/st_atom_texture.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_texture.c b/src/mesa/state_tracker/st_atom_texture.c
index fb21d29c40..b3859f18cb 100644
--- a/src/mesa/state_tracker/st_atom_texture.c
+++ b/src/mesa/state_tracker/st_atom_texture.c
@@ -36,7 +36,6 @@
 #include "st_atom.h"
 #include "st_cb_texture.h"
 #include "pipe/p_context.h"
-#include "pipe/p_defines.h"
 
 
 /**
@@ -46,18 +45,14 @@
 static void 
 update_textures(struct st_context *st)
 {
-   GLuint s;
-
    /* ST_NEW_FRAGMENT_PROGRAM
     */
    struct gl_fragment_program *fprog = st->ctx->FragmentProgram._Current;
+   GLuint unit;
 
-   for (s = 0; s < st->ctx->Const.MaxTextureCoordUnits; s++) {
-      GLuint su = fprog->Base.SamplerUnits[s];
-      
-      struct gl_texture_object *texObj
-         = st->ctx->Texture.Unit[su]._Current;
-
+   for (unit = 0; unit < st->ctx->Const.MaxTextureCoordUnits; unit++) {
+      const GLuint su = fprog->Base.SamplerUnits[unit];
+      struct gl_texture_object *texObj = st->ctx->Texture.Unit[su]._Current;
       struct pipe_texture *pt;
 
       if (texObj) {
@@ -75,9 +70,9 @@ update_textures(struct st_context *st)
        * this table before being deleted, otherwise the pointer
        * comparison below could fail.
        */
-      if (st->state.sampler_texture[s] != pt) {
-	 st->state.sampler_texture[s] = pt;
-	 st->pipe->set_sampler_texture(st->pipe, s, pt);
+      if (st->state.sampler_texture[unit] != pt) {
+	 st->state.sampler_texture[unit] = pt;
+	 st->pipe->set_sampler_texture(st->pipe, unit, pt);
       }
    }
 }
-- 
cgit v1.2.3


From e232b3ffc18d5b89f9472db6b96499d8a8907b63 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 17:55:16 -0700
Subject: comments, clean-ups, consts

---
 src/mesa/state_tracker/st_texture.c | 30 ++++++++++++++++++------------
 src/mesa/state_tracker/st_texture.h |  8 ++++----
 2 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
index 15cc458be8..741f36c2a7 100644
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -59,6 +59,10 @@ target_to_target(GLenum target)
 }
 #endif
 
+
+/**
+ * Allocate a new pipe_texture object
+ */
 struct pipe_texture *
 st_texture_create(struct st_context *st,
                   enum pipe_texture_target target,
@@ -100,17 +104,16 @@ st_texture_create(struct st_context *st,
 }
 
 
-
-
-/* Can the image be pulled into a unified mipmap texture.  This mirrors
- * the completeness test in a lot of ways.
+/**
+ * Check if a texture image can be pulled into a unified mipmap texture.
+ * This mirrors the completeness test in a lot of ways.
  *
  * Not sure whether I want to pass gl_texture_image here.
  */
 GLboolean
-st_texture_match_image(struct pipe_texture *pt,
-                          struct gl_texture_image *image,
-                          GLuint face, GLuint level)
+st_texture_match_image(const struct pipe_texture *pt,
+                       const struct gl_texture_image *image,
+                       GLuint face, GLuint level)
 {
    /* Images with borders are never pulled into mipmap textures. 
     */
@@ -189,6 +192,7 @@ st_texture_image_map(struct st_context *st, struct st_texture_image *stImage,
    return pipe_surface_map(stImage->surface);
 }
 
+
 void
 st_texture_image_unmap(struct st_texture_image *stImage)
 {
@@ -201,7 +205,8 @@ st_texture_image_unmap(struct st_texture_image *stImage)
 
 
-/* Upload data to a rectangular sub-region.  Lots of choices how to do this:
+/**
+ * Upload data to a rectangular sub-region.  Lots of choices how to do this:
  *
  * - memcpy by span to current destination
  * - upload data as new buffer and blit
@@ -261,13 +266,14 @@ st_texture_image_data(struct pipe_context *pipe,
    }
 }
 
+
 /* Copy mipmap image between textures
  */
 void
 st_texture_image_copy(struct pipe_context *pipe,
-                         struct pipe_texture *dst,
-                         GLuint face, GLuint level,
-                         struct pipe_texture *src)
+                      struct pipe_texture *dst,
+                      GLuint face, GLuint level,
+                      struct pipe_texture *src)
 {
    GLuint width = src->width[level];
    GLuint height = src->height[level];
@@ -278,6 +284,7 @@ st_texture_image_copy(struct pipe_context *pipe,
 
    if (dst->compressed)
       height /= 4;
+
    for (i = 0; i < depth; i++) {
       dst_surface = pipe->get_tex_surface(pipe, dst, face, level, i);
       src_surface = pipe->get_tex_surface(pipe, src, face, level, i);
@@ -292,5 +299,4 @@ st_texture_image_copy(struct pipe_context *pipe,
       pipe_surface_reference(&dst_surface, NULL);
       pipe_surface_reference(&src_surface, NULL);
    }
-
 }
diff --git a/src/mesa/state_tracker/st_texture.h b/src/mesa/state_tracker/st_texture.h
index d8b1bcad9d..0b87a494c3 100644
--- a/src/mesa/state_tracker/st_texture.h
+++ b/src/mesa/state_tracker/st_texture.h
@@ -47,11 +47,11 @@ st_texture_create(struct st_context *st,
                   GLuint compress_byte);
 
 
-/* Check if an image fits an existing texture
+/* Check if an image fits into an existing texture object.
  */
 extern GLboolean
-st_texture_match_image(struct pipe_texture *pt,
-                       struct gl_texture_image *image,
+st_texture_match_image(const struct pipe_texture *pt,
+                       const struct gl_texture_image *image,
                        GLuint face, GLuint level);
 
 /* Return a pointer to an image within a texture.  Return image stride as
@@ -73,7 +73,7 @@ extern const GLuint *
 st_texture_depth_offsets(struct pipe_texture *pt, GLuint level);
 
 
-/* Return the linear offset of an image relative to the start of its region:
+/* Return the linear offset of an image relative to the start of its region.
  */
 extern GLuint
 st_texture_image_offset(const struct pipe_texture *pt,
-- 
cgit v1.2.3


From 1ad145a950827e67eee4c5892cea8be9a9cf0dc7 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 18:01:18 -0700
Subject: added comment

---
 src/mesa/state_tracker/st_atom_texture.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/state_tracker/st_atom_texture.c b/src/mesa/state_tracker/st_atom_texture.c
index b3859f18cb..2a836d630b 100644
--- a/src/mesa/state_tracker/st_atom_texture.c
+++ b/src/mesa/state_tracker/st_atom_texture.c
@@ -59,6 +59,7 @@ update_textures(struct st_context *st)
          GLboolean flush, retval;
 
          retval = st_finalize_texture(st->ctx, st->pipe, texObj, &flush);
+         /* XXX retval is GL_FALSE if the texture has a border, GL_TRUE otherwise */
 
          pt = st_get_texobj_texture(texObj);
       }
-- 
cgit v1.2.3


From 20aa31a2447a4bda378bf3d2d78c078d748b8271 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 18:13:28 -0700
Subject: clean-ups

---
 src/mesa/state_tracker/st_cb_texture.c | 67 +++++++++++++++-------------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index ba0950e295..eee94baa20 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -76,13 +76,13 @@ struct st_texture_object
 
 
-
 static INLINE struct st_texture_object *
 st_texture_object(struct gl_texture_object *obj)
 {
    return (struct st_texture_object *) obj;
 }
 
+
 static INLINE struct st_texture_image *
 st_texture_image(struct gl_texture_image *img)
 {
@@ -122,32 +122,28 @@ gl_target_to_pipe(GLenum target)
 }
 
 
+/**
+ * Return nominal bytes per texel for a compressed format, 0 for non-compressed
+ * format.
+ */
 static int
 compressed_num_bytes(GLuint mesaFormat)
 {
-   int bytes = 0;
    switch(mesaFormat) {
-     
    case MESA_FORMAT_RGB_FXT1:
    case MESA_FORMAT_RGBA_FXT1:
    case MESA_FORMAT_RGB_DXT1:
    case MESA_FORMAT_RGBA_DXT1:
-     bytes = 2;
-     break;
-     
+      return 2;
    case MESA_FORMAT_RGBA_DXT3:
    case MESA_FORMAT_RGBA_DXT5:
-     bytes = 4;
+      return 4;
    default:
-     break;
+      return 0;
    }
-   
-   return bytes;
 }
 
 
-
-
 static GLboolean
 st_IsTextureResident(GLcontext * ctx, struct gl_texture_object *texObj)
 {
@@ -164,7 +160,6 @@ st_IsTextureResident(GLcontext * ctx, struct gl_texture_object *texObj)
 }
 
 
-
 static struct gl_texture_image *
 st_NewTextureImage(GLcontext * ctx)
 {
@@ -216,8 +211,6 @@ st_FreeTextureImageData(GLcontext * ctx, struct gl_texture_image *texImage)
 }
 
 
-
-
 /* ================================================================
  * From linux kernel i386 header files, copes with odd sizes better
  * than COPY_DWORDS would:
@@ -302,7 +295,7 @@ logbase2(int n)
 static void
 guess_and_alloc_texture(struct st_context *st,
 			struct st_texture_object *stObj,
-			struct st_texture_image *stImage)
+			const struct st_texture_image *stImage)
 {
    GLuint firstLevel;
    GLuint lastLevel;
@@ -487,21 +480,18 @@ try_pbo_upload(GLcontext *ctx,
 
 
-
-
-
-
 static void
 st_TexImage(GLcontext * ctx,
-              GLint dims,
-              GLenum target, GLint level,
-              GLint internalFormat,
-              GLint width, GLint height, GLint depth,
-              GLint border,
-              GLenum format, GLenum type, const void *pixels,
-              const struct gl_pixelstore_attrib *unpack,
-              struct gl_texture_object *texObj,
-              struct gl_texture_image *texImage, GLsizei imageSize, int compressed)
+            GLint dims,
+            GLenum target, GLint level,
+            GLint internalFormat,
+            GLint width, GLint height, GLint depth,
+            GLint border,
+            GLenum format, GLenum type, const void *pixels,
+            const struct gl_pixelstore_attrib *unpack,
+            struct gl_texture_object *texObj,
+            struct gl_texture_image *texImage,
+            GLsizei imageSize, int compressed)
 {
    struct st_texture_object *stObj = st_texture_object(texObj);
    struct st_texture_image *stImage = st_texture_image(texImage);
@@ -524,7 +514,7 @@ st_TexImage(GLcontext * ctx,
 
    /* choose the texture format */
    texImage->TexFormat = st_ChooseTextureFormat(ctx, internalFormat,
-                                                  format, type);
+                                                format, type);
 
    _mesa_set_fetch_functions(texImage, dims);
 
@@ -536,7 +526,8 @@ st_TexImage(GLcontext * ctx,
 	 ctx->Driver.CompressedTextureSize(ctx, texImage->Width,
 					   texImage->Height, texImage->Depth,
 					   texImage->TexFormat->MesaFormat);
-   } else {
+   }
+   else {
       texelBytes = texImage->TexFormat->TexelBytes;
       
       /* Minimum pitch of 32 bytes */
@@ -669,7 +660,7 @@ st_TexImage(GLcontext * ctx,
     * conversion and copy:
     */
    if (compressed) {
-     memcpy(texImage->Data, pixels, imageSize);
+      memcpy(texImage->Data, pixels, imageSize);
    }
    else {
       GLuint srcImageStride = _mesa_image_image_stride(unpack, width, height,
@@ -1401,7 +1392,10 @@ copy_image_data_to_texture(struct st_context *st,
 }
 
 
-/*  
+/**
+ * Called during state validation.  When this function is finished,
+ * the texture object should be ready for rendering.
+ * \return GL_FALSE if a texture border is present, GL_TRUE otherwise
  */
 GLboolean
 st_finalize_texture(GLcontext *ctx,
@@ -1410,11 +1404,10 @@ st_finalize_texture(GLcontext *ctx,
 		    GLboolean *needFlush)
 {
    struct st_texture_object *stObj = st_texture_object(tObj);
+   const GLuint nr_faces = (stObj->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
    int comp_byte = 0;
    int cpp;
-
    GLuint face, i;
-   GLuint nr_faces = 0;
    struct st_texture_image *firstImage;
 
    *needFlush = GL_FALSE;
@@ -1426,8 +1419,7 @@ st_finalize_texture(GLcontext *ctx,
    /* What levels must the texture include at a minimum?
     */
    calculate_first_last_level(stObj);
-   firstImage =
-      st_texture_image(stObj->base.Image[0][stObj->firstLevel]);
+   firstImage = st_texture_image(stObj->base.Image[0][stObj->firstLevel]);
 
    /* Fallback case:
     */
@@ -1503,7 +1495,6 @@ st_finalize_texture(GLcontext *ctx,
 
    /* Pull in any images not in the object's texture:
     */
-   nr_faces = (stObj->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
    for (face = 0; face < nr_faces; face++) {
       for (i = stObj->firstLevel; i <= stObj->lastLevel; i++) {
          struct st_texture_image *stImage =
-- 
cgit v1.2.3


From 08ffa00d15c4871a22f0670a8aacd7a3995a6769 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 18:15:03 -0700
Subject: Added ctx->Driver.GenerateMipmap() driver hook

---
 src/mesa/drivers/common/driverfuncs.c |  2 ++
 src/mesa/main/dd.h                    |  7 +++++
 src/mesa/main/fbobject.c              |  2 +-
 src/mesa/main/texstore.c              | 48 +++++++++++++++++------------------
 4 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index 33caf7dae1..b5b383b4e4 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -28,6 +28,7 @@
 #include "buffers.h"
 #include "context.h"
 #include "framebuffer.h"
+#include "mipmap.h"
 #include "program.h"
 #include "prog_execute.h"
 #include "queryobj.h"
@@ -99,6 +100,7 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->CopyTexSubImage1D = _swrast_copy_texsubimage1d;
    driver->CopyTexSubImage2D = _swrast_copy_texsubimage2d;
    driver->CopyTexSubImage3D = _swrast_copy_texsubimage3d;
+   driver->GenerateMipmap = _mesa_generate_mipmap;
    driver->TestProxyTexImage = _mesa_test_proxy_teximage;
    driver->CompressedTexImage1D = _mesa_store_compressed_teximage1d;
    driver->CompressedTexImage2D = _mesa_store_compressed_teximage2d;
diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index 3bec3bd433..c2ef67ba6d 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -332,6 +332,13 @@ struct dd_function_table {
                               GLint x, GLint y,
                               GLsizei width, GLsizei height );
 
+   /**
+    * Called by glGenerateMipmap() or when GL_GENERATE_MIPMAP_SGIS is enabled.
+    */
+   void (*GenerateMipmap)(GLcontext *ctx,  GLenum target,
+                          const struct gl_texture_unit *texUnit,
+                          struct gl_texture_object *texObj);
+
    /**
     * Called by glTexImage[123]D when user specifies a proxy texture
     * target.  
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index 963e35d678..13cbd35424 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -1560,7 +1560,7 @@ _mesa_GenerateMipmapEXT(GLenum target)
 
    /* XXX this might not handle cube maps correctly */
    _mesa_lock_texture(ctx, texObj);
-   _mesa_generate_mipmap(ctx, target, texUnit, texObj);
+   ctx->Driver.GenerateMipmap(ctx, target, texUnit, texObj);
    _mesa_unlock_texture(ctx, texObj);
 }
 
diff --git a/src/mesa/main/texstore.c b/src/mesa/main/texstore.c
index 30be65525e..26ca4f1bd5 100644
--- a/src/mesa/main/texstore.c
+++ b/src/mesa/main/texstore.c
@@ -2917,9 +2917,9 @@ _mesa_store_teximage1d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target,
+                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
+                                 texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3003,9 +3003,9 @@ _mesa_store_teximage2d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target,
+                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
+                                 texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3079,9 +3079,9 @@ _mesa_store_teximage3d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target,
+                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
+                                 texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3127,9 +3127,9 @@ _mesa_store_texsubimage1d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target,
+                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
+                                 texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3182,9 +3182,9 @@ _mesa_store_texsubimage2d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target,
+                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
+                                 texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3237,9 +3237,9 @@ _mesa_store_texsubimage3d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target,
+                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
+                                 texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3313,9 +3313,9 @@ _mesa_store_compressed_teximage2d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target,
+                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
+                                 texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, &ctx->Unpack);
@@ -3425,9 +3425,9 @@ _mesa_store_compressed_texsubimage2d(GLcontext *ctx, GLenum target,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target,
+                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
+                                 texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, &ctx->Unpack);
-- 
cgit v1.2.3


From 30c9e12f8d20a950b39577ea2b4a2e7a850dfe10 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 18:15:39 -0700
Subject: plug in ctx->Driver.GenerateMipmap function

---
 src/mesa/state_tracker/st_cb_texture.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index eee94baa20..15c5359360 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -30,6 +30,7 @@
 #include "main/enums.h"
 #include "main/image.h"
 #include "main/macros.h"
+#include "main/mipmap.h"
 #include "main/texcompress.h"
 #include "main/texformat.h"
 #include "main/teximage.h"
@@ -1531,6 +1532,7 @@ st_init_texture_functions(struct dd_function_table *functions)
    functions->CopyTexSubImage1D = st_CopyTexSubImage1D;
    functions->CopyTexSubImage2D = st_CopyTexSubImage2D;
    functions->CopyTexSubImage3D = st_CopyTexSubImage3D;
+   functions->GenerateMipmap = _mesa_generate_mipmap;
 
    functions->GetTexImage = st_GetTexImage;
 
-- 
cgit v1.2.3


From 20f16a6ae44f55c2efa03a2c9deb07b66fe1b0e0 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 08:44:19 -0700
Subject: clean-ups in guess_and_alloc_texture()

---
 src/mesa/state_tracker/st_cb_texture.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 15c5359360..8db4a804ab 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -284,7 +284,12 @@ logbase2(int n)
 }
 
 
-/* Otherwise, store it in memory if (Border != 0) or (any dimension ==
+/**
+ * Allocate a pipe_texture object for the given st_texture_object using
+ * the given st_texture_image to guess the mipmap size/levels.
+ *
+ * [comments...]
+ * Otherwise, store it in memory if (Border != 0) or (any dimension ==
  * 1).
  *    
  * Otherwise, if max_level >= level >= min_level, create texture with
@@ -303,11 +308,12 @@ guess_and_alloc_texture(struct st_context *st,
    GLuint width = stImage->base.Width;
    GLuint height = stImage->base.Height;
    GLuint depth = stImage->base.Depth;
-   GLuint l2width, l2height, l2depth;
    GLuint i, comp_byte = 0;
 
    DBG("%s\n", __FUNCTION__);
 
+   assert(!stObj->pt);
+
    if (stImage->base.Border)
       return;
 
@@ -349,15 +355,15 @@ guess_and_alloc_texture(struct st_context *st,
       lastLevel = firstLevel;
    }
    else {
-      l2width = logbase2(width);
-      l2height = logbase2(height);
-      l2depth = logbase2(depth);
+      GLuint l2width = logbase2(width);
+      GLuint l2height = logbase2(height);
+      GLuint l2depth = logbase2(depth);
       lastLevel = firstLevel + MAX2(MAX2(l2width, l2height), l2depth);
    }
 
-   assert(!stObj->pt);
    if (stImage->base.IsCompressed)
       comp_byte = compressed_num_bytes(stImage->base.TexFormat->MesaFormat);
+
    stObj->pt = st_texture_create(st,
                                  gl_target_to_pipe(stObj->base.Target),
                                  st_mesa_format_to_pipe_format(stImage->base.TexFormat->MesaFormat),
-- 
cgit v1.2.3


From 64ca0678eeeb39831fcfb309ac48561b1981d360 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 09:24:30 -0700
Subject: gallium: change pipe->texture_create() to operate like the CSO
 functions

Now, pass in a template object and return a new object.
---
 src/mesa/pipe/cell/ppu/cell_texture.c     | 31 ++++++++++++++---------------
 src/mesa/pipe/cell/ppu/cell_texture.h     |  5 +++--
 src/mesa/pipe/i915simple/i915_texture.c   | 17 ++++++++--------
 src/mesa/pipe/i915simple/i915_texture.h   |  5 +++--
 src/mesa/pipe/i965simple/brw_tex_layout.c | 15 +++++++-------
 src/mesa/pipe/i965simple/brw_tex_layout.h |  4 ++--
 src/mesa/pipe/p_context.h                 |  4 ++--
 src/mesa/pipe/softpipe/sp_texture.c       | 33 +++++++++++++++----------------
 src/mesa/pipe/softpipe/sp_texture.h       |  5 +++--
 src/mesa/state_tracker/st_texture.c       | 31 ++++++++++++-----------------
 10 files changed, 73 insertions(+), 77 deletions(-)

diff --git a/src/mesa/pipe/cell/ppu/cell_texture.c b/src/mesa/pipe/cell/ppu/cell_texture.c
index 2cf6022939..df178d9ca2 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.c
+++ b/src/mesa/pipe/cell/ppu/cell_texture.c
@@ -79,31 +79,30 @@ cell_texture_layout(struct cell_texture * spt)
 }
 
 
-void
-cell_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
+struct pipe_texture *
+cell_texture_create(struct pipe_context *pipe, const struct pipe_texture *templat)
 {
-   struct cell_texture *spt = REALLOC(*pt, sizeof(struct pipe_texture),
-					  sizeof(struct cell_texture));
+   struct cell_texture *spt = CALLOC_STRUCT(cell_texture);
+   if (!spt)
+      return NULL;
 
-   if (spt) {
-      memset(&spt->base + 1, 0,
-	     sizeof(struct cell_texture) - sizeof(struct pipe_texture));
+   spt->base = *templat;
 
-      cell_texture_layout(spt);
+   cell_texture_layout(spt);
 
-      spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32,
-                                                PIPE_BUFFER_USAGE_PIXEL,
-                                                spt->buffer_size);
+   spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32,
+                                             PIPE_BUFFER_USAGE_PIXEL,
+                                             spt->buffer_size);
 
-      if (!spt->buffer) {
-	 FREE(spt);
-	 spt = NULL;
-      }
+   if (!spt->buffer) {
+      FREE(spt);
+      return NULL;
    }
 
-   *pt = &spt->base;
+   return &spt->base;
 }
 
+
 void
 cell_texture_release(struct pipe_context *pipe, struct pipe_texture **pt)
 {
diff --git a/src/mesa/pipe/cell/ppu/cell_texture.h b/src/mesa/pipe/cell/ppu/cell_texture.h
index bd434c8776..0264fed88e 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.h
+++ b/src/mesa/pipe/cell/ppu/cell_texture.h
@@ -60,8 +60,9 @@ cell_texture(struct pipe_texture *pt)
 
 
-extern void
-cell_texture_create(struct pipe_context *pipe, struct pipe_texture **pt);
+extern struct pipe_texture *
+cell_texture_create(struct pipe_context *pipe,
+                    const struct pipe_texture *templat);
 
 extern void
 cell_texture_release(struct pipe_context *pipe, struct pipe_texture **pt);
diff --git a/src/mesa/pipe/i915simple/i915_texture.c b/src/mesa/pipe/i915simple/i915_texture.c
index 61944fe7d9..6faeab134a 100644
--- a/src/mesa/pipe/i915simple/i915_texture.c
+++ b/src/mesa/pipe/i915simple/i915_texture.c
@@ -477,17 +477,17 @@ i945_miptree_layout(struct pipe_context *pipe, struct i915_texture * tex)
    return TRUE;
 }
 
-void
-i915_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
+
+struct pipe_texture *
+i915_texture_create(struct pipe_context *pipe,
+                    const struct pipe_texture *templat)
 {
-   struct i915_texture *tex = REALLOC(*pt, sizeof(struct pipe_texture),
-				      sizeof(struct i915_texture));
+   struct i915_texture *tex = CALLOC_STRUCT(i915_texture);
 
    if (tex) {
       struct i915_context *i915 = i915_context(pipe);
 
-      memset(&tex->base + 1, 0,
-	     sizeof(struct i915_texture) - sizeof(struct pipe_texture));
+      tex->base = *templat;
 
       if (i915->flags.is_i945 ? i945_miptree_layout(pipe, tex) :
 	  i915_miptree_layout(pipe, tex))
@@ -498,13 +498,14 @@ i915_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
 
       if (!tex->buffer) {
 	 FREE(tex);
-	 tex = NULL;
+	 return NULL;
       }
    }
 
-   *pt = &tex->base;
+   return &tex->base;
 }
 
+
 void
 i915_texture_release(struct pipe_context *pipe, struct pipe_texture **pt)
 {
diff --git a/src/mesa/pipe/i915simple/i915_texture.h b/src/mesa/pipe/i915simple/i915_texture.h
index 84a0502e81..330d111dc7 100644
--- a/src/mesa/pipe/i915simple/i915_texture.h
+++ b/src/mesa/pipe/i915simple/i915_texture.h
@@ -6,8 +6,9 @@ struct pipe_context;
 struct pipe_texture;
 
 
-extern void
-i915_texture_create(struct pipe_context *pipe, struct pipe_texture **pt);
+struct pipe_texture *
+i915_texture_create(struct pipe_context *pipe,
+                    const struct pipe_texture *templat);
 
 extern void
 i915_texture_release(struct pipe_context *pipe, struct pipe_texture **pt);
diff --git a/src/mesa/pipe/i965simple/brw_tex_layout.c b/src/mesa/pipe/i965simple/brw_tex_layout.c
index b8b6b579e2..405fd1f794 100644
--- a/src/mesa/pipe/i965simple/brw_tex_layout.c
+++ b/src/mesa/pipe/i965simple/brw_tex_layout.c
@@ -299,15 +299,14 @@ static boolean brw_miptree_layout(struct pipe_context *pipe, struct brw_texture
    return TRUE;
 }
 
-void
-brw_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
+
+struct pipe_texture *
+brw_texture_create(struct pipe_context *pipe, const struct pipe_texture *templat)
 {
-   struct brw_texture *tex = REALLOC(*pt, sizeof(struct pipe_texture),
-                                     sizeof(struct brw_texture));
+   struct brw_texture *tex = CALLOC_STRUCT(brw_texture);
 
    if (tex) {
-      memset(&tex->base + 1, 0,
-	     sizeof(struct brw_texture) - sizeof(struct pipe_texture));
+      tex->base = *templat;
 
       if (brw_miptree_layout(pipe, tex))
 	 tex->buffer = pipe->winsys->buffer_create(pipe->winsys, 64,
@@ -317,11 +316,11 @@ brw_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
 
       if (!tex->buffer) {
 	 FREE(tex);
-	 tex = NULL;
+         return NULL;
       }
    }
 
-   *pt = &tex->base;
+   return &tex->base;
 }
 
 void
diff --git a/src/mesa/pipe/i965simple/brw_tex_layout.h b/src/mesa/pipe/i965simple/brw_tex_layout.h
index 15e275058a..cfd6b1ef3a 100644
--- a/src/mesa/pipe/i965simple/brw_tex_layout.h
+++ b/src/mesa/pipe/i965simple/brw_tex_layout.h
@@ -6,8 +6,8 @@
 struct pipe_context;
 struct pipe_texture;
 
-extern void
-brw_texture_create(struct pipe_context *pipe, struct pipe_texture **pt);
+extern struct pipe_texture *
+brw_texture_create(struct pipe_context *pipe, const struct pipe_texture *templat);
 
 extern void
 brw_texture_release(struct pipe_context *pipe, struct pipe_texture **pt);
diff --git a/src/mesa/pipe/p_context.h b/src/mesa/pipe/p_context.h
index 0dda06c53b..92a1cd70c4 100644
--- a/src/mesa/pipe/p_context.h
+++ b/src/mesa/pipe/p_context.h
@@ -199,8 +199,8 @@ struct pipe_context {
    /*
     * Texture functions
     */
-   void (*texture_create)(struct pipe_context *pipe,
-			  struct pipe_texture **pt);
+   struct pipe_texture * (*texture_create)(struct pipe_context *pipe,
+                                           const struct pipe_texture *templat);
 
    void (*texture_release)(struct pipe_context *pipe,
 			   struct pipe_texture **pt);
diff --git a/src/mesa/pipe/softpipe/sp_texture.c b/src/mesa/pipe/softpipe/sp_texture.c
index 172234843d..fd2cc3dbbb 100644
--- a/src/mesa/pipe/softpipe/sp_texture.c
+++ b/src/mesa/pipe/softpipe/sp_texture.c
@@ -79,31 +79,30 @@ softpipe_texture_layout(struct softpipe_texture * spt)
 }
 
 
-void
-softpipe_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
+struct pipe_texture *
+softpipe_texture_create(struct pipe_context *pipe,
+                        const struct pipe_texture *templat)
 {
-   struct softpipe_texture *spt = REALLOC(*pt, sizeof(struct pipe_texture),
-					  sizeof(struct softpipe_texture));
-
-   if (spt) {
-      memset(&spt->base + 1, 0,
-	     sizeof(struct softpipe_texture) - sizeof(struct pipe_texture));
+   struct softpipe_texture *spt = CALLOC_STRUCT(softpipe_texture);
+   if (!spt)
+      return NULL;
 
-      softpipe_texture_layout(spt);
+   spt->base = *templat;
 
-      spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32,
-                                                PIPE_BUFFER_USAGE_PIXEL,
-                                                spt->buffer_size);
+   softpipe_texture_layout(spt);
 
-      if (!spt->buffer) {
-	 FREE(spt);
-	 spt = NULL;
-      }
+   spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32,
+                                             PIPE_BUFFER_USAGE_PIXEL,
+                                             spt->buffer_size);
+   if (!spt->buffer) {
+      FREE(spt);
+      return NULL;
    }
 
-   *pt = &spt->base;
+   return &spt->base;
 }
 
+
 void
 softpipe_texture_release(struct pipe_context *pipe, struct pipe_texture **pt)
 {
diff --git a/src/mesa/pipe/softpipe/sp_texture.h b/src/mesa/pipe/softpipe/sp_texture.h
index c6cf370351..fa646c0de9 100644
--- a/src/mesa/pipe/softpipe/sp_texture.h
+++ b/src/mesa/pipe/softpipe/sp_texture.h
@@ -55,8 +55,9 @@ softpipe_texture(struct pipe_texture *pt)
 
 
-extern void
-softpipe_texture_create(struct pipe_context *pipe, struct pipe_texture **pt);
+extern struct pipe_texture *
+softpipe_texture_create(struct pipe_context *pipe,
+                        const struct pipe_texture *templat);
 
 extern void
 softpipe_texture_release(struct pipe_context *pipe, struct pipe_texture **pt);
diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
index 741f36c2a7..844a9f80d8 100644
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -74,7 +74,7 @@ st_texture_create(struct st_context *st,
 		  GLuint depth0,
 		  GLuint compress_byte)
 {
-   struct pipe_texture *pt = CALLOC_STRUCT(pipe_texture);
+   struct pipe_texture pt;
 
    assert(target <= PIPE_TEXTURE_CUBE);
 
@@ -82,25 +82,20 @@ st_texture_create(struct st_context *st,
        _mesa_lookup_enum_by_nr(target),
        _mesa_lookup_enum_by_nr(format), first_level, last_level);
 
-   if (!pt)
-      return NULL;
-
    assert(format);
 
-   pt->target = target;
-   pt->format = format;
-   pt->first_level = first_level;
-   pt->last_level = last_level;
-   pt->width[0] = width0;
-   pt->height[0] = height0;
-   pt->depth[0] = depth0;
-   pt->compressed = compress_byte ? 1 : 0;
-   pt->cpp = pt->compressed ? compress_byte : st_sizeof_format(format);
-   pt->refcount = 1; 
-
-   st->pipe->texture_create(st->pipe, &pt);
-
-   return pt;
+   pt.target = target;
+   pt.format = format;
+   pt.first_level = first_level;
+   pt.last_level = last_level;
+   pt.width[0] = width0;
+   pt.height[0] = height0;
+   pt.depth[0] = depth0;
+   pt.compressed = compress_byte ? 1 : 0;
+   pt.cpp = pt.compressed ? compress_byte : st_sizeof_format(format);
+   pt.refcount = 1; 
+
+   return st->pipe->texture_create(st->pipe, &pt);
 }
 
 
-- 
cgit v1.2.3


From 67155e8edff96f0a6e85580b0753041e67d3f99d Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 09:28:20 -0700
Subject: gallium: added mem_dup()

---
 src/mesa/pipe/p_util.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/mesa/pipe/p_util.h b/src/mesa/pipe/p_util.h
index 4780ed7818..991ac447ba 100644
--- a/src/mesa/pipe/p_util.h
+++ b/src/mesa/pipe/p_util.h
@@ -183,6 +183,20 @@ align_free(void *ptr)
 
 
+/**
+ * Return a malloc'd copy of the 'size' bytes at 'src', or NULL if allocation fails.
+ */
+static INLINE void *
+mem_dup(const void *src, uint size)
+{
+   void *dup = malloc(size);
+   if (dup)
+      memcpy(dup, src, size);
+   return dup;
+}
+
+
+
 #define CLAMP( X, MIN, MAX )  ( (X)<(MIN) ? (MIN) : ((X)>(MAX) ? (MAX) : (X)) )
 #define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
 #define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )
-- 
cgit v1.2.3


From 2a3b31c1c4e73529c93476161973dae13b31aee5 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 09:29:00 -0700
Subject: gallium: use mem_dup()

---
 src/mesa/pipe/softpipe/sp_state_blend.c      | 10 +++-------
 src/mesa/pipe/softpipe/sp_state_rasterizer.c |  7 ++-----
 src/mesa/pipe/softpipe/sp_state_sampler.c    |  4 +---
 3 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/src/mesa/pipe/softpipe/sp_state_blend.c b/src/mesa/pipe/softpipe/sp_state_blend.c
index 160ca5cbc0..2d40d6bd8f 100644
--- a/src/mesa/pipe/softpipe/sp_state_blend.c
+++ b/src/mesa/pipe/softpipe/sp_state_blend.c
@@ -32,13 +32,12 @@
 #include "sp_context.h"
 #include "sp_state.h"
 
+
 void *
 softpipe_create_blend_state(struct pipe_context *pipe,
                             const struct pipe_blend_state *blend)
 {
-   struct pipe_blend_state *state = MALLOC( sizeof(struct pipe_blend_state) );
-   memcpy(state, blend, sizeof(struct pipe_blend_state));
-   return state;
+   return mem_dup(blend, sizeof(*blend));
 }
 
 void softpipe_bind_blend_state( struct pipe_context *pipe,
@@ -78,10 +77,7 @@ void *
 softpipe_create_depth_stencil_state(struct pipe_context *pipe,
 				    const struct pipe_depth_stencil_alpha_state *depth_stencil)
 {
-   struct pipe_depth_stencil_alpha_state *state =
-      MALLOC( sizeof(struct pipe_depth_stencil_alpha_state) );
-   memcpy(state, depth_stencil, sizeof(struct pipe_depth_stencil_alpha_state));
-   return state;
+   return mem_dup(depth_stencil, sizeof(*depth_stencil));
 }
 
 void
diff --git a/src/mesa/pipe/softpipe/sp_state_rasterizer.c b/src/mesa/pipe/softpipe/sp_state_rasterizer.c
index ce8fa4f2b8..53755099dd 100644
--- a/src/mesa/pipe/softpipe/sp_state_rasterizer.c
+++ b/src/mesa/pipe/softpipe/sp_state_rasterizer.c
@@ -35,12 +35,9 @@
 
 void *
 softpipe_create_rasterizer_state(struct pipe_context *pipe,
-                                 const struct pipe_rasterizer_state *setup)
+                                 const struct pipe_rasterizer_state *rast)
 {
-   struct pipe_rasterizer_state *state =
-      MALLOC( sizeof(struct pipe_rasterizer_state) );
-   memcpy(state, setup, sizeof(struct pipe_rasterizer_state));
-   return state;
+   return mem_dup(rast, sizeof(*rast));
 }
 
 void softpipe_bind_rasterizer_state(struct pipe_context *pipe,
diff --git a/src/mesa/pipe/softpipe/sp_state_sampler.c b/src/mesa/pipe/softpipe/sp_state_sampler.c
index 3842e71503..51b4b78287 100644
--- a/src/mesa/pipe/softpipe/sp_state_sampler.c
+++ b/src/mesa/pipe/softpipe/sp_state_sampler.c
@@ -40,9 +40,7 @@ void *
 softpipe_create_sampler_state(struct pipe_context *pipe,
                               const struct pipe_sampler_state *sampler)
 {
-   struct pipe_sampler_state *state = MALLOC( sizeof(struct pipe_sampler_state) );
-   memcpy(state, sampler, sizeof(struct pipe_sampler_state));
-   return state;
+   return mem_dup(sampler, sizeof(*sampler));
 }
 
 void
-- 
cgit v1.2.3


From 8ce9d29df9a8b0a43c60d946c85e2c871e12e911 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 09:35:33 -0700
Subject: Cell: use mem_dup()

---
 src/mesa/pipe/cell/ppu/cell_state_blend.c   | 9 ++-------
 src/mesa/pipe/cell/ppu/cell_state_sampler.c | 4 +---
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/src/mesa/pipe/cell/ppu/cell_state_blend.c b/src/mesa/pipe/cell/ppu/cell_state_blend.c
index 2c19aa3971..4fc60548c8 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_blend.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_blend.c
@@ -39,9 +39,7 @@ void *
 cell_create_blend_state(struct pipe_context *pipe,
                         const struct pipe_blend_state *blend)
 {
-   struct pipe_blend_state *state = MALLOC(sizeof(struct pipe_blend_state));
-   memcpy(state, blend, sizeof(struct pipe_blend_state));
-   return state;
+   return mem_dup(blend, sizeof(*blend));
 }
 
 
@@ -85,10 +83,7 @@ void *
 cell_create_depth_stencil_alpha_state(struct pipe_context *pipe,
                  const struct pipe_depth_stencil_alpha_state *depth_stencil)
 {
-   struct pipe_depth_stencil_alpha_state *state =
-      MALLOC(sizeof(struct pipe_depth_stencil_alpha_state));
-   memcpy(state, depth_stencil, sizeof(struct pipe_depth_stencil_alpha_state));
-   return state;
+   return mem_dup(depth_stencil, sizeof(*depth_stencil));
 }
 
 
diff --git a/src/mesa/pipe/cell/ppu/cell_state_sampler.c b/src/mesa/pipe/cell/ppu/cell_state_sampler.c
index 317f7603bb..ade6cc8338 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_sampler.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_sampler.c
@@ -40,9 +40,7 @@ void *
 cell_create_sampler_state(struct pipe_context *pipe,
                           const struct pipe_sampler_state *sampler)
 {
-   struct pipe_sampler_state *state = MALLOC( sizeof(struct pipe_sampler_state) );
-   memcpy(state, sampler, sizeof(struct pipe_sampler_state));
-   return state;
+   return mem_dup(sampler, sizeof(*sampler));
 }
 
 void
-- 
cgit v1.2.3


From 0af89a60b436efe74d9ac381516776438861fe52 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 09:48:37 -0700
Subject: gallium: #include p_debug.h since we use assert

---
 src/mesa/pipe/p_util.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/pipe/p_util.h b/src/mesa/pipe/p_util.h
index 991ac447ba..469920efee 100644
--- a/src/mesa/pipe/p_util.h
+++ b/src/mesa/pipe/p_util.h
@@ -29,6 +29,7 @@
 #define P_UTIL_H
 
 #include "p_compiler.h"
+#include "p_debug.h"
 #include <math.h>
 
 
-- 
cgit v1.2.3


From 1d05c41c5b98df9f2f24645b141365f211dbddb6 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 09:48:56 -0700
Subject: Cell: silence unused var warnings

---
 src/mesa/pipe/cell/ppu/cell_state_fs.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/mesa/pipe/cell/ppu/cell_state_fs.c b/src/mesa/pipe/cell/ppu/cell_state_fs.c
index 81c2ac14dd..96a52273b0 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_fs.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_fs.c
@@ -45,7 +45,7 @@ void *
 cell_create_fs_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
 {
-   struct cell_context *cell = cell_context(pipe);
+   /*struct cell_context *cell = cell_context(pipe);*/
    struct cell_fragment_shader_state *state;
 
    state = CALLOC_STRUCT(cell_fragment_shader_state);
@@ -94,8 +94,6 @@ cell_bind_fs_state(struct pipe_context *pipe, void *fs)
 void
 cell_delete_fs_state(struct pipe_context *pipe, void *fs)
 {
-   struct cell_context *cell = cell_context(pipe);
-
    struct cell_fragment_shader_state *state =
       (struct cell_fragment_shader_state *) fs;
 
-- 
cgit v1.2.3


From 87c8a1ba36b5deb9a0009a45a0b7a3a345c16126 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Thu, 7 Feb 2008 19:59:17 +0900
Subject: gallium: Conditionally use posix libraries/includes.

---
 .gitignore |  1 +
 SConstruct | 43 ++++++++++++++++++++++---------------------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/.gitignore b/.gitignore
index bf50291fc1..b5e59dfc3e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ lib
 lib64
 .sconsign*
 config.py
+build
diff --git a/SConstruct b/SConstruct
index 47f9b5389b..22a4072c93 100644
--- a/SConstruct
+++ b/SConstruct
@@ -95,19 +95,8 @@ if gcc:
 	env.Append(CFLAGS = '-fmessage-length=0')
 	env.Append(CXXFLAGS = '-fmessage-length=0')
 
-# Defines
-env.Append(CPPDEFINES = [
-	'_POSIX_SOURCE',
-	('_POSIX_C_SOURCE', '199309L'), 
-	'_SVID_SOURCE',
-	'_BSD_SOURCE', 
-	'_GNU_SOURCE',
-	
-	'PTHREADS',
-	'HAVE_ALIAS', 
-	'HAVE_POSIX_MEMALIGN',
-])
 
+# Defines
 if debug:
 	env.Append(CPPDEFINES = ['DEBUG'])
 else:
@@ -120,8 +109,6 @@ env.Append(CPPPATH = [
 	'#/src/mesa',
 	'#/src/mesa/main',
 	'#/src/mesa/pipe',
-	
-	'/usr/X11R6/include',
 ])
 
 
@@ -137,14 +124,28 @@ if x86:
 		env.Append(CFLAGS = '-m32')
 		env.Append(CXXFLAGS = '-m32')
 
-env.Append(LIBPATH = ['/usr/X11R6/lib'])
 
-env.Append(LIBS = [
-	'm',
-	'pthread',
-	'expat',
-	'dl',
-])
+# Posix
+if platform in ('posix', 'linux', 'freebsd', 'darwin'):
+	env.Append(CPPDEFINES = [
+		'_POSIX_SOURCE',
+		('_POSIX_C_SOURCE', '199309L'), 
+		'_SVID_SOURCE',
+		'_BSD_SOURCE', 
+		'_GNU_SOURCE',
+		
+		'PTHREADS',
+		'HAVE_POSIX_MEMALIGN',
+	])
+	env.Append(CPPPATH = ['/usr/X11R6/include'])
+	env.Append(LIBPATH = ['/usr/X11R6/lib'])
+	env.Append(LIBS = [
+		'm',
+		'pthread',
+		'expat',
+		'dl',
+	])
+
 
 # DRI
 if dri:
-- 
cgit v1.2.3


From f41a4ee11ab3cbed8a4fd34866ed5771b8169ab7 Mon Sep 17 00:00:00 2001
From: Michel Dänzer <michel@tungstengraphics.com>
Date: Thu, 7 Feb 2008 19:43:34 +0000
Subject: pipebuffer: Fix reversed assertion.

---
 src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
index bcd4b3e257..04477a865a 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
@@ -170,7 +170,7 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr,
    struct list_head *item;
 
    assert(size == pool->bufSize);
-   assert(desc->alignment % pool->bufAlign == 0);
+   assert(pool->bufAlign % desc->alignment == 0);
    
    _glthread_LOCK_MUTEX(pool->mutex);
 
-- 
cgit v1.2.3


From 99c3c2d038074209686559c42d7314fcaaf3953a Mon Sep 17 00:00:00 2001
From: Michel Dänzer <michel@tungstengraphics.com>
Date: Thu, 7 Feb 2008 19:44:42 +0000
Subject: tgsi: Fall back to interpreter instead of assert(0) on unimplemented
 SSE code.

---
 src/mesa/pipe/draw/draw_vertex_shader.c | 22 ++++++++++++++++------
 src/mesa/pipe/tgsi/exec/tgsi_sse2.c     |  3 +--
 src/mesa/x86/rtasm/x86sse.c             |  1 +
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index e6590eafcc..5ca93aa615 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -121,11 +121,16 @@ run_vertex_program(struct draw_context *draw,
          = (struct draw_vertex_shader *)draw->vertex_shader;
       codegen_function func
          = (codegen_function) x86_get_func( &shader->sse2_program );
-      func(
-         machine->Inputs,
-         machine->Outputs,
-         machine->Consts,
-         machine->Temps );
+
+      if (func)
+         func(
+            machine->Inputs,
+            machine->Outputs,
+            machine->Consts,
+            machine->Temps );
+      else
+         /* interpreter */
+         tgsi_exec_machine_run( machine );
    }
    else
 #endif
@@ -269,7 +274,12 @@ draw_create_vertex_shader(struct draw_context *draw,
       struct pipe_shader_state *sh = (struct pipe_shader_state *) shader;
 
       x86_init_func( &vs->sse2_program );
-      tgsi_emit_sse2( (struct tgsi_token *) sh->tokens, &vs->sse2_program );
+      if (!tgsi_emit_sse2( (struct tgsi_token *) sh->tokens,
+                           &vs->sse2_program )) {
+         x86_release_func( (struct x86_function *) &vs->sse2_program );
+	 fprintf(stdout /*err*/,
+		 "tgsi_emit_sse2() failed, falling back to interpreter\n");
+      }
    }
 #endif
 
diff --git a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
index f2180082f1..40bacf8552 100755
--- a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
+++ b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
@@ -2254,8 +2254,7 @@ tgsi_emit_sse2(
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
          /* XXX implement this */
-         assert(0);
-         break;
+         return 0;
 
       default:
          assert( 0 );
diff --git a/src/mesa/x86/rtasm/x86sse.c b/src/mesa/x86/rtasm/x86sse.c
index 56c211eee0..f8da6e405f 100644
--- a/src/mesa/x86/rtasm/x86sse.c
+++ b/src/mesa/x86/rtasm/x86sse.c
@@ -1137,6 +1137,7 @@ void x86_init_func_size( struct x86_function *p, unsigned code_size )
 void x86_release_func( struct x86_function *p )
 {
    _mesa_exec_free(p->store);
+   p->store = NULL;
 }
 
 
-- 
cgit v1.2.3


From 1e2f5fb05a3f6720a1e7aa02e7ce12304991c6c7 Mon Sep 17 00:00:00 2001
From: Jerome Glisse <glisse@freedesktop.org>
Date: Fri, 8 Feb 2008 18:25:49 +0100
Subject: failover: several fixes to failover pipe module

---
 src/mesa/pipe/failover/fo_context.c |  2 +
 src/mesa/pipe/failover/fo_state.c   | 79 ++++++++++++++++++++++++++-----------
 2 files changed, 57 insertions(+), 24 deletions(-)

diff --git a/src/mesa/pipe/failover/fo_context.c b/src/mesa/pipe/failover/fo_context.c
index cf6c9fed50..7ce4a7df17 100644
--- a/src/mesa/pipe/failover/fo_context.c
+++ b/src/mesa/pipe/failover/fo_context.c
@@ -114,6 +114,8 @@ struct pipe_context *failover_create( struct pipe_context *hw,
    if (failover == NULL)
       return NULL;
 
+   failover->hw = hw;
+   failover->sw = sw;
    failover->pipe.winsys = hw->winsys;
    failover->pipe.destroy = failover_destroy;
    failover->pipe.is_format_supported = hw->is_format_supported;
diff --git a/src/mesa/pipe/failover/fo_state.c b/src/mesa/pipe/failover/fo_state.c
index fa700b9674..0fc5568da1 100644
--- a/src/mesa/pipe/failover/fo_state.c
+++ b/src/mesa/pipe/failover/fo_state.c
@@ -54,8 +54,8 @@ failover_create_blend_state( struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_blend_state(pipe, blend);
-   state->hw_state = failover->hw->create_blend_state(pipe, blend);
+   state->sw_state = failover->sw->create_blend_state(failover->sw, blend);
+   state->hw_state = failover->hw->create_blend_state(failover->hw, blend);
 
    return state;
 }
@@ -68,6 +68,7 @@ failover_bind_blend_state( struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state *)blend;
    failover->blend = state;
    failover->dirty |= FO_NEW_BLEND;
+   failover->sw->bind_blend_state( failover->sw, state->sw_state );
    failover->hw->bind_blend_state( failover->hw, state->hw_state );
 }
 
@@ -78,8 +79,8 @@ failover_delete_blend_state( struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)blend;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_blend_state(pipe, state->sw_state);
-   failover->hw->delete_blend_state(pipe, state->hw_state);
+   failover->sw->delete_blend_state(failover->sw, state->sw_state);
+   failover->hw->delete_blend_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -93,6 +94,7 @@ failover_set_blend_color( struct pipe_context *pipe,
 
    failover->blend_color = *blend_color;
    failover->dirty |= FO_NEW_BLEND_COLOR;
+   failover->sw->set_blend_color( failover->sw, blend_color );
    failover->hw->set_blend_color( failover->hw, blend_color );
 }
 
@@ -104,6 +106,7 @@ failover_set_clip_state( struct pipe_context *pipe,
 
    failover->clip = *clip;
    failover->dirty |= FO_NEW_CLIP;
+   failover->sw->set_clip_state( failover->sw, clip );
    failover->hw->set_clip_state( failover->hw, clip );
 }
 
@@ -115,8 +118,8 @@ failover_create_depth_stencil_state(struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_depth_stencil_alpha_state(pipe, templ);
-   state->hw_state = failover->hw->create_depth_stencil_alpha_state(pipe, templ);
+   state->sw_state = failover->sw->create_depth_stencil_alpha_state(failover->sw, templ);
+   state->hw_state = failover->hw->create_depth_stencil_alpha_state(failover->hw, templ);
 
    return state;
 }
@@ -129,6 +132,7 @@ failover_bind_depth_stencil_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state *)depth_stencil;
    failover->depth_stencil = state;
    failover->dirty |= FO_NEW_DEPTH_STENCIL;
+   failover->sw->bind_depth_stencil_alpha_state(failover->sw, state->sw_state);
    failover->hw->bind_depth_stencil_alpha_state(failover->hw, state->hw_state);
 }
 
@@ -139,8 +143,8 @@ failover_delete_depth_stencil_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)ds;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_depth_stencil_alpha_state(pipe, state->sw_state);
-   failover->hw->delete_depth_stencil_alpha_state(pipe, state->hw_state);
+   failover->sw->delete_depth_stencil_alpha_state(failover->sw, state->sw_state);
+   failover->hw->delete_depth_stencil_alpha_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -154,6 +158,7 @@ failover_set_framebuffer_state(struct pipe_context *pipe,
 
    failover->framebuffer = *framebuffer;
    failover->dirty |= FO_NEW_FRAMEBUFFER;
+   failover->sw->set_framebuffer_state( failover->sw, framebuffer );
    failover->hw->set_framebuffer_state( failover->hw, framebuffer );
 }
 
@@ -165,8 +170,8 @@ failover_create_fs_state(struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_fs_state(pipe, templ);
-   state->hw_state = failover->hw->create_fs_state(pipe, templ);
+   state->sw_state = failover->sw->create_fs_state(failover->sw, templ);
+   state->hw_state = failover->hw->create_fs_state(failover->hw, templ);
 
    return state;
 }
@@ -178,6 +183,7 @@ failover_bind_fs_state(struct pipe_context *pipe, void *fs)
    struct fo_state *state = (struct fo_state*)fs;
    failover->fragment_shader = state;
    failover->dirty |= FO_NEW_FRAGMENT_SHADER;
+   failover->sw->bind_fs_state(failover->sw, state->sw_state);
    failover->hw->bind_fs_state(failover->hw, state->hw_state);
 }
 
@@ -188,8 +194,8 @@ failover_delete_fs_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)fs;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_fs_state(pipe, state->sw_state);
-   failover->hw->delete_fs_state(pipe, state->hw_state);
+   failover->sw->delete_fs_state(failover->sw, state->sw_state);
+   failover->hw->delete_fs_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -202,8 +208,8 @@ failover_create_vs_state(struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_vs_state(pipe, templ);
-   state->hw_state = failover->hw->create_vs_state(pipe, templ);
+   state->sw_state = failover->sw->create_vs_state(failover->sw, templ);
+   state->hw_state = failover->hw->create_vs_state(failover->hw, templ);
 
    return state;
 }
@@ -217,6 +223,7 @@ failover_bind_vs_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)vs;
    failover->vertex_shader = state;
    failover->dirty |= FO_NEW_VERTEX_SHADER;
+   failover->sw->bind_vs_state(failover->sw, state->sw_state);
    failover->hw->bind_vs_state(failover->hw, state->hw_state);
 }
 
@@ -227,8 +234,8 @@ failover_delete_vs_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)vs;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_vs_state(pipe, state->sw_state);
-   failover->hw->delete_vs_state(pipe, state->hw_state);
+   failover->sw->delete_vs_state(failover->sw, state->sw_state);
+   failover->hw->delete_vs_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -242,6 +249,7 @@ failover_set_polygon_stipple( struct pipe_context *pipe,
 
    failover->poly_stipple = *stipple;
    failover->dirty |= FO_NEW_STIPPLE;
+   failover->sw->set_polygon_stipple( failover->sw, stipple );
    failover->hw->set_polygon_stipple( failover->hw, stipple );
 }
 
@@ -253,8 +261,8 @@ failover_create_rasterizer_state(struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_rasterizer_state(pipe, templ);
-   state->hw_state = failover->hw->create_rasterizer_state(pipe, templ);
+   state->sw_state = failover->sw->create_rasterizer_state(failover->sw, templ);
+   state->hw_state = failover->hw->create_rasterizer_state(failover->hw, templ);
 
    return state;
 }
@@ -268,6 +276,7 @@ failover_bind_rasterizer_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)raster;
    failover->rasterizer = state;
    failover->dirty |= FO_NEW_RASTERIZER;
+   failover->sw->bind_rasterizer_state(failover->sw, state->sw_state);
    failover->hw->bind_rasterizer_state(failover->hw, state->hw_state);
 }
 
@@ -278,8 +287,8 @@ failover_delete_rasterizer_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)raster;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_rasterizer_state(pipe, state->sw_state);
-   failover->hw->delete_rasterizer_state(pipe, state->hw_state);
+   failover->sw->delete_rasterizer_state(failover->sw, state->sw_state);
+   failover->hw->delete_rasterizer_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -294,6 +303,7 @@ failover_set_scissor_state( struct pipe_context *pipe,
 
    failover->scissor = *scissor;
    failover->dirty |= FO_NEW_SCISSOR;
+   failover->sw->set_scissor_state( failover->sw, scissor );
    failover->hw->set_scissor_state( failover->hw, scissor );
 }
 
@@ -305,8 +315,8 @@ failover_create_sampler_state(struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_sampler_state(pipe, templ);
-   state->hw_state = failover->hw->create_sampler_state(pipe, templ);
+   state->sw_state = failover->sw->create_sampler_state(failover->sw, templ);
+   state->hw_state = failover->hw->create_sampler_state(failover->hw, templ);
 
    return state;
 }
@@ -320,6 +330,8 @@ failover_bind_sampler_state(struct pipe_context *pipe,
    failover->sampler[unit] = state;
    failover->dirty |= FO_NEW_SAMPLER;
    failover->dirty_sampler |= (1<<unit);
+   failover->sw->bind_sampler_state(failover->sw, unit,
+                                    state->sw_state);
    failover->hw->bind_sampler_state(failover->hw, unit,
                                     state->hw_state);
 }
@@ -330,8 +342,8 @@ failover_delete_sampler_state(struct pipe_context *pipe, void *sampler)
    struct fo_state *state = (struct fo_state*)sampler;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_sampler_state(pipe, state->sw_state);
-   failover->hw->delete_sampler_state(pipe, state->hw_state);
+   failover->sw->delete_sampler_state(failover->sw, state->sw_state);
+   failover->hw->delete_sampler_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -348,6 +360,7 @@ failover_set_sampler_texture(struct pipe_context *pipe,
    failover->texture[unit] = texture;
    failover->dirty |= FO_NEW_TEXTURE;
    failover->dirty_texture |= (1<<unit);
+   failover->sw->set_sampler_texture( failover->sw, unit, texture );
    failover->hw->set_sampler_texture( failover->hw, unit, texture );
 }
 
@@ -360,6 +373,7 @@ failover_set_viewport_state( struct pipe_context *pipe,
 
    failover->viewport = *viewport; 
    failover->dirty |= FO_NEW_VIEWPORT;
+   failover->sw->set_viewport_state( failover->sw, viewport );
    failover->hw->set_viewport_state( failover->hw, viewport );
 }
 
@@ -374,6 +388,7 @@ failover_set_vertex_buffer(struct pipe_context *pipe,
    failover->vertex_buffer[unit] = *vertex_buffer;
    failover->dirty |= FO_NEW_VERTEX_BUFFER;
    failover->dirty_vertex_buffer |= (1<<unit);
+   failover->sw->set_vertex_buffer( failover->sw, unit, vertex_buffer );
    failover->hw->set_vertex_buffer( failover->hw, unit, vertex_buffer );
 }
 
@@ -388,9 +403,24 @@ failover_set_vertex_element(struct pipe_context *pipe,
    failover->vertex_element[unit] = *vertex_element;
    failover->dirty |= FO_NEW_VERTEX_ELEMENT;
    failover->dirty_vertex_element |= (1<<unit);
+   failover->sw->set_vertex_element( failover->sw, unit, vertex_element );
    failover->hw->set_vertex_element( failover->hw, unit, vertex_element );
 }
 
+void
+failover_set_constant_buffer(struct pipe_context *pipe,
+                             uint shader, uint index,
+                             const struct pipe_constant_buffer *buf)
+{
+   struct failover_context *failover = failover_context(pipe);
+
+   assert(shader < PIPE_SHADER_TYPES);
+   assert(index == 0);
+
+   failover->sw->set_constant_buffer(failover->sw, shader, index, buf);
+   failover->hw->set_constant_buffer(failover->hw, shader, index, buf);
+}
+
 
 void
 failover_init_state_functions( struct failover_context *failover )
@@ -423,4 +453,5 @@ failover_init_state_functions( struct failover_context *failover )
    failover->pipe.set_viewport_state = failover_set_viewport_state;
    failover->pipe.set_vertex_buffer = failover_set_vertex_buffer;
    failover->pipe.set_vertex_element = failover_set_vertex_element;
+   failover->pipe.set_constant_buffer = failover_set_constant_buffer;
 }
-- 
cgit v1.2.3


From e770d6adeb710fcd16ea6af9764121b8933315c0 Mon Sep 17 00:00:00 2001
From: Jerome Glisse <glisse@freedesktop.org>
Date: Fri, 8 Feb 2008 18:47:25 +0100
Subject: intel_winsys: remove leftover code

---
 src/mesa/drivers/dri/intel_winsys/intel_winsys_pipe.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/mesa/drivers/dri/intel_winsys/intel_winsys_pipe.c b/src/mesa/drivers/dri/intel_winsys/intel_winsys_pipe.c
index 910c0d2cc5..789a386500 100644
--- a/src/mesa/drivers/dri/intel_winsys/intel_winsys_pipe.c
+++ b/src/mesa/drivers/dri/intel_winsys/intel_winsys_pipe.c
@@ -224,11 +224,6 @@ intel_i915_surface_alloc_storage(struct pipe_winsys *winsys,
    if(!surf->buffer)
       return -1;
 
-   if(ret) {
-      pipe_buffer_reference(winsys, &surf->buffer, NULL);
-      return ret;
-   }
-   
    return 0;
 }
 
-- 
cgit v1.2.3


From 76dc41765f97b7550d691069fb53e699d5b07d95 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 14:45:58 -0700
Subject: Remove unused texunit parameter to ctx->Driver.GenerateMipmap()

---
 src/mesa/main/dd.h       |  3 +--
 src/mesa/main/fbobject.c |  2 +-
 src/mesa/main/mipmap.c   |  1 -
 src/mesa/main/mipmap.h   |  1 -
 src/mesa/main/texstore.c | 32 ++++++++------------------------
 5 files changed, 10 insertions(+), 29 deletions(-)

diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index c2ef67ba6d..37ef2a865b 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -335,8 +335,7 @@ struct dd_function_table {
    /**
     * Called by glGenerateMipmap() or when GL_GENERATE_MIPMAP_SGIS is enabled.
     */
-   void (*GenerateMipmap)(GLcontext *ctx,  GLenum target,
-                          const struct gl_texture_unit *texUnit,
+   void (*GenerateMipmap)(GLcontext *ctx, GLenum target,
                           struct gl_texture_object *texObj);
 
    /**
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index 13cbd35424..6a8cba4d8a 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -1560,7 +1560,7 @@ _mesa_GenerateMipmapEXT(GLenum target)
 
    /* XXX this might not handle cube maps correctly */
    _mesa_lock_texture(ctx, texObj);
-   ctx->Driver.GenerateMipmap(ctx, target, texUnit, texObj);
+   ctx->Driver.GenerateMipmap(ctx, target, texObj);
    _mesa_unlock_texture(ctx, texObj);
 }
 
diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 9f3db22b75..1e61829e8f 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -933,7 +933,6 @@ make_2d_stack_mipmap(const struct gl_texture_format *format, GLint border,
  */
 void
 _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
-                      const struct gl_texture_unit *texUnit,
                       struct gl_texture_object *texObj)
 {
    const struct gl_texture_image *srcImage;
diff --git a/src/mesa/main/mipmap.h b/src/mesa/main/mipmap.h
index df78603283..46e16902c8 100644
--- a/src/mesa/main/mipmap.h
+++ b/src/mesa/main/mipmap.h
@@ -30,7 +30,6 @@
 
 extern void
 _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
-                      const struct gl_texture_unit *texUnit,
                       struct gl_texture_object *texObj);
 
 
diff --git a/src/mesa/main/texstore.c b/src/mesa/main/texstore.c
index 26ca4f1bd5..a6a18910fc 100644
--- a/src/mesa/main/texstore.c
+++ b/src/mesa/main/texstore.c
@@ -2917,9 +2917,7 @@ _mesa_store_teximage1d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target,
-                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                                 texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3003,9 +3001,7 @@ _mesa_store_teximage2d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target,
-                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                                 texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3079,9 +3075,7 @@ _mesa_store_teximage3d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target,
-                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                                 texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3127,9 +3121,7 @@ _mesa_store_texsubimage1d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target,
-                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                                 texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3182,9 +3174,7 @@ _mesa_store_texsubimage2d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target,
-                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                                 texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3237,9 +3227,7 @@ _mesa_store_texsubimage3d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target,
-                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                                 texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3313,9 +3301,7 @@ _mesa_store_compressed_teximage2d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target,
-                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                                 texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, &ctx->Unpack);
@@ -3425,9 +3411,7 @@ _mesa_store_compressed_texsubimage2d(GLcontext *ctx, GLenum target,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target,
-                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                                 texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, &ctx->Unpack);
-- 
cgit v1.2.3


From 184054fea12e4301c1ccc4cbe13594fe84f0ed78 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 14:46:47 -0700
Subject: gallium: added draw_flush() call in softpipe_bind_sampler_state()

---
 src/mesa/pipe/softpipe/sp_state_sampler.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/pipe/softpipe/sp_state_sampler.c b/src/mesa/pipe/softpipe/sp_state_sampler.c
index 51b4b78287..291bbc40ad 100644
--- a/src/mesa/pipe/softpipe/sp_state_sampler.c
+++ b/src/mesa/pipe/softpipe/sp_state_sampler.c
@@ -49,6 +49,8 @@ softpipe_bind_sampler_state(struct pipe_context *pipe,
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
+   draw_flush(softpipe->draw);
+
    assert(unit < PIPE_MAX_SAMPLERS);
    softpipe->sampler[unit] = (struct pipe_sampler_state *)sampler;
 
-- 
cgit v1.2.3


From d68ea8114abdf128907618e86c0077ad719a9920 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 14:51:32 -0700
Subject: gallium: added inClipCoords param to st_draw_vertices() to indicate
 coord system of vertices

Also, export st_make_passthrough_vertex_shader() from st_cb_drawpixels.c
---
 src/mesa/state_tracker/st_cb_clear.c      |  2 +-
 src/mesa/state_tracker/st_cb_drawpixels.c | 18 +++++++++---------
 src/mesa/state_tracker/st_cb_drawpixels.h |  4 ++++
 src/mesa/state_tracker/st_draw.c          | 21 ++++++++++++---------
 src/mesa/state_tracker/st_draw.h          |  3 ++-
 5 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_clear.c b/src/mesa/state_tracker/st_cb_clear.c
index 0cd469c156..ab98b54bab 100644
--- a/src/mesa/state_tracker/st_cb_clear.c
+++ b/src/mesa/state_tracker/st_cb_clear.c
@@ -251,7 +251,7 @@ draw_quad(GLcontext *ctx,
       verts[i][1][3] = color[3];
    }
 
-   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 2);
+   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 2, GL_FALSE);
 }
 
 
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 34d420fcff..07886e7982 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -355,8 +355,8 @@ make_fragment_shader_z(struct st_context *st)
  * Create a simple vertex shader that just passes through the
  * vertex position and texcoord (and optionally, color).
  */
-static struct st_vertex_program *
-make_vertex_shader(struct st_context *st, GLboolean passColor)
+struct st_vertex_program *
+st_make_passthrough_vertex_shader(struct st_context *st, GLboolean passColor)
 {
    /* only make programs once and re-use */
    static struct st_vertex_program *progs[2] = { NULL, NULL };
@@ -572,7 +572,7 @@ draw_quad(GLcontext *ctx, GLfloat x0, GLfloat y0, GLfloat z,
       verts[i][1][3] = 1.0; /*Q*/
    }
 
-   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 2);
+   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 2, GL_FALSE);
 }
 
 
@@ -625,7 +625,7 @@ draw_quad_colored(GLcontext *ctx, GLfloat x0, GLfloat y0, GLfloat z,
       verts[i][2][3] = 1.0; /*Q*/
    }
 
-   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 3);
+   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 3, GL_FALSE);
 }
 
 
@@ -945,7 +945,7 @@ st_DrawPixels(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei height,
    if (format == GL_DEPTH_COMPONENT) {
       ps = st->state.framebuffer.zsbuf;
       stfp = make_fragment_shader_z(ctx->st);
-      stvp = make_vertex_shader(ctx->st, GL_TRUE);
+      stvp = st_make_passthrough_vertex_shader(ctx->st, GL_TRUE);
       color = ctx->Current.RasterColor;
    }
    else if (format == GL_STENCIL_INDEX) {
@@ -956,7 +956,7 @@ st_DrawPixels(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei height,
    else {
       ps = st->state.framebuffer.cbufs[0];
       stfp = combined_drawpix_fragment_program(ctx);
-      stvp = make_vertex_shader(ctx->st, GL_FALSE);
+      stvp = st_make_passthrough_vertex_shader(ctx->st, GL_FALSE);
       color = NULL;
    }
 
@@ -1111,7 +1111,7 @@ st_Bitmap(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei height,
    struct st_context *st = ctx->st;
    struct pipe_texture *pt;
 
-   stvp = make_vertex_shader(ctx->st, GL_TRUE);
+   stvp = st_make_passthrough_vertex_shader(ctx->st, GL_TRUE);
    stfp = combined_bitmap_fragment_program(ctx);
 
    st_validate_state(st);
@@ -1229,13 +1229,13 @@ st_CopyPixels(GLcontext *ctx, GLint srcx, GLint srcy,
       rbRead = st_renderbuffer(ctx->ReadBuffer->_ColorReadBuffer);
       color = NULL;
       stfp = combined_drawpix_fragment_program(ctx);
-      stvp = make_vertex_shader(ctx->st, GL_FALSE);
+      stvp = st_make_passthrough_vertex_shader(ctx->st, GL_FALSE);
    }
    else {
       rbRead = st_renderbuffer(ctx->ReadBuffer->_DepthBuffer);
       color = ctx->Current.Attrib[VERT_ATTRIB_COLOR0];
       stfp = make_fragment_shader_z(ctx->st);
-      stvp = make_vertex_shader(ctx->st, GL_TRUE);
+      stvp = st_make_passthrough_vertex_shader(ctx->st, GL_TRUE);
    }
 
    psRead = rbRead->surface;
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.h b/src/mesa/state_tracker/st_cb_drawpixels.h
index 71ba487020..b8b906f06b 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.h
+++ b/src/mesa/state_tracker/st_cb_drawpixels.h
@@ -30,6 +30,10 @@
 #define ST_CB_DRAWPIXELS_H
 
 
+extern struct st_vertex_program *
+st_make_passthrough_vertex_shader(struct st_context *st, GLboolean passColor);
+
+
 extern void st_init_drawpixels_functions(struct dd_function_table *functions);
 
 
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index c9b8e78485..ae9f5c8b11 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -354,7 +354,8 @@ st_draw_vbo(GLcontext *ctx,
 void 
 st_draw_vertices(GLcontext *ctx, unsigned prim,
                  unsigned numVertex, float *verts,
-                 unsigned numAttribs)
+                 unsigned numAttribs,
+                 GLboolean inClipCoords)
 {
    const float width = ctx->DrawBuffer->Width;
    const float height = ctx->DrawBuffer->Height;
@@ -367,14 +368,16 @@ st_draw_vertices(GLcontext *ctx, unsigned prim,
 
    assert(numAttribs > 0);
 
-   /* convert to clip coords */
-   for (i = 0; i < numVertex; i++) {
-      float x = verts[i * numAttribs * 4 + 0];
-      float y = verts[i * numAttribs * 4 + 1];
-      x = x / width * 2.0 - 1.0;
-      y = y / height * 2.0 - 1.0;
-      verts[i * numAttribs * 4 + 0] = x;
-      verts[i * numAttribs * 4 + 1] = y;
+   if (!inClipCoords) {
+      /* convert to clip coords */
+      for (i = 0; i < numVertex; i++) {
+         float x = verts[i * numAttribs * 4 + 0];
+         float y = verts[i * numAttribs * 4 + 1];
+         x = x / width * 2.0 - 1.0;
+         y = y / height * 2.0 - 1.0;
+         verts[i * numAttribs * 4 + 0] = x;
+         verts[i * numAttribs * 4 + 1] = y;
+      }
    }
 
    /* XXX create one-time */
diff --git a/src/mesa/state_tracker/st_draw.h b/src/mesa/state_tracker/st_draw.h
index 89ee790c57..171bde57e5 100644
--- a/src/mesa/state_tracker/st_draw.h
+++ b/src/mesa/state_tracker/st_draw.h
@@ -62,7 +62,8 @@ st_feedback_draw_vbo(GLcontext *ctx,
 void 
 st_draw_vertices(GLcontext *ctx, unsigned prim,
                  unsigned numVertex, float *verts,
-                 unsigned numAttribs);
+                 unsigned numAttribs,
+                 GLboolean inClipCoords);
 
 
 #endif
-- 
cgit v1.2.3


From 44bb16c4d45584384f6fbbcc7207016421200891 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 14:54:18 -0700
Subject: gallium: initial implementation of auto mipmap generation in state
 tracker

Use hardware rendering to compute/render mipmap levels.
The fallback path (which will be used for non-renderable texture formats)
isn't working yet.
---
 src/mesa/sources                       |   1 +
 src/mesa/state_tracker/st_cb_texture.c |  11 +-
 src/mesa/state_tracker/st_context.c    |   2 +
 src/mesa/state_tracker/st_gen_mipmap.c | 362 +++++++++++++++++++++++++++++++++
 src/mesa/state_tracker/st_gen_mipmap.h |  46 +++++
 5 files changed, 415 insertions(+), 7 deletions(-)
 create mode 100644 src/mesa/state_tracker/st_gen_mipmap.c
 create mode 100644 src/mesa/state_tracker/st_gen_mipmap.h

diff --git a/src/mesa/sources b/src/mesa/sources
index c0087f76e6..84492c91ac 100644
--- a/src/mesa/sources
+++ b/src/mesa/sources
@@ -234,6 +234,7 @@ STATETRACKER_SOURCES = \
 	state_tracker/st_extensions.c \
 	state_tracker/st_format.c \
 	state_tracker/st_framebuffer.c \
+	state_tracker/st_gen_mipmap.c \
 	state_tracker/st_mesa_to_tgsi.c \
 	state_tracker/st_program.c \
 	state_tracker/st_texture.c
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 8db4a804ab..3350254654 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -42,6 +42,7 @@
 #include "state_tracker/st_cb_texture.h"
 #include "state_tracker/st_format.h"
 #include "state_tracker/st_texture.h"
+#include "state_tracker/st_gen_mipmap.h"
 
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
@@ -703,13 +704,9 @@ st_TexImage(GLcontext * ctx,
       texImage->Data = NULL;
    }
 
-#if 0
-   /* GL_SGIS_generate_mipmap -- this can be accelerated now.
-    */
+#if 01
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      intel_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 #endif
 }
@@ -1538,7 +1535,7 @@ st_init_texture_functions(struct dd_function_table *functions)
    functions->CopyTexSubImage1D = st_CopyTexSubImage1D;
    functions->CopyTexSubImage2D = st_CopyTexSubImage2D;
    functions->CopyTexSubImage3D = st_CopyTexSubImage3D;
-   functions->GenerateMipmap = _mesa_generate_mipmap;
+   functions->GenerateMipmap = st_generate_mipmap;
 
    functions->GetTexImage = st_GetTexImage;
 
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 9c206c057a..bf4618bed8 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -49,6 +49,7 @@
 #include "st_atom.h"
 #include "st_draw.h"
 #include "st_extensions.h"
+#include "st_gen_mipmap.h"
 #include "st_program.h"
 #include "pipe/p_context.h"
 #include "pipe/p_winsys.h"
@@ -96,6 +97,7 @@ st_create_context_priv( GLcontext *ctx, struct pipe_context *pipe )
 
    st_init_atoms( st );
    st_init_draw( st );
+   st_init_generate_mipmap(st);
 
    /* we want all vertex data to be placed in buffer objects */
    vbo_use_buffer_objects(ctx);
diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
new file mode 100644
index 0000000000..16f9e4cd27
--- /dev/null
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -0,0 +1,362 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "main/imports.h"
+#include "main/mipmap.h"
+#include "main/teximage.h"
+
+#include "shader/prog_instruction.h"
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/cso_cache/cso_cache.h"
+
+#include "st_context.h"
+#include "st_draw.h"
+#include "st_gen_mipmap.h"
+#include "st_program.h"
+#include "st_cb_texture.h"
+
+
+
+static void *blend_cso = NULL;
+static void *depthstencil_cso = NULL;
+static void *rasterizer_cso = NULL;
+static void *sampler_cso = NULL;
+
+static struct st_fragment_program *stfp = NULL;
+static struct st_vertex_program *stvp = NULL;
+
+
+
+static struct st_fragment_program *
+make_tex_fragment_program(GLcontext *ctx)
+{
+   struct st_fragment_program *stfp;
+   struct gl_program *p;
+   GLuint ic = 0;
+
+   p = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0);
+   if (!p)
+      return NULL;
+
+   p->NumInstructions = 2;
+
+   p->Instructions = _mesa_alloc_instructions(p->NumInstructions);
+   if (!p->Instructions) {
+      ctx->Driver.DeleteProgram(ctx, p);
+      return NULL;
+   }
+   _mesa_init_instructions(p->Instructions, p->NumInstructions);
+
+   /* TEX result.color, fragment.texcoord[0], texture[0], 2D; */
+   p->Instructions[ic].Opcode = OPCODE_TEX;
+   p->Instructions[ic].DstReg.File = PROGRAM_OUTPUT;
+   p->Instructions[ic].DstReg.Index = FRAG_RESULT_COLR;
+   p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT;
+   p->Instructions[ic].SrcReg[0].Index = FRAG_ATTRIB_TEX0;
+   p->Instructions[ic].TexSrcUnit = 0;
+   p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX;
+   ic++;
+
+   /* END; */
+   p->Instructions[ic++].Opcode = OPCODE_END;
+
+   assert(ic == p->NumInstructions);
+
+   p->InputsRead = FRAG_BIT_TEX0;
+   p->OutputsWritten = (1 << FRAG_RESULT_COLR);
+
+   stfp = (struct st_fragment_program *) p;
+
+   st_translate_fragment_program(ctx->st, stfp, NULL,
+                                 stfp->tokens, ST_MAX_SHADER_TOKENS);
+
+   return stfp;
+}
+
+
+
+
+/**
+ * one-time init for generate mipmap
+ * XXX Note: there may be other times we need no-op/simple state like this.
+ * In that case, some code refactoring would be good.
+ */
+void
+st_init_generate_mipmap(struct st_context *st)
+{
+   struct pipe_context *pipe = st->pipe;
+   struct pipe_blend_state blend;
+   struct pipe_rasterizer_state rasterizer;
+   struct pipe_sampler_state sampler;
+   struct pipe_depth_stencil_alpha_state depthstencil;
+
+   assert(!blend_cso);
+
+   memset(&blend, 0, sizeof(blend));
+   blend.colormask = PIPE_MASK_RGBA;
+   blend_cso = pipe->create_blend_state(pipe, &blend);
+
+   memset(&depthstencil, 0, sizeof(depthstencil));
+   depthstencil_cso = pipe->create_depth_stencil_alpha_state(pipe, &depthstencil);
+
+   memset(&rasterizer, 0, sizeof(rasterizer));
+   rasterizer_cso = pipe->create_rasterizer_state(pipe, &rasterizer);
+
+   memset(&sampler, 0, sizeof(sampler));
+   sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+   sampler.min_img_filter = PIPE_TEX_FILTER_LINEAR;
+   sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
+   sampler.normalized_coords = 1;
+   sampler_cso = pipe->create_sampler_state(pipe, &sampler);
+
+   stfp = make_tex_fragment_program(st->ctx);
+   stvp = st_make_passthrough_vertex_shader(st, GL_FALSE);
+}
+
+
+void
+st_destroy_generate_mipmpap(struct st_context *st)
+{
+   struct pipe_context *pipe = st->pipe;
+
+   pipe->delete_blend_state(pipe, blend_cso);
+   pipe->delete_depth_stencil_alpha_state(pipe, depthstencil_cso);
+   pipe->delete_rasterizer_state(pipe, rasterizer_cso);
+   pipe->delete_sampler_state(pipe, sampler_cso);
+
+   /* XXX free stfp, stvp */
+
+   blend_cso = NULL;
+   depthstencil_cso = NULL;
+   rasterizer_cso = NULL;
+   sampler_cso = NULL;
+}
+
+
+static void
+simple_viewport(struct pipe_context *pipe, uint width, uint height)
+{
+   struct pipe_viewport_state vp;
+
+   vp.scale[0] =  0.5 * width;
+   vp.scale[1] = -0.5 * height;
+   vp.scale[2] = 1.0;
+   vp.scale[3] = 1.0;
+   vp.translate[0] = 0.5 * width;
+   vp.translate[1] = 0.5 * height;
+   vp.translate[2] = 0.0;
+   vp.translate[3] = 0.0;
+
+   pipe->set_viewport_state(pipe, &vp);
+}
+
+
+
+/*
+ * Draw simple [-1,1]x[-1,1] quad
+ */
+static void
+draw_quad(GLcontext *ctx)
+{
+   GLfloat verts[4][2][4]; /* four verts, two attribs, XYZW */
+   GLuint i;
+   GLfloat sLeft = 0.0, sRight = 1.0;
+   GLfloat tTop = 1.0, tBot = 0.0;
+   GLfloat x0 = -1.0, x1 = 1.0;
+   GLfloat y0 = -1.0, y1 = 1.0;
+
+   /* upper-left */
+   verts[0][0][0] = x0;    /* attr[0].x */
+   verts[0][0][1] = y0;    /* attr[0].y */
+   verts[0][1][0] = sLeft; /* attr[1].s */
+   verts[0][1][1] = tTop;  /* attr[1].t */
+
+   /* upper-right */
+   verts[1][0][0] = x1;
+   verts[1][0][1] = y0;
+   verts[1][1][0] = sRight;
+   verts[1][1][1] = tTop;
+
+   /* lower-right */
+   verts[2][0][0] = x1;
+   verts[2][0][1] = y1;
+   verts[2][1][0] = sRight;
+   verts[2][1][1] = tBot;
+
+   /* lower-left */
+   verts[3][0][0] = x0;
+   verts[3][0][1] = y1;
+   verts[3][1][0] = sLeft;
+   verts[3][1][1] = tBot;
+
+   /* same for all verts: */
+   for (i = 0; i < 4; i++) {
+      verts[i][0][2] = 0.0; /*Z*/
+      verts[i][0][3] = 1.0; /*W*/
+      verts[i][1][2] = 0.0; /*R*/
+      verts[i][1][3] = 1.0; /*Q*/
+   }
+
+   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 2, GL_TRUE);
+}
+
+
+
+/**
+ * Generate mipmap levels using hardware rendering.
+ * \return TRUE if successful, FALSE if not possible
+ */
+static boolean
+st_render_mipmap(struct st_context *st,
+                 struct pipe_texture *pt,
+                 uint baseLevel, uint lastLevel)
+{
+   struct pipe_context *pipe = st->pipe;
+   struct pipe_framebuffer_state fb;
+   const uint face = 0, zslice = 0;
+   const uint first_level_save = pt->first_level;
+   uint dstLevel;
+
+   /* check if we can render in the texture's format */
+   if (!pipe->is_format_supported(pipe, pt->format, PIPE_SURFACE)) {
+      return FALSE;
+   }
+
+   /* init framebuffer state */
+   memset(&fb, 0, sizeof(fb));
+   fb.num_cbufs = 1;
+
+   /* bind CSOs */
+   pipe->bind_blend_state(pipe, blend_cso);
+   pipe->bind_depth_stencil_alpha_state(pipe, depthstencil_cso);
+   pipe->bind_rasterizer_state(pipe, rasterizer_cso);
+   pipe->bind_sampler_state(pipe, 0, sampler_cso);
+
+   /* bind shaders */
+   pipe->bind_fs_state(pipe, stfp->fs->data);
+   pipe->bind_vs_state(pipe, stvp->cso->data);
+
+   for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) {
+      const uint srcLevel = dstLevel - 1;
+
+      /*
+       * Setup framebuffer / dest surface
+       */
+      fb.cbufs[0] = pipe->get_tex_surface(pipe, pt, face, dstLevel, zslice);
+      pipe->set_framebuffer_state(pipe, &fb);
+
+      simple_viewport(pipe, pt->width[dstLevel], pt->height[dstLevel]);
+
+      /*
+       * Setup src texture, override pt->first_level so we sample from
+       * the right mipmap level.
+       */
+      pt->first_level = srcLevel;
+      pipe->set_sampler_texture(pipe, 0, pt);
+
+      draw_quad(st->ctx);
+   }
+
+   /* restore first_level */
+   pt->first_level = first_level_save;
+
+   /* restore pipe state */
+   if (st->state.rasterizer)
+      pipe->bind_rasterizer_state(pipe, st->state.rasterizer->data);
+   if (st->state.fs)
+      pipe->bind_fs_state(pipe, st->state.fs->data);
+   if (st->state.vs)
+      pipe->bind_vs_state(pipe, st->state.vs->cso->data);
+   if (st->state.sampler[0])
+      pipe->bind_sampler_state(pipe, 0, st->state.sampler[0]->data);
+   pipe->set_sampler_texture(pipe, 0, st->state.sampler_texture[0]);
+   pipe->set_viewport_state(pipe, &st->state.viewport);
+
+   return TRUE;
+}
+
+
+
+void
+st_generate_mipmap(GLcontext *ctx, GLenum target,
+                   struct gl_texture_object *texObj)
+{
+   struct st_context *st = ctx->st;
+   struct pipe_texture *pt = st_get_texobj_texture(texObj);
+   const uint baseLevel = texObj->BaseLevel;
+   const uint lastLevel = pt->last_level;
+   uint dstLevel;
+
+   if (!st_render_mipmap(st, pt, baseLevel, lastLevel)) {
+      abort();
+      /* XXX the following won't really work at this time */
+      _mesa_generate_mipmap(ctx, target, texObj);
+      return;
+   }
+
+   for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) {
+      const uint srcLevel = dstLevel - 1;
+      const struct gl_texture_image *srcImage
+         = _mesa_get_tex_image(ctx, texObj, target, srcLevel);
+      struct gl_texture_image *dstImage;
+      struct st_texture_image *stImage;
+      uint dstWidth = pt->width[dstLevel];
+      uint dstHeight = pt->height[dstLevel];
+      uint dstDepth = pt->depth[dstLevel];
+      uint border = srcImage->Border;
+
+
+      dstImage = _mesa_get_tex_image(ctx, texObj, target, dstLevel);
+      if (!dstImage) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "generating mipmaps");
+         return;
+      }
+
+      if (dstImage->ImageOffsets)
+         _mesa_free(dstImage->ImageOffsets);
+
+      /* Free old image data */
+      if (dstImage->Data)
+         ctx->Driver.FreeTexImageData(ctx, dstImage);
+
+      /* initialize new image */
+      _mesa_init_teximage_fields(ctx, target, dstImage, dstWidth, dstHeight,
+                                 dstDepth, border, srcImage->InternalFormat);
+
+      dstImage->TexFormat = srcImage->TexFormat;
+
+      stImage = (struct st_texture_image *) dstImage;
+      stImage->pt = pt;
+   }
+
+}
diff --git a/src/mesa/state_tracker/st_gen_mipmap.h b/src/mesa/state_tracker/st_gen_mipmap.h
new file mode 100644
index 0000000000..7668c1e44e
--- /dev/null
+++ b/src/mesa/state_tracker/st_gen_mipmap.h
@@ -0,0 +1,46 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef ST_GEN_MIPMAP_H
+#define ST_GEN_MIPMAP_H
+
+
+extern void
+st_init_generate_mipmap(struct st_context *st);
+
+
+extern void
+st_destroy_generate_mipmpap(struct st_context *st);
+
+
+extern void
+st_generate_mipmap(GLcontext *ctx, GLenum target,
+                   struct gl_texture_object *texObj);
+
+
+#endif /* ST_GEN_MIPMAP_H */
-- 
cgit v1.2.3


From fa0a651a3e849908a020b40f723ed347b2054dbf Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 14:55:33 -0700
Subject: fix comment typos

---
 src/mesa/pipe/tgsi/exec/tgsi_exec.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/pipe/tgsi/exec/tgsi_exec.c b/src/mesa/pipe/tgsi/exec/tgsi_exec.c
index 463ff0d9da..336ae1c8b6 100644
--- a/src/mesa/pipe/tgsi/exec/tgsi_exec.c
+++ b/src/mesa/pipe/tgsi/exec/tgsi_exec.c
@@ -2010,7 +2010,7 @@ exec_instruction(
 
    case TGSI_OPCODE_TXB:
       /* Texture lookup with lod bias */
-      /* src[0] = texcoord (src[0].w = load bias) */
+      /* src[0] = texcoord (src[0].w = LOD bias) */
       /* src[1] = sampler unit */
       exec_tex(mach, inst, TRUE);
       break;
@@ -2026,7 +2026,7 @@ exec_instruction(
 
    case TGSI_OPCODE_TXL:
       /* Texture lookup with explit LOD */
-      /* src[0] = texcoord (src[0].w = load bias) */
+      /* src[0] = texcoord (src[0].w = LOD) */
       /* src[1] = sampler unit */
       exec_tex(mach, inst, TRUE);
       break;
-- 
cgit v1.2.3


From fc65fb54eec6562b158e38f9fc426b49174ba912 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 14:56:38 -0700
Subject: gallium: include st_cb_drawpixels.h

---
 src/mesa/state_tracker/st_gen_mipmap.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
index 16f9e4cd27..a6ac9a55fb 100644
--- a/src/mesa/state_tracker/st_gen_mipmap.c
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -40,6 +40,7 @@
 #include "st_draw.h"
 #include "st_gen_mipmap.h"
 #include "st_program.h"
+#include "st_cb_drawpixels.h"
 #include "st_cb_texture.h"
 
 
-- 
cgit v1.2.3


From 6aad1d9bbc2dd77b600c60e471da3f6e392c09ab Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Sat, 9 Feb 2008 14:08:54 +1100
Subject: nv40: delay all state emit until before draw

---
 src/mesa/pipe/nouveau/nouveau_stateobj.h |  2 +-
 src/mesa/pipe/nv40/nv40_context.h        | 22 +++++++++++++++++++---
 src/mesa/pipe/nv40/nv40_state.c          | 26 +++++++++++++++++---------
 src/mesa/pipe/nv40/nv40_state_emit.c     | 25 +++++++++++++++++++++++++
 src/mesa/pipe/nv40/nv40_vbo.c            |  5 ++++-
 5 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/src/mesa/pipe/nouveau/nouveau_stateobj.h b/src/mesa/pipe/nouveau/nouveau_stateobj.h
index 8dfc0e9e9a..58167a24de 100644
--- a/src/mesa/pipe/nouveau/nouveau_stateobj.h
+++ b/src/mesa/pipe/nouveau/nouveau_stateobj.h
@@ -30,7 +30,7 @@ so_new(unsigned push, unsigned reloc)
 	struct nouveau_stateobj *so;
 
 	so = malloc(sizeof(struct nouveau_stateobj));
-	so->refcount = 0;
+	so->refcount = 1;
 	so->push = malloc(sizeof(unsigned) * push);
 	so->reloc = malloc(sizeof(struct nouveau_stateobj_reloc) * reloc);
 
diff --git a/src/mesa/pipe/nv40/nv40_context.h b/src/mesa/pipe/nv40/nv40_context.h
index 1a31f00ad6..4aa34847e8 100644
--- a/src/mesa/pipe/nv40/nv40_context.h
+++ b/src/mesa/pipe/nv40/nv40_context.h
@@ -22,9 +22,18 @@
 #define NOUVEAU_MSG(fmt, args...) \
 	fprintf(stderr, "nouveau: "fmt, ##args);
 
-#define NV40_NEW_VERTPROG	(1 << 1)
-#define NV40_NEW_FRAGPROG	(1 << 2)
-#define NV40_NEW_ARRAYS		(1 << 3)
+#define NV40_NEW_BLEND		(1 <<  0)
+#define NV40_NEW_RAST		(1 <<  1)
+#define NV40_NEW_ZSA		(1 <<  2)
+#define NV40_NEW_SAMPLER	(1 <<  3)
+#define NV40_NEW_FB		(1 <<  4)
+#define NV40_NEW_STIPPLE	(1 <<  5)
+#define NV40_NEW_SCISSOR	(1 <<  6)
+#define NV40_NEW_VIEWPORT	(1 <<  7)
+#define NV40_NEW_BCOL		(1 <<  8)
+#define NV40_NEW_VERTPROG	(1 <<  9)
+#define NV40_NEW_FRAGPROG	(1 << 10)
+#define NV40_NEW_ARRAYS		(1 << 11)
 
 struct nv40_context {
 	struct pipe_context pipe;
@@ -51,6 +60,13 @@ struct nv40_context {
 	struct nouveau_stateobj *so_framebuffer;
 	struct nouveau_stateobj *so_fragtex[16];
 	struct nouveau_stateobj *so_vtxbuf;
+	struct nouveau_stateobj *so_blend;
+	struct nouveau_stateobj *so_rast;
+	struct nouveau_stateobj *so_zsa;
+	struct nouveau_stateobj *so_bcol;
+	struct nouveau_stateobj *so_scissor;
+	struct nouveau_stateobj *so_viewport;
+	struct nouveau_stateobj *so_stipple;
 
 	struct {
 		struct nouveau_resource *exec_heap;
diff --git a/src/mesa/pipe/nv40/nv40_state.c b/src/mesa/pipe/nv40/nv40_state.c
index 125134afdc..ab53b03cb0 100644
--- a/src/mesa/pipe/nv40/nv40_state.c
+++ b/src/mesa/pipe/nv40/nv40_state.c
@@ -53,7 +53,8 @@ nv40_blend_state_bind(struct pipe_context *pipe, void *hwcso)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 
-	so_emit(nv40->nvws, hwcso);
+	so_ref(hwcso, &nv40->so_blend);
+	nv40->dirty |= NV40_NEW_BLEND;
 }
 
 static void
@@ -354,7 +355,8 @@ nv40_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 
-	so_emit(nv40->nvws, hwcso);
+	so_ref(hwcso, &nv40->so_rast);
+	nv40->dirty |= NV40_NEW_RAST;
 }
 
 static void
@@ -420,7 +422,8 @@ nv40_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 
-	so_emit(nv40->nvws, hwcso);
+	so_ref(hwcso, &nv40->so_zsa);
+	nv40->dirty |= NV40_NEW_ZSA;
 }
 
 static void
@@ -508,8 +511,9 @@ nv40_set_blend_color(struct pipe_context *pipe,
 		       (float_to_ubyte(bcol->color[1]) <<  8) |
 		       (float_to_ubyte(bcol->color[2]) <<  0)));
 
-	so_emit(nv40->nvws, so);
+	so_ref(so, &nv40->so_bcol);
 	so_ref(NULL, &so);
+	nv40->dirty |= NV40_NEW_BCOL;
 }
 
 static void
@@ -677,8 +681,9 @@ nv40_set_framebuffer_state(struct pipe_context *pipe,
 	so_data  (so, ((w - 1) << 16) | 0);
 	so_data  (so, ((h - 1) << 16) | 0);
 
-	so_emit(nv40->nvws, so);
-	so_ref (so, &nv40->so_framebuffer);
+	so_ref(so, &nv40->so_framebuffer);
+	so_ref(NULL, &so);
+	nv40->dirty |= NV40_NEW_FB;
 }
 
 static void
@@ -693,8 +698,9 @@ nv40_set_polygon_stipple(struct pipe_context *pipe,
 	for (i = 0; i < 32; i++)
 		so_data(so, stipple->stipple[i]);
 
-	so_emit(nv40->nvws, so);
+	so_ref(so, &nv40->so_stipple);
 	so_ref(NULL, &so);
+	nv40->dirty |= NV40_NEW_STIPPLE;
 }
 
 static void
@@ -708,8 +714,9 @@ nv40_set_scissor_state(struct pipe_context *pipe,
 	so_data  (so, ((s->maxx - s->minx) << 16) | s->minx);
 	so_data  (so, ((s->maxy - s->miny) << 16) | s->miny);
 
-	so_emit(nv40->nvws, so);
+	so_ref(so, &nv40->so_scissor);
 	so_ref(NULL, &so);
+	nv40->dirty |= NV40_NEW_SCISSOR;
 }
 
 static void
@@ -729,8 +736,9 @@ nv40_set_viewport_state(struct pipe_context *pipe,
 	so_data  (so, fui(vpt->scale[2]));
 	so_data  (so, fui(vpt->scale[3]));
 
-	so_emit(nv40->nvws, so);
+	so_ref(so, &nv40->so_viewport);
 	so_ref(NULL, &so);
+	nv40->dirty |= NV40_NEW_VIEWPORT;
 }
 
 static void
diff --git a/src/mesa/pipe/nv40/nv40_state_emit.c b/src/mesa/pipe/nv40/nv40_state_emit.c
index 3a22cd4bd5..a10c995548 100644
--- a/src/mesa/pipe/nv40/nv40_state_emit.c
+++ b/src/mesa/pipe/nv40/nv40_state_emit.c
@@ -25,6 +25,30 @@ nv40_state_emit_dummy_relocs(struct nv40_context *nv40)
 void
 nv40_emit_hw_state(struct nv40_context *nv40)
 {
+	if (nv40->dirty & NV40_NEW_FB)
+		so_emit(nv40->nvws, nv40->so_framebuffer);
+
+	if (nv40->dirty & NV40_NEW_BLEND)
+		so_emit(nv40->nvws, nv40->so_blend);
+
+	if (nv40->dirty & NV40_NEW_RAST)
+		so_emit(nv40->nvws, nv40->so_rast);
+
+	if (nv40->dirty & NV40_NEW_ZSA)
+		so_emit(nv40->nvws, nv40->so_zsa);
+
+	if (nv40->dirty & NV40_NEW_BCOL)
+		so_emit(nv40->nvws, nv40->so_bcol);
+
+	if (nv40->dirty & NV40_NEW_SCISSOR)
+		so_emit(nv40->nvws, nv40->so_scissor);
+
+	if (nv40->dirty & NV40_NEW_VIEWPORT)
+		so_emit(nv40->nvws, nv40->so_viewport);
+
+	if (nv40->dirty & NV40_NEW_STIPPLE)
+		so_emit(nv40->nvws, nv40->so_stipple);
+
 	if (nv40->dirty & NV40_NEW_FRAGPROG) {
 		nv40_fragprog_bind(nv40, nv40->fragprog.current);
 		/*XXX: clear NV40_NEW_FRAGPROG if no new program uploaded */
@@ -46,6 +70,7 @@ nv40_emit_hw_state(struct nv40_context *nv40)
 	}
 
 	nv40->dirty_samplers = 0;
+	nv40->dirty = 0;
 
 	nv40_state_emit_dummy_relocs(nv40);
 }
diff --git a/src/mesa/pipe/nv40/nv40_vbo.c b/src/mesa/pipe/nv40/nv40_vbo.c
index e2cb3fda8f..fd1d884193 100644
--- a/src/mesa/pipe/nv40/nv40_vbo.c
+++ b/src/mesa/pipe/nv40/nv40_vbo.c
@@ -158,6 +158,7 @@ nv40_vbo_arrays_update(struct nv40_context *nv40, struct pipe_buffer *ib,
 	so_emit(nv40->nvws, vtxfmt);
 	so_emit(nv40->nvws, vtxbuf);
 	so_ref (vtxbuf, &nv40->so_vtxbuf);
+	so_ref (NULL, &vtxbuf);
 	so_ref (NULL, &vtxfmt);
 }
 
@@ -165,8 +166,10 @@ static boolean
 nv40_vbo_validate_state(struct nv40_context *nv40,
 			struct pipe_buffer *ib, unsigned ib_format)
 {
+	unsigned vdn = nv40->dirty & NV40_NEW_ARRAYS;
+
 	nv40_emit_hw_state(nv40);
-	if (nv40->dirty & NV40_NEW_ARRAYS || ib) {
+	if (vdn || ib) {
 		nv40_vbo_arrays_update(nv40, ib, ib_format);
 		nv40->dirty &= ~NV40_NEW_ARRAYS;
 	}
-- 
cgit v1.2.3


From fc38b21d2a27793f9473d0bb7f871d6a694e6923 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Sat, 9 Feb 2008 16:25:29 +1100
Subject: nouveau: interface updates

---
 src/mesa/pipe/nouveau/nouveau_gldefs.h |  2 +-
 src/mesa/pipe/nv40/nv40_context.c      |  2 ++
 src/mesa/pipe/nv40/nv40_fragprog.c     |  1 +
 src/mesa/pipe/nv40/nv40_fragtex.c      |  1 +
 src/mesa/pipe/nv40/nv40_miptree.c      | 35 +++++++++++++++++-----------------
 src/mesa/pipe/nv50/nv50_miptree.c      |  5 +++--
 6 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/src/mesa/pipe/nouveau/nouveau_gldefs.h b/src/mesa/pipe/nouveau/nouveau_gldefs.h
index 8ba3bdef64..e1015c93a2 100644
--- a/src/mesa/pipe/nouveau/nouveau_gldefs.h
+++ b/src/mesa/pipe/nouveau/nouveau_gldefs.h
@@ -189,7 +189,7 @@ nvgl_primitive(unsigned prim) {
 	case PIPE_PRIM_POLYGON:
 		return 0x000a;
 	default:
-		assert(0);
+		return 0x0001;
 	}
 }
 
diff --git a/src/mesa/pipe/nv40/nv40_context.c b/src/mesa/pipe/nv40/nv40_context.c
index 1351a79fe0..49c67a8fd5 100644
--- a/src/mesa/pipe/nv40/nv40_context.c
+++ b/src/mesa/pipe/nv40/nv40_context.c
@@ -71,6 +71,8 @@ nv40_get_paramf(struct pipe_context *pipe, int param)
 		return 16.0;
 	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
 		return 4.0;
+	case PIPE_CAP_BITMAP_TEXCOORD_BIAS:
+		return 0.0;
 	default:
 		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
 		return 0.0;
diff --git a/src/mesa/pipe/nv40/nv40_fragprog.c b/src/mesa/pipe/nv40/nv40_fragprog.c
index e650c97541..1165661010 100644
--- a/src/mesa/pipe/nv40/nv40_fragprog.c
+++ b/src/mesa/pipe/nv40/nv40_fragprog.c
@@ -826,6 +826,7 @@ nv40_fragprog_bind(struct nv40_context *nv40, struct nv40_fragment_program *fp)
 
 	so_emit(nv40->nvws, so);
 	so_ref(so, &fp->so);
+	so_ref(NULL, &so);
 
 	nv40->fragprog.active = fp;
 }
diff --git a/src/mesa/pipe/nv40/nv40_fragtex.c b/src/mesa/pipe/nv40/nv40_fragtex.c
index 283d49704a..c87e361831 100644
--- a/src/mesa/pipe/nv40/nv40_fragtex.c
+++ b/src/mesa/pipe/nv40/nv40_fragtex.c
@@ -121,6 +121,7 @@ nv40_fragtex_build(struct nv40_context *nv40, int unit)
 
 	so_emit(nv40->nvws, so);
 	so_ref (so, &nv40->so_fragtex[unit]);
+	so_ref (NULL, &so);
 }
 
 void
diff --git a/src/mesa/pipe/nv40/nv40_miptree.c b/src/mesa/pipe/nv40/nv40_miptree.c
index 48062d1ae8..df70feaa05 100644
--- a/src/mesa/pipe/nv40/nv40_miptree.c
+++ b/src/mesa/pipe/nv40/nv40_miptree.c
@@ -53,27 +53,26 @@ nv40_miptree_layout(struct nv40_miptree *nv40mt)
 	nv40mt->total_size = offset;
 }
 
-static void
-nv40_miptree_create(struct pipe_context *pipe, struct pipe_texture **pt)
+static struct pipe_texture *
+nv40_miptree_create(struct pipe_context *pipe, const struct pipe_texture *pt)
 {
 	struct pipe_winsys *ws = pipe->winsys;
-	struct nv40_miptree *nv40mt;
-
-	nv40mt = realloc(*pt, sizeof(struct nv40_miptree));
-	if (!nv40mt)
-		return;
-	*pt = NULL;
-
-	nv40_miptree_layout(nv40mt);
-
-	nv40mt->buffer = ws->buffer_create(ws, 256, PIPE_BUFFER_USAGE_PIXEL,
-					   nv40mt->total_size);
-	if (!nv40mt->buffer) {
-		free(nv40mt);
-		return;
+	struct nv40_miptree *mt;
+
+	mt = malloc(sizeof(struct nv40_miptree));
+	if (!mt)
+		return NULL;
+	mt->base = *pt;
+	nv40_miptree_layout(mt);
+
+	mt->buffer = ws->buffer_create(ws, 256, PIPE_BUFFER_USAGE_PIXEL,
+				       mt->total_size);
+	if (!mt->buffer) {
+		free(mt);
+		return NULL;
 	}
-	
-	*pt = &nv40mt->base;
+
+	return &mt->base;
 }
 
 static void
diff --git a/src/mesa/pipe/nv50/nv50_miptree.c b/src/mesa/pipe/nv50/nv50_miptree.c
index 51442d64f3..0c034ed438 100644
--- a/src/mesa/pipe/nv50/nv50_miptree.c
+++ b/src/mesa/pipe/nv50/nv50_miptree.c
@@ -4,10 +4,11 @@
 
 #include "nv50_context.h"
 
-static void
-nv50_miptree_create(struct pipe_context *pipe, struct pipe_texture **pt)
+static struct pipe_texture *
+nv50_miptree_create(struct pipe_context *pipe, const struct pipe_texture *pt)
 {
 	NOUVEAU_ERR("unimplemented\n");
+	return NULL;
 }
 
 static void
-- 
cgit v1.2.3


From ae78e6b549c8c67c0997f79bf1fdfac7929df92a Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Sun, 10 Feb 2008 15:16:50 +1100
Subject: nv40: prep for multiple pipe contexts on a single hw channel

---
 src/mesa/pipe/nv40/nv40_context.c  | 237 ++++++++++++++++++++-----------------
 src/mesa/pipe/nv40/nv40_context.h  |  29 +++--
 src/mesa/pipe/nv40/nv40_fragprog.c |   4 +-
 src/mesa/pipe/nv40/nv40_fragtex.c  |   4 +-
 src/mesa/pipe/nv40/nv40_query.c    |  11 +-
 src/mesa/pipe/nv40/nv40_state.c    |  83 ++++++-------
 src/mesa/pipe/nv40/nv40_vbo.c      |   8 +-
 src/mesa/pipe/nv40/nv40_vertprog.c |   4 +-
 8 files changed, 206 insertions(+), 174 deletions(-)

diff --git a/src/mesa/pipe/nv40/nv40_context.c b/src/mesa/pipe/nv40/nv40_context.c
index 49c67a8fd5..302ad04c15 100644
--- a/src/mesa/pipe/nv40/nv40_context.c
+++ b/src/mesa/pipe/nv40/nv40_context.c
@@ -5,6 +5,10 @@
 
 #include "nv40_context.h"
 
+#define NV4X_GRCLASS4097_CHIPSETS 0x00000baf
+#define NV4X_GRCLASS4497_CHIPSETS 0x00005450
+#define NV6X_GRCLASS4497_CHIPSETS 0x00000088
+
 static const char *
 nv40_get_name(struct pipe_context *pipe)
 {
@@ -93,7 +97,7 @@ nv40_flush(struct pipe_context *pipe, unsigned flags)
 	}
 
 	if (flags & PIPE_FLUSH_WAIT) {
-		nvws->notifier_reset(nv40->sync, 0);
+		nvws->notifier_reset(nv40->hw->sync, 0);
 		BEGIN_RING(curie, 0x104, 1);
 		OUT_RING  (0);
 		BEGIN_RING(curie, 0x100, 1);
@@ -103,104 +107,32 @@ nv40_flush(struct pipe_context *pipe, unsigned flags)
 	FIRE_RING();
 
 	if (flags & PIPE_FLUSH_WAIT)
-		nvws->notifier_wait(nv40->sync, 0, 0, 2000);
+		nvws->notifier_wait(nv40->hw->sync, 0, 0, 2000);
 }
 
 static void
-nv40_destroy(struct pipe_context *pipe)
+nv40_channel_takedown(struct nv40_channel_context *cnv40)
 {
-	struct nv40_context *nv40 = nv40_context(pipe);
-	struct nouveau_winsys *nvws = nv40->nvws;
-
-	if (nv40->draw)
-		draw_destroy(nv40->draw);
-
-	nvws->res_free(&nv40->vertprog.exec_heap);
-	nvws->res_free(&nv40->vertprog.data_heap);
-
-	nvws->res_free(&nv40->query_heap);
-	nvws->notifier_free(&nv40->query);
-
-	nvws->notifier_free(&nv40->sync);
-
-	nvws->grobj_free(&nv40->curie);
-
-	free(nv40);
+	struct nouveau_winsys *nvws = cnv40->nvws;
+
+	nvws->res_free(&cnv40->vp_exec_heap);
+	nvws->res_free(&cnv40->vp_data_heap);
+	nvws->res_free(&cnv40->query_heap);
+	nvws->notifier_free(&cnv40->query);
+	nvws->notifier_free(&cnv40->sync);
+	nvws->grobj_free(&cnv40->curie);
+	free(cnv40);
 }
 
-static boolean
-nv40_init_hwctx(struct nv40_context *nv40, int curie_class)
+static struct nv40_channel_context *
+nv40_channel_init(struct pipe_winsys *ws, struct nouveau_winsys *nvws,
+		  unsigned chipset)
 {
-	struct nouveau_winsys *nvws = nv40->nvws;
+	struct nv40_channel_context *cnv40 = NULL;
+	struct nouveau_stateobj *so;
+	unsigned curie_class = 0;
 	int ret;
 
-	ret = nvws->grobj_alloc(nvws, curie_class, &nv40->curie);
-	if (ret) {
-		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
-		return FALSE;
-	}
-
-	BEGIN_RING(curie, NV40TCL_DMA_NOTIFY, 1);
-	OUT_RING  (nv40->sync->handle);
-	BEGIN_RING(curie, NV40TCL_DMA_TEXTURE0, 2);
-	OUT_RING  (nvws->channel->vram->handle);
-	OUT_RING  (nvws->channel->gart->handle);
-	BEGIN_RING(curie, NV40TCL_DMA_COLOR1, 1);
-	OUT_RING  (nvws->channel->vram->handle);
-	BEGIN_RING(curie, NV40TCL_DMA_COLOR0, 2);
-	OUT_RING  (nvws->channel->vram->handle);
-	OUT_RING  (nvws->channel->vram->handle);
-	BEGIN_RING(curie, NV40TCL_DMA_VTXBUF0, 2);
-	OUT_RING  (nvws->channel->vram->handle);
-	OUT_RING  (nvws->channel->gart->handle);
-	BEGIN_RING(curie, NV40TCL_DMA_FENCE, 2);
-	OUT_RING  (0);
-	OUT_RING  (nv40->query->handle);
-	BEGIN_RING(curie, NV40TCL_DMA_UNK01AC, 2);
-	OUT_RING  (nvws->channel->vram->handle);
-	OUT_RING  (nvws->channel->vram->handle);
-	BEGIN_RING(curie, NV40TCL_DMA_COLOR2, 2);
-	OUT_RING  (nvws->channel->vram->handle);
-	OUT_RING  (nvws->channel->vram->handle);
-
-	BEGIN_RING(curie, 0x1ea4, 3);
-	OUT_RING  (0x00000010);
-	OUT_RING  (0x01000100);
-	OUT_RING  (0xff800006);
-
-	/* vtxprog output routing */
-	BEGIN_RING(curie, 0x1fc4, 1);
-	OUT_RING  (0x06144321);
-	BEGIN_RING(curie, 0x1fc8, 2);
-	OUT_RING  (0xedcba987);
-	OUT_RING  (0x00000021);
-	BEGIN_RING(curie, 0x1fd0, 1);
-	OUT_RING  (0x00171615);
-	BEGIN_RING(curie, 0x1fd4, 1);
-	OUT_RING  (0x001b1a19);
-
-	BEGIN_RING(curie, 0x1ef8, 1);
-	OUT_RING  (0x0020ffff);
-	BEGIN_RING(curie, 0x1d64, 1);
-	OUT_RING  (0x00d30000);
-	BEGIN_RING(curie, 0x1e94, 1);
-	OUT_RING  (0x00000001);
-
-	FIRE_RING ();
-	return TRUE;
-}
-
-#define NV4X_GRCLASS4097_CHIPSETS 0x00000baf
-#define NV4X_GRCLASS4497_CHIPSETS 0x00005450
-#define NV6X_GRCLASS4497_CHIPSETS 0x00000088
-
-struct pipe_context *
-nv40_create(struct pipe_winsys *pipe_winsys, struct nouveau_winsys *nvws,
-	    unsigned chipset)
-{
-	struct nv40_context *nv40;
-	int curie_class = 0, ret;
-
 	switch (chipset & 0xf0) {
 	case 0x40:
 		if (NV4X_GRCLASS4097_CHIPSETS & (1 << (chipset & 0x0f)))
@@ -218,65 +150,152 @@ nv40_create(struct pipe_winsys *pipe_winsys, struct nouveau_winsys *nvws,
 	}
 
 	if (!curie_class) {
-		NOUVEAU_ERR("Unknown NV4x chipset: NV%02x\n", chipset);
+		NOUVEAU_ERR("Unknown nv4x chipset: nv%02x\n", chipset);
 		return NULL;
 	}
 
-	nv40 = CALLOC_STRUCT(nv40_context);
-	if (!nv40)
+	cnv40 = calloc(1, sizeof(struct nv40_channel_context));
+	if (!cnv40)
 		return NULL;
-	nv40->chipset = chipset;
-	nv40->nvws = nvws;
+	cnv40->chipset = chipset;
+	cnv40->nvws = nvws;
 
 	/* Notifier for sync purposes */
-	ret = nvws->notifier_alloc(nvws, 1, &nv40->sync);
+	ret = nvws->notifier_alloc(nvws, 1, &cnv40->sync);
 	if (ret) {
 		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
-		nv40_destroy(&nv40->pipe);
+		nv40_channel_takedown(cnv40);
 		return NULL;
 	}
 
 	/* Query objects */
-	ret = nvws->notifier_alloc(nvws, 32, &nv40->query);
+	ret = nvws->notifier_alloc(nvws, 32, &cnv40->query);
 	if (ret) {
 		NOUVEAU_ERR("Error initialising query objects: %d\n", ret);
-		nv40_destroy(&nv40->pipe);
+		nv40_channel_takedown(cnv40);
 		return NULL;
 	}
 
-	ret = nvws->res_init(&nv40->query_heap, 0, 32);
+	ret = nvws->res_init(&cnv40->query_heap, 0, 32);
 	if (ret) {
 		NOUVEAU_ERR("Error initialising query object heap: %d\n", ret);
-		nv40_destroy(&nv40->pipe);
+		nv40_channel_takedown(cnv40);
 		return NULL;
 	}
 
 	/* Vtxprog resources */
-	if (nvws->res_init(&nv40->vertprog.exec_heap, 0, 512) ||
-	    nvws->res_init(&nv40->vertprog.data_heap, 0, 256)) {
-		nv40_destroy(&nv40->pipe);
+	if (nvws->res_init(&cnv40->vp_exec_heap, 0, 512) ||
+	    nvws->res_init(&cnv40->vp_data_heap, 0, 256)) {
+		nv40_channel_takedown(cnv40);
 		return NULL;
 	}
 
+	/* 3D object; on failure release everything allocated so far */
+	if ((ret = nvws->grobj_alloc(nvws, curie_class, &cnv40->curie))) {
+		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
+		nv40_channel_takedown(cnv40);
+		return NULL;
+	}
+
 	/* Static curie initialisation */
-	if (!nv40_init_hwctx(nv40, curie_class)) {
+	so = so_new(128, 0);
+	so_method(so, cnv40->curie, NV40TCL_DMA_NOTIFY, 1);
+	so_data  (so, cnv40->sync->handle);
+	so_method(so, cnv40->curie, NV40TCL_DMA_TEXTURE0, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->gart->handle);
+	so_method(so, cnv40->curie, NV40TCL_DMA_COLOR1, 1);
+	so_data  (so, nvws->channel->vram->handle);
+	so_method(so, cnv40->curie, NV40TCL_DMA_COLOR0, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->vram->handle);
+	so_method(so, cnv40->curie, NV40TCL_DMA_VTXBUF0, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->gart->handle);
+	so_method(so, cnv40->curie, NV40TCL_DMA_FENCE, 2);
+	so_data  (so, 0);
+	so_data  (so, cnv40->query->handle);
+	so_method(so, cnv40->curie, NV40TCL_DMA_UNK01AC, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->vram->handle);
+	so_method(so, cnv40->curie, NV40TCL_DMA_COLOR2, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->vram->handle);
+
+	so_method(so, cnv40->curie, 0x1ea4, 3);
+	so_data  (so, 0x00000010);
+	so_data  (so, 0x01000100);
+	so_data  (so, 0xff800006);
+
+	/* vtxprog output routing */
+	so_method(so, cnv40->curie, 0x1fc4, 1);
+	so_data  (so, 0x06144321);
+	so_method(so, cnv40->curie, 0x1fc8, 2);
+	so_data  (so, 0xedcba987);
+	so_data  (so, 0x00000021);
+	so_method(so, cnv40->curie, 0x1fd0, 1);
+	so_data  (so, 0x00171615);
+	so_method(so, cnv40->curie, 0x1fd4, 1);
+	so_data  (so, 0x001b1a19);
+
+	so_method(so, cnv40->curie, 0x1ef8, 1);
+	so_data  (so, 0x0020ffff);
+	so_method(so, cnv40->curie, 0x1d64, 1);
+	so_data  (so, 0x00d30000);
+	so_method(so, cnv40->curie, 0x1e94, 1);
+	so_data  (so, 0x00000001);
+
+	so_emit(nvws, so);
+	so_ref(NULL, &so);
+	nvws->push_flush(nvws->channel, 0);
+
+	return cnv40;
+}
+
+static void
+nv40_destroy(struct pipe_context *pipe)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	if (nv40->draw)
+		draw_destroy(nv40->draw);
+
+	if (nv40->hw) {
+		if (--nv40->hw->refcount == 0)
+			nv40_channel_takedown(nv40->hw);
+	}
+
+	free(nv40);
+}
+
+struct pipe_context *
+nv40_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws,
+	    unsigned chipset)
+{
+	struct nv40_context *nv40;
+
+	nv40 = calloc(1, sizeof(struct nv40_context));
+	if (!nv40)
+		return NULL;
+
+	nv40->hw = nv40_channel_init(ws, nvws, chipset);
+	if (!nv40->hw) {
 		nv40_destroy(&nv40->pipe);
 		return NULL;
 	}
 
-	/* Pipe context setup */
-	nv40->pipe.winsys = pipe_winsys;
+	nv40->chipset = chipset;
+	nv40->nvws = nvws;
 
+	nv40->pipe.winsys = ws;
 	nv40->pipe.destroy = nv40_destroy;
 	nv40->pipe.get_name = nv40_get_name;
 	nv40->pipe.get_vendor = nv40_get_vendor;
 	nv40->pipe.get_param = nv40_get_param;
 	nv40->pipe.get_paramf = nv40_get_paramf;
-
 	nv40->pipe.draw_arrays = nv40_draw_arrays;
 	nv40->pipe.draw_elements = nv40_draw_elements;
 	nv40->pipe.clear = nv40_clear;
-
 	nv40->pipe.flush = nv40_flush;
 
 	nv40_init_query_functions(nv40);
diff --git a/src/mesa/pipe/nv40/nv40_context.h b/src/mesa/pipe/nv40/nv40_context.h
index 4aa34847e8..d7c9ee7851 100644
--- a/src/mesa/pipe/nv40/nv40_context.h
+++ b/src/mesa/pipe/nv40/nv40_context.h
@@ -11,7 +11,7 @@
 #include "pipe/nouveau/nouveau_gldefs.h"
 
 #define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
-	struct nv40_context *ctx = nv40
+	struct nv40_channel_context *ctx = nv40->hw
 #include "pipe/nouveau/nouveau_push.h"
 #include "pipe/nouveau/nouveau_stateobj.h"
 
@@ -35,20 +35,34 @@
 #define NV40_NEW_FRAGPROG	(1 << 10)
 #define NV40_NEW_ARRAYS		(1 << 11)
 
-struct nv40_context {
-	struct pipe_context pipe;
+struct nv40_channel_context {
 	struct nouveau_winsys *nvws;
+	unsigned refcount;
 
-	struct draw_context *draw;
+	unsigned chipset;
 
-	int chipset;
+	/* HW graphics objects */
 	struct nouveau_grobj *curie;
 	struct nouveau_notifier *sync;
 
-	/* query objects */
+	/* Query object resources */
 	struct nouveau_notifier *query;
 	struct nouveau_resource *query_heap;
 
+	/* Vtxprog resources */
+	struct nouveau_resource *vp_exec_heap;
+	struct nouveau_resource *vp_data_heap;
+};
+
+struct nv40_context {
+	struct pipe_context pipe;
+	struct nouveau_winsys *nvws;
+
+	struct nv40_channel_context *hw;
+	struct draw_context *draw;
+
+	int chipset;
+
 	uint32_t dirty;
 
 	struct nv40_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS];
@@ -69,9 +83,6 @@ struct nv40_context {
 	struct nouveau_stateobj *so_stipple;
 
 	struct {
-		struct nouveau_resource *exec_heap;
-		struct nouveau_resource *data_heap;
-
 		struct nv40_vertex_program *active;
 
 		struct nv40_vertex_program *current;
diff --git a/src/mesa/pipe/nv40/nv40_fragprog.c b/src/mesa/pipe/nv40/nv40_fragprog.c
index 1165661010..cb7b4a5e70 100644
--- a/src/mesa/pipe/nv40/nv40_fragprog.c
+++ b/src/mesa/pipe/nv40/nv40_fragprog.c
@@ -817,11 +817,11 @@ nv40_fragprog_bind(struct nv40_context *nv40, struct nv40_fragment_program *fp)
 	}
 
 	so = so_new(4, 1);
-	so_method(so, nv40->curie, NV40TCL_FP_ADDRESS, 1);
+	so_method(so, nv40->hw->curie, NV40TCL_FP_ADDRESS, 1);
 	so_reloc (so, fp->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
 		  NV40TCL_FP_ADDRESS_DMA0, NV40TCL_FP_ADDRESS_DMA1);
-	so_method(so, nv40->curie, NV40TCL_FP_CONTROL, 1);
+	so_method(so, nv40->hw->curie, NV40TCL_FP_CONTROL, 1);
 	so_data  (so, fp->fp_control);
 
 	so_emit(nv40->nvws, so);
diff --git a/src/mesa/pipe/nv40/nv40_fragtex.c b/src/mesa/pipe/nv40/nv40_fragtex.c
index c87e361831..d278ce1897 100644
--- a/src/mesa/pipe/nv40/nv40_fragtex.c
+++ b/src/mesa/pipe/nv40/nv40_fragtex.c
@@ -104,7 +104,7 @@ nv40_fragtex_build(struct nv40_context *nv40, int unit)
 	txs = tf->swizzle;
 
 	so = so_new(16, 2);
-	so_method(so, nv40->curie, NV40TCL_TEX_OFFSET(unit), 8);
+	so_method(so, nv40->hw->curie, NV40TCL_TEX_OFFSET(unit), 8);
 	so_reloc (so, nv40mt->buffer, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
 	so_reloc (so, nv40mt->buffer, txf, tex_flags | NOUVEAU_BO_OR,
 		  NV40TCL_TEX_FORMAT_DMA0, NV40TCL_TEX_FORMAT_DMA1);
@@ -116,7 +116,7 @@ nv40_fragtex_build(struct nv40_context *nv40, int unit)
 	so_data  (so, (pt->width[0] << NV40TCL_TEX_SIZE0_W_SHIFT) |
 		       pt->height[0]);
 	so_data  (so, ps->bcol);
-	so_method(so, nv40->curie, NV40TCL_TEX_SIZE1(unit), 1);
+	so_method(so, nv40->hw->curie, NV40TCL_TEX_SIZE1(unit), 1);
 	so_data  (so, (pt->depth[0] << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp);
 
 	so_emit(nv40->nvws, so);
diff --git a/src/mesa/pipe/nv40/nv40_query.c b/src/mesa/pipe/nv40/nv40_query.c
index 06f41fe84f..eb305e6444 100644
--- a/src/mesa/pipe/nv40/nv40_query.c
+++ b/src/mesa/pipe/nv40/nv40_query.c
@@ -45,9 +45,9 @@ nv40_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 
 	assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
 
-	if (nv40->nvws->res_alloc(nv40->query_heap, 1, NULL, &q->object))
+	if (nv40->nvws->res_alloc(nv40->hw->query_heap, 1, NULL, &q->object))
 		assert(0);
-	nv40->nvws->notifier_reset(nv40->query, q->object->start);
+	nv40->nvws->notifier_reset(nv40->hw->query, q->object->start);
 
 	BEGIN_RING(curie, NV40TCL_QUERY_RESET, 1);
 	OUT_RING  (1);
@@ -82,16 +82,17 @@ nv40_query_result(struct pipe_context *pipe, struct pipe_query *pq,
 	if (!q->ready) {
 		unsigned status;
 
-		status = nvws->notifier_status(nv40->query, q->object->start);
+		status = nvws->notifier_status(nv40->hw->query,
+					       q->object->start);
 		if (status != NV_NOTIFY_STATE_STATUS_COMPLETED) {
 			if (wait == FALSE)
 				return FALSE;
-			nvws->notifier_wait(nv40->query, q->object->start,
+			nvws->notifier_wait(nv40->hw->query, q->object->start,
 					    NV_NOTIFY_STATE_STATUS_COMPLETED,
 					    0);
 		}
 
-		q->result = nvws->notifier_retval(nv40->query,
+		q->result = nvws->notifier_retval(nv40->hw->query,
 						  q->object->start);
 		q->ready = TRUE;
 		nvws->res_free(&q->object);
diff --git a/src/mesa/pipe/nv40/nv40_state.c b/src/mesa/pipe/nv40/nv40_state.c
index ab53b03cb0..80e94737ef 100644
--- a/src/mesa/pipe/nv40/nv40_state.c
+++ b/src/mesa/pipe/nv40/nv40_state.c
@@ -10,39 +10,40 @@ nv40_blend_state_create(struct pipe_context *pipe,
 			const struct pipe_blend_state *cso)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nouveau_grobj *curie = nv40->hw->curie;
 	struct nouveau_stateobj *so = so_new(16, 0);
 
 	if (cso->blend_enable) {
-		so_method(so, nv40->curie, NV40TCL_BLEND_ENABLE, 3);
+		so_method(so, curie, NV40TCL_BLEND_ENABLE, 3);
 		so_data  (so, 1);
 		so_data  (so, (nvgl_blend_func(cso->alpha_src_factor) << 16) |
 			       nvgl_blend_func(cso->rgb_src_factor));
 		so_data  (so, nvgl_blend_func(cso->alpha_dst_factor) << 16 |
 			      nvgl_blend_func(cso->rgb_dst_factor));
-		so_method(so, nv40->curie, NV40TCL_BLEND_EQUATION, 1);
+		so_method(so, curie, NV40TCL_BLEND_EQUATION, 1);
 		so_data  (so, nvgl_blend_eqn(cso->alpha_func) << 16 |
 			      nvgl_blend_eqn(cso->rgb_func));
 	} else {
-		so_method(so, nv40->curie, NV40TCL_BLEND_ENABLE, 1);
+		so_method(so, curie, NV40TCL_BLEND_ENABLE, 1);
 		so_data  (so, 0);
 	}
 
-	so_method(so, nv40->curie, NV40TCL_COLOR_MASK, 1);
+	so_method(so, curie, NV40TCL_COLOR_MASK, 1);
 	so_data  (so, (((cso->colormask & PIPE_MASK_A) ? (0x01 << 24) : 0) |
 		       ((cso->colormask & PIPE_MASK_R) ? (0x01 << 16) : 0) |
 		       ((cso->colormask & PIPE_MASK_G) ? (0x01 <<  8) : 0) |
 		       ((cso->colormask & PIPE_MASK_B) ? (0x01 <<  0) : 0)));
 
 	if (cso->logicop_enable) {
-		so_method(so, nv40->curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 2);
+		so_method(so, curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 2);
 		so_data  (so, 1);
 		so_data  (so, nvgl_logicop_func(cso->logicop_func));
 	} else {
-		so_method(so, nv40->curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 1);
+		so_method(so, curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 1);
 		so_data  (so, 0);
 	}
 
-	so_method(so, nv40->curie, NV40TCL_DITHER_ENABLE, 1);
+	so_method(so, curie, NV40TCL_DITHER_ENABLE, 1);
 	so_data  (so, cso->dither ? 1 : 0);
 
 	return (void *)so;
@@ -274,22 +275,22 @@ nv40_rasterizer_state_create(struct pipe_context *pipe,
 	 * 	offset_units / offset_scale
 	 */
 
-	so_method(so, nv40->curie, NV40TCL_SHADE_MODEL, 1);
+	so_method(so, nv40->hw->curie, NV40TCL_SHADE_MODEL, 1);
 	so_data  (so, cso->flatshade ? NV40TCL_SHADE_MODEL_FLAT :
 				       NV40TCL_SHADE_MODEL_SMOOTH);
 
-	so_method(so, nv40->curie, NV40TCL_LINE_WIDTH, 2);
+	so_method(so, nv40->hw->curie, NV40TCL_LINE_WIDTH, 2);
 	so_data  (so, (unsigned char)(cso->line_width * 8.0) & 0xff);
 	so_data  (so, cso->line_smooth ? 1 : 0);
-	so_method(so, nv40->curie, NV40TCL_LINE_STIPPLE_ENABLE, 2);
+	so_method(so, nv40->hw->curie, NV40TCL_LINE_STIPPLE_ENABLE, 2);
 	so_data  (so, cso->line_stipple_enable ? 1 : 0);
 	so_data  (so, (cso->line_stipple_pattern << 16) |
 		       cso->line_stipple_factor);
 
-	so_method(so, nv40->curie, NV40TCL_POINT_SIZE, 1);
+	so_method(so, nv40->hw->curie, NV40TCL_POINT_SIZE, 1);
 	so_data  (so, fui(cso->point_size));
 
-	so_method(so, nv40->curie, NV40TCL_POLYGON_MODE_FRONT, 6);
+	so_method(so, nv40->hw->curie, NV40TCL_POLYGON_MODE_FRONT, 6);
 	if (cso->front_winding == PIPE_WINDING_CCW) {
 		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
 		so_data(so, nvgl_polygon_mode(cso->fill_cw));
@@ -330,10 +331,10 @@ nv40_rasterizer_state_create(struct pipe_context *pipe,
 	so_data(so, cso->poly_smooth ? 1 : 0);
 	so_data(so, cso->cull_mode != PIPE_WINDING_NONE ? 1 : 0);
 
-	so_method(so, nv40->curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
+	so_method(so, nv40->hw->curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
 	so_data  (so, cso->poly_stipple_enable ? 1 : 0);
 
-	so_method(so, nv40->curie, NV40TCL_POINT_SPRITE, 1);
+	so_method(so, nv40->hw->curie, NV40TCL_POINT_SPRITE, 1);
 	if (cso->point_sprite) {
 		unsigned psctl = (1 << 0), i;
 
@@ -374,18 +375,18 @@ nv40_depth_stencil_alpha_state_create(struct pipe_context *pipe,
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nouveau_stateobj *so = so_new(32, 0);
 
-	so_method(so, nv40->curie, NV40TCL_DEPTH_FUNC, 3);
+	so_method(so, nv40->hw->curie, NV40TCL_DEPTH_FUNC, 3);
 	so_data  (so, nvgl_comparison_op(cso->depth.func));
 	so_data  (so, cso->depth.writemask ? 1 : 0);
 	so_data  (so, cso->depth.enabled ? 1 : 0);
 
-	so_method(so, nv40->curie, NV40TCL_ALPHA_TEST_ENABLE, 3);
+	so_method(so, nv40->hw->curie, NV40TCL_ALPHA_TEST_ENABLE, 3);
 	so_data  (so, cso->alpha.enabled ? 1 : 0);
 	so_data  (so, nvgl_comparison_op(cso->alpha.func));
 	so_data  (so, float_to_ubyte(cso->alpha.ref));
 
 	if (cso->stencil[0].enabled) {
-		so_method(so, nv40->curie, NV40TCL_STENCIL_FRONT_ENABLE, 8);
+		so_method(so, nv40->hw->curie, NV40TCL_STENCIL_FRONT_ENABLE, 8);
 		so_data  (so, cso->stencil[0].enabled ? 1 : 0);
 		so_data  (so, cso->stencil[0].write_mask);
 		so_data  (so, nvgl_comparison_op(cso->stencil[0].func));
@@ -395,12 +396,12 @@ nv40_depth_stencil_alpha_state_create(struct pipe_context *pipe,
 		so_data  (so, nvgl_stencil_op(cso->stencil[0].zfail_op));
 		so_data  (so, nvgl_stencil_op(cso->stencil[0].zpass_op));
 	} else {
-		so_method(so, nv40->curie, NV40TCL_STENCIL_FRONT_ENABLE, 1);
+		so_method(so, nv40->hw->curie, NV40TCL_STENCIL_FRONT_ENABLE, 1);
 		so_data  (so, 0);
 	}
 
 	if (cso->stencil[1].enabled) {
-		so_method(so, nv40->curie, NV40TCL_STENCIL_BACK_ENABLE, 8);
+		so_method(so, nv40->hw->curie, NV40TCL_STENCIL_BACK_ENABLE, 8);
 		so_data  (so, cso->stencil[1].enabled ? 1 : 0);
 		so_data  (so, cso->stencil[1].write_mask);
 		so_data  (so, nvgl_comparison_op(cso->stencil[1].func));
@@ -410,7 +411,7 @@ nv40_depth_stencil_alpha_state_create(struct pipe_context *pipe,
 		so_data  (so, nvgl_stencil_op(cso->stencil[1].zfail_op));
 		so_data  (so, nvgl_stencil_op(cso->stencil[1].zpass_op));
 	} else {
-		so_method(so, nv40->curie, NV40TCL_STENCIL_BACK_ENABLE, 1);
+		so_method(so, nv40->hw->curie, NV40TCL_STENCIL_BACK_ENABLE, 1);
 		so_data  (so, 0);
 	}
 
@@ -505,7 +506,7 @@ nv40_set_blend_color(struct pipe_context *pipe,
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nouveau_stateobj *so = so_new(2, 0);
 
-	so_method(so, nv40->curie, NV40TCL_BLEND_COLOR, 1);
+	so_method(so, nv40->hw->curie, NV40TCL_BLEND_COLOR, 1);
 	so_data  (so, ((float_to_ubyte(bcol->color[3]) << 24) |
 		       (float_to_ubyte(bcol->color[0]) << 16) |
 		       (float_to_ubyte(bcol->color[1]) <<  8) |
@@ -611,73 +612,73 @@ nv40_set_framebuffer_state(struct pipe_context *pipe,
 	}
 
 	if (rt_enable & NV40TCL_RT_ENABLE_COLOR0) {
-		so_method(so, nv40->curie, NV40TCL_DMA_COLOR0, 1);
+		so_method(so, nv40->hw->curie, NV40TCL_DMA_COLOR0, 1);
 		so_reloc (so, rt[0]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
 			  nv40->nvws->channel->vram->handle,
 			  nv40->nvws->channel->gart->handle);
-		so_method(so, nv40->curie, NV40TCL_COLOR0_PITCH, 2);
+		so_method(so, nv40->hw->curie, NV40TCL_COLOR0_PITCH, 2);
 		so_data  (so, rt[0]->pitch * rt[0]->cpp);
 		so_reloc (so, rt[0]->buffer, rt[0]->offset, rt_flags |
 			  NOUVEAU_BO_LOW, 0, 0);
 	}
 
 	if (rt_enable & NV40TCL_RT_ENABLE_COLOR1) {
-		so_method(so, nv40->curie, NV40TCL_DMA_COLOR1, 1);
+		so_method(so, nv40->hw->curie, NV40TCL_DMA_COLOR1, 1);
 		so_reloc (so, rt[1]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
 			  nv40->nvws->channel->vram->handle,
 			  nv40->nvws->channel->gart->handle);
-		so_method(so, nv40->curie, NV40TCL_COLOR1_OFFSET, 2);
+		so_method(so, nv40->hw->curie, NV40TCL_COLOR1_OFFSET, 2);
 		so_reloc (so, rt[1]->buffer, rt[1]->offset, rt_flags |
 			  NOUVEAU_BO_LOW, 0, 0);
 		so_data  (so, rt[1]->pitch * rt[1]->cpp);
 	}
 
 	if (rt_enable & NV40TCL_RT_ENABLE_COLOR2) {
-		so_method(so, nv40->curie, NV40TCL_DMA_COLOR2, 1);
+		so_method(so, nv40->hw->curie, NV40TCL_DMA_COLOR2, 1);
 		so_reloc (so, rt[2]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
 			  nv40->nvws->channel->vram->handle,
 			  nv40->nvws->channel->gart->handle);
-		so_method(so, nv40->curie, NV40TCL_COLOR2_OFFSET, 1);
+		so_method(so, nv40->hw->curie, NV40TCL_COLOR2_OFFSET, 1);
 		so_reloc (so, rt[2]->buffer, rt[2]->offset, rt_flags |
 			  NOUVEAU_BO_LOW, 0, 0);
-		so_method(so, nv40->curie, NV40TCL_COLOR2_PITCH, 1);
+		so_method(so, nv40->hw->curie, NV40TCL_COLOR2_PITCH, 1);
 		so_data  (so, rt[2]->pitch * rt[2]->cpp);
 	}
 
 	if (rt_enable & NV40TCL_RT_ENABLE_COLOR3) {
-		so_method(so, nv40->curie, NV40TCL_DMA_COLOR3, 1);
+		so_method(so, nv40->hw->curie, NV40TCL_DMA_COLOR3, 1);
 		so_reloc (so, rt[3]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
 			  nv40->nvws->channel->vram->handle,
 			  nv40->nvws->channel->gart->handle);
-		so_method(so, nv40->curie, NV40TCL_COLOR3_OFFSET, 1);
+		so_method(so, nv40->hw->curie, NV40TCL_COLOR3_OFFSET, 1);
 		so_reloc (so, rt[3]->buffer, rt[3]->offset, rt_flags |
 			  NOUVEAU_BO_LOW, 0, 0);
-		so_method(so, nv40->curie, NV40TCL_COLOR3_PITCH, 1);
+		so_method(so, nv40->hw->curie, NV40TCL_COLOR3_PITCH, 1);
 		so_data  (so, rt[3]->pitch * rt[3]->cpp);
 	}
 
 	if (zeta_format) {
-		so_method(so, nv40->curie, NV40TCL_DMA_ZETA, 1);
+		so_method(so, nv40->hw->curie, NV40TCL_DMA_ZETA, 1);
 		so_reloc (so, zeta->buffer, 0, rt_flags | NOUVEAU_BO_OR,
 			  nv40->nvws->channel->vram->handle,
 			  nv40->nvws->channel->gart->handle);
-		so_method(so, nv40->curie, NV40TCL_ZETA_OFFSET, 1);
+		so_method(so, nv40->hw->curie, NV40TCL_ZETA_OFFSET, 1);
 		so_reloc (so, zeta->buffer, zeta->offset, rt_flags |
 			  NOUVEAU_BO_LOW, 0, 0);
-		so_method(so, nv40->curie, NV40TCL_ZETA_PITCH, 1);
+		so_method(so, nv40->hw->curie, NV40TCL_ZETA_PITCH, 1);
 		so_data  (so, zeta->pitch * zeta->cpp);
 	}
 
-	so_method(so, nv40->curie, NV40TCL_RT_ENABLE, 1);
+	so_method(so, nv40->hw->curie, NV40TCL_RT_ENABLE, 1);
 	so_data  (so, rt_enable);
-	so_method(so, nv40->curie, NV40TCL_RT_HORIZ, 3);
+	so_method(so, nv40->hw->curie, NV40TCL_RT_HORIZ, 3);
 	so_data  (so, (w << 16) | 0);
 	so_data  (so, (h << 16) | 0);
 	so_data  (so, rt_format);
-	so_method(so, nv40->curie, NV40TCL_VIEWPORT_HORIZ, 2);
+	so_method(so, nv40->hw->curie, NV40TCL_VIEWPORT_HORIZ, 2);
 	so_data  (so, (w << 16) | 0);
 	so_data  (so, (h << 16) | 0);
-	so_method(so, nv40->curie, NV40TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	so_method(so, nv40->hw->curie, NV40TCL_VIEWPORT_CLIP_HORIZ(0), 2);
 	so_data  (so, ((w - 1) << 16) | 0);
 	so_data  (so, ((h - 1) << 16) | 0);
 
@@ -694,7 +695,7 @@ nv40_set_polygon_stipple(struct pipe_context *pipe,
 	struct nouveau_stateobj *so = so_new(33, 0);
 	unsigned i;
 
-	so_method(so, nv40->curie, NV40TCL_POLYGON_STIPPLE_PATTERN(0), 32);
+	so_method(so, nv40->hw->curie, NV40TCL_POLYGON_STIPPLE_PATTERN(0), 32);
 	for (i = 0; i < 32; i++)
 		so_data(so, stipple->stipple[i]);
 
@@ -710,7 +711,7 @@ nv40_set_scissor_state(struct pipe_context *pipe,
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nouveau_stateobj *so = so_new(3, 0);
 
-	so_method(so, nv40->curie, NV40TCL_SCISSOR_HORIZ, 2);
+	so_method(so, nv40->hw->curie, NV40TCL_SCISSOR_HORIZ, 2);
 	so_data  (so, ((s->maxx - s->minx) << 16) | s->minx);
 	so_data  (so, ((s->maxy - s->miny) << 16) | s->miny);
 
@@ -726,7 +727,7 @@ nv40_set_viewport_state(struct pipe_context *pipe,
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nouveau_stateobj *so = so_new(9, 0);
 
-	so_method(so, nv40->curie, NV40TCL_VIEWPORT_TRANSLATE_X, 8);
+	so_method(so, nv40->hw->curie, NV40TCL_VIEWPORT_TRANSLATE_X, 8);
 	so_data  (so, fui(vpt->translate[0]));
 	so_data  (so, fui(vpt->translate[1]));
 	so_data  (so, fui(vpt->translate[2]));
diff --git a/src/mesa/pipe/nv40/nv40_vbo.c b/src/mesa/pipe/nv40/nv40_vbo.c
index fd1d884193..552058b3ae 100644
--- a/src/mesa/pipe/nv40/nv40_vbo.c
+++ b/src/mesa/pipe/nv40/nv40_vbo.c
@@ -115,9 +115,9 @@ nv40_vbo_arrays_update(struct nv40_context *nv40, struct pipe_buffer *ib,
 	num_hw++;
 
 	vtxbuf = so_new(20, 18);
-	so_method(vtxbuf, nv40->curie, NV40TCL_VTXBUF_ADDRESS(0), num_hw);
+	so_method(vtxbuf, nv40->hw->curie, NV40TCL_VTXBUF_ADDRESS(0), num_hw);
 	vtxfmt = so_new(17, 0);
-	so_method(vtxfmt, nv40->curie, NV40TCL_VTXFMT(0), num_hw);
+	so_method(vtxfmt, nv40->hw->curie, NV40TCL_VTXFMT(0), num_hw);
 
 	inputs = vp->ir;
 	for (hw = 0; hw < num_hw; hw++) {
@@ -149,7 +149,7 @@ nv40_vbo_arrays_update(struct nv40_context *nv40, struct pipe_buffer *ib,
 	}
 
 	if (ib) {
-		so_method(vtxbuf, nv40->curie, NV40TCL_IDXBUF_ADDRESS, 2);
+		so_method(vtxbuf, nv40->hw->curie, NV40TCL_IDXBUF_ADDRESS, 2);
 		so_reloc (vtxbuf, ib, 0, vb_flags | NOUVEAU_BO_LOW, 0, 0);
 		so_reloc (vtxbuf, ib, ib_format, vb_flags | NOUVEAU_BO_OR,
 			  0, NV40TCL_IDXBUF_FORMAT_DMA1);
@@ -390,7 +390,7 @@ nv40_draw_elements(struct pipe_context *pipe,
 	/* 0x4497 doesn't support real index buffers, and there doesn't appear
 	 * to be support on any chipset for 8-bit indices.
 	 */
-	if (nv40->curie->grclass == NV44TCL || indexSize == 1) {
+	if (nv40->hw->curie->grclass == NV44TCL || indexSize == 1) {
 		nv40_draw_elements_inline(pipe, indexBuffer, indexSize,
 					  mode, start, count);
 	} else {
diff --git a/src/mesa/pipe/nv40/nv40_vertprog.c b/src/mesa/pipe/nv40/nv40_vertprog.c
index e15ddbbcde..415b3c70c7 100644
--- a/src/mesa/pipe/nv40/nv40_vertprog.c
+++ b/src/mesa/pipe/nv40/nv40_vertprog.c
@@ -648,7 +648,7 @@ nv40_vertprog_bind(struct nv40_context *nv40, struct nv40_vertex_program *vp)
 
 	/* Allocate hw vtxprog exec slots */
 	if (!vp->exec) {
-		struct nouveau_resource *heap = nv40->vertprog.exec_heap;
+		struct nouveau_resource *heap = nv40->hw->vp_exec_heap;
 		uint vplen = vp->nr_insns;
 
 		if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) {
@@ -668,7 +668,7 @@ nv40_vertprog_bind(struct nv40_context *nv40, struct nv40_vertex_program *vp)
 
 	/* Allocate hw vtxprog const slots */
 	if (vp->nr_consts && !vp->data) {
-		struct nouveau_resource *heap = nv40->vertprog.data_heap;
+		struct nouveau_resource *heap = nv40->hw->vp_data_heap;
 
 		if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) {
 			while (heap->next && heap->size < vp->nr_consts) {
-- 
cgit v1.2.3


From ce358b879c1823ab4fa04f56e208d679792667cc Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Sun, 10 Feb 2008 15:19:18 +1100
Subject: nouveau: memory leak

---
 src/mesa/drivers/dri/nouveau_winsys/nouveau_winsys_pipe.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/drivers/dri/nouveau_winsys/nouveau_winsys_pipe.c b/src/mesa/drivers/dri/nouveau_winsys/nouveau_winsys_pipe.c
index 7d7fefa801..e1a9271395 100644
--- a/src/mesa/drivers/dri/nouveau_winsys/nouveau_winsys_pipe.c
+++ b/src/mesa/drivers/dri/nouveau_winsys/nouveau_winsys_pipe.c
@@ -136,6 +136,7 @@ nouveau_pipe_bo_del(struct pipe_winsys *ws, struct pipe_buffer *buf)
 	struct nouveau_pipe_buffer *nvbuf = nouveau_buffer(buf);
 
 	nouveau_bo_del(&nvbuf->bo);
+	free(nvbuf);
 }
 
 static void *
-- 
cgit v1.2.3


From 474f1a1d56fbb5472dd9bbf5828c413ae7e629dd Mon Sep 17 00:00:00 2001
From: Zack Rusin <zack@tungstengraphics.com>
Date: Fri, 25 Jan 2008 06:36:35 -0500
Subject: add a stub of a lowering pass

---
 src/mesa/pipe/llvm/Makefile         |  1 +
 src/mesa/pipe/llvm/gallivm.cpp      |  2 ++
 src/mesa/pipe/llvm/loweringpass.cpp | 17 +++++++++++++++++
 src/mesa/pipe/llvm/loweringpass.h   | 15 +++++++++++++++
 4 files changed, 35 insertions(+)
 create mode 100644 src/mesa/pipe/llvm/loweringpass.cpp
 create mode 100644 src/mesa/pipe/llvm/loweringpass.h

diff --git a/src/mesa/pipe/llvm/Makefile b/src/mesa/pipe/llvm/Makefile
index f655fb8340..1e3ae988df 100644
--- a/src/mesa/pipe/llvm/Makefile
+++ b/src/mesa/pipe/llvm/Makefile
@@ -8,6 +8,7 @@ LIBNAME = gallivm
 GALLIVM_SOURCES = \
         gallivm.cpp  \
         instructions.cpp  \
+        loweringpass.cpp \
         storage.cpp
 
 INC_SOURCES = gallivm_builtins.cpp llvm_base_shader.cpp
diff --git a/src/mesa/pipe/llvm/gallivm.cpp b/src/mesa/pipe/llvm/gallivm.cpp
index afa1446890..46e11c185a 100644
--- a/src/mesa/pipe/llvm/gallivm.cpp
+++ b/src/mesa/pipe/llvm/gallivm.cpp
@@ -34,6 +34,7 @@
 #include "gallivm.h"
 
 #include "instructions.h"
+#include "loweringpass.h"
 #include "storage.h"
 
 #include "pipe/p_context.h"
@@ -95,6 +96,7 @@ using namespace llvm;
 static int GLOBAL_ID = 0;
 
 static inline void AddStandardCompilePasses(PassManager &PM) {
+   PM.add(new LoweringPass());
    PM.add(createVerifierPass());                  // Verify that input is correct
 
    PM.add(createLowerSetJmpPass());          // Lower llvm.setjmp/.longjmp
diff --git a/src/mesa/pipe/llvm/loweringpass.cpp b/src/mesa/pipe/llvm/loweringpass.cpp
new file mode 100644
index 0000000000..556dbec366
--- /dev/null
+++ b/src/mesa/pipe/llvm/loweringpass.cpp
@@ -0,0 +1,17 @@
+#include "loweringpass.h"
+
+using namespace llvm;
+/* Pass identity plus registration under the name "lowering"; X is static so this generically named object stays private to the file. */
+char LoweringPass::ID = 0;
+static RegisterPass<LoweringPass> X("lowering", "Lowering Pass");
+
+LoweringPass::LoweringPass()
+   :  ModulePass((intptr_t)&ID)   /* the address of ID, not its value, is what gets handed to the base class */
+{
+}
+/* Stub body: prints the module identifier to llvm::cerr and changes nothing in the module. */
+bool LoweringPass::runOnModule(Module &m)
+{
+   llvm::cerr << "Hello: " << m.getModuleIdentifier() << "\n";
+   return false;
+}
diff --git a/src/mesa/pipe/llvm/loweringpass.h b/src/mesa/pipe/llvm/loweringpass.h
new file mode 100644
index 0000000000..f62dcf6ba7
--- /dev/null
+++ b/src/mesa/pipe/llvm/loweringpass.h
@@ -0,0 +1,15 @@
+#ifndef LOWERINGPASS_H
+#define LOWERINGPASS_H
+
+#include "llvm/Pass.h"
+#include "llvm/Module.h"
+/* Module-level pass derived from llvm::ModulePass; it declares no data of its own apart from the static ID. */
+struct LoweringPass : public llvm::ModulePass
+{
+   static char ID;   /* defined in loweringpass.cpp, whose constructor passes &ID to ModulePass */
+   LoweringPass();
+   /* Runs on a whole llvm::Module. NOTE(review): presumably the result reports whether the module was modified - confirm against the LLVM pass docs. */
+   virtual bool runOnModule(llvm::Module &m);
+};
+
+#endif
-- 
cgit v1.2.3


From 716206c190b1b1408c09807671d28dcc8906f855 Mon Sep 17 00:00:00 2001
From: Zack Rusin <zack@tungstengraphics.com>
Date: Mon, 4 Feb 2008 10:07:02 -0500
Subject: rewrite the way we handle ir in llvm code

introduce intermediate step gallivm_ir before compiling it
down to the final llvm ir.
---
 src/mesa/pipe/cell/ppu/cell_state_fs.c  |   2 +-
 src/mesa/pipe/draw/draw_vertex_shader.c |  27 +-
 src/mesa/pipe/llvm/Makefile             |   1 +
 src/mesa/pipe/llvm/gallivm.cpp          | 791 +++-----------------------------
 src/mesa/pipe/llvm/gallivm.h            |  23 +-
 src/mesa/pipe/llvm/gallivm_p.h          |  56 +++
 src/mesa/pipe/llvm/llvm_base_shader.cpp | 435 ++++++++----------
 src/mesa/pipe/llvm/llvm_entry.c         |  14 +-
 src/mesa/pipe/llvm/tgsitollvm.cpp       | 682 +++++++++++++++++++++++++++
 src/mesa/pipe/llvm/tgsitollvm.h         |  16 +
 src/mesa/pipe/softpipe/sp_state_fs.c    |   2 +-
 11 files changed, 1079 insertions(+), 970 deletions(-)
 create mode 100644 src/mesa/pipe/llvm/gallivm_p.h
 create mode 100644 src/mesa/pipe/llvm/tgsitollvm.cpp
 create mode 100644 src/mesa/pipe/llvm/tgsitollvm.h

diff --git a/src/mesa/pipe/cell/ppu/cell_state_fs.c b/src/mesa/pipe/cell/ppu/cell_state_fs.c
index 96a52273b0..3f46a87d18 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_fs.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_fs.c
@@ -67,7 +67,7 @@ cell_create_fs_state(struct pipe_context *pipe,
 #endif
 
 #ifdef MESA_LLVM
-   state->llvm_prog = gallivm_from_tgsi(state->shader.tokens, GALLIVM_FS);
+   state->llvm_prog = 0;
    if (!gallivm_global_cpu_engine()) {
       gallivm_cpu_engine_create(state->llvm_prog);
    }
diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index 5ca93aa615..9567283ff5 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -113,7 +113,16 @@ run_vertex_program(struct draw_context *draw,
    draw->vertex_fetch.fetch_func( draw, machine, elts, count );
 
    /* run shader */
-#if defined(__i386__) || defined(__386__)
+#ifdef MESA_LLVM
+   if (1) {
+   struct gallivm_prog  *prog  = draw->vertex_shader->llvm_prog;
+   gallivm_prog_exec(prog,
+                     machine->Inputs,
+                     machine->Outputs,
+                     machine->Consts,
+                     12, 12, 12);
+   } else
+#elif defined(__i386__) || defined(__386__)
    if (draw->use_sse) {
       /* SSE */
       /* cast away const */
@@ -212,13 +221,7 @@ draw_vertex_shader_queue_flush(struct draw_context *draw)
     */
    draw_update_vertex_fetch( draw );
 
-//   debug_printf( " q(%d) ", draw->vs.queue_nr );
-#ifdef MESA_LLVM
-   if (draw->vertex_shader->llvm_prog) {
-      draw_vertex_shader_queue_flush_llvm(draw);
-      return;
-   }
-#endif
+//   fprintf(stderr, " q(%d) ", draw->vs.queue_nr );
 
    /* run vertex shader on vertex cache entries, four per invokation */
    for (i = 0; i < draw->vs.queue_nr; i += 4) {
@@ -260,7 +263,13 @@ draw_create_vertex_shader(struct draw_context *draw,
    vs->state = shader;
 
 #ifdef MESA_LLVM
-   vs->llvm_prog = gallivm_from_tgsi(shader->tokens, GALLIVM_VS);
+   struct gallivm_ir *ir = gallivm_ir_new(GALLIVM_VS);
+   gallivm_ir_set_layout(ir, GALLIVM_SOA);
+   gallivm_ir_set_components(ir, 4);
+   gallivm_ir_fill_from_tgsi(ir, shader->tokens);
+   vs->llvm_prog = gallivm_ir_compile(ir);
+   gallivm_ir_delete(ir);
+
    draw->engine = gallivm_global_cpu_engine();
    if (!draw->engine) {
       draw->engine = gallivm_cpu_engine_create(vs->llvm_prog);
diff --git a/src/mesa/pipe/llvm/Makefile b/src/mesa/pipe/llvm/Makefile
index 1e3ae988df..10ff7aacae 100644
--- a/src/mesa/pipe/llvm/Makefile
+++ b/src/mesa/pipe/llvm/Makefile
@@ -9,6 +9,7 @@ GALLIVM_SOURCES = \
         gallivm.cpp  \
         instructions.cpp  \
         loweringpass.cpp \
+        tgsitollvm.cpp \
         storage.cpp
 
 INC_SOURCES = gallivm_builtins.cpp llvm_base_shader.cpp
diff --git a/src/mesa/pipe/llvm/gallivm.cpp b/src/mesa/pipe/llvm/gallivm.cpp
index 46e11c185a..cf9b0f6406 100644
--- a/src/mesa/pipe/llvm/gallivm.cpp
+++ b/src/mesa/pipe/llvm/gallivm.cpp
@@ -32,17 +32,17 @@
 #ifdef MESA_LLVM
 
 #include "gallivm.h"
+#include "gallivm_p.h"
 
 #include "instructions.h"
 #include "loweringpass.h"
 #include "storage.h"
+#include "tgsitollvm.h"
 
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
+
 #include "pipe/tgsi/exec/tgsi_exec.h"
-#include "pipe/tgsi/util/tgsi_util.h"
-#include "pipe/tgsi/util/tgsi_build.h"
 #include "pipe/tgsi/util/tgsi_dump.h"
 
 #include <llvm/Module.h>
@@ -64,38 +64,23 @@
 #include <llvm/Analysis/LoopPass.h>
 #include <llvm/Target/TargetData.h>
 #include <llvm/Bitcode/ReaderWriter.h>
+#include <llvm/Transforms/Utils/Cloning.h>
 
 #include <sstream>
 #include <fstream>
 #include <iostream>
 
-struct gallivm_interpolate {
-   int attrib;
-   int chan;
-   int type;
-};
-
-struct gallivm_prog {
-   llvm::Module *module;
-   void *function;
-   int   num_consts;
-   int   id;
-   enum gallivm_shader_type type;
-
-   struct gallivm_interpolate interpolators[32*4]; //FIXME: this might not be enough for some shaders
-   int   num_interp;
-};
-
 struct gallivm_cpu_engine {
    llvm::ExecutionEngine *engine;
 };
 
-using namespace llvm;
-#include "llvm_base_shader.cpp"
-
 static int GLOBAL_ID = 0;
 
-static inline void AddStandardCompilePasses(PassManager &PM) {
+using namespace llvm;
+
+static inline
+void AddStandardCompilePasses(PassManager &PM)
+{
    PM.add(new LoweringPass());
    PM.add(createVerifierPass());                  // Verify that input is correct
 
@@ -152,691 +137,16 @@ static inline void AddStandardCompilePasses(PassManager &PM) {
    PM.add(createConstantMergePass());        // Merge dup global constants
 }
 
-static inline void
-add_interpolator(struct gallivm_prog *prog,
-                 struct gallivm_interpolate *interp)
-{
-   prog->interpolators[prog->num_interp] = *interp;
-   ++prog->num_interp;
-}
-
-static void
-translate_declaration(struct gallivm_prog *prog,
-                      llvm::Module *module,
-                      Storage *storage,
-                      struct tgsi_full_declaration *decl,
-                      struct tgsi_full_declaration *fd)
-{
-   if (decl->Declaration.File == TGSI_FILE_INPUT) {
-      unsigned first, last, mask;
-      uint interp_method;
-
-      assert(decl->Declaration.Declare == TGSI_DECLARE_RANGE);
-
-      first = decl->u.DeclarationRange.First;
-      last = decl->u.DeclarationRange.Last;
-      mask = decl->Declaration.UsageMask;
-
-      /* Do not touch WPOS.xy */
-      if (first == 0) {
-         mask &= ~TGSI_WRITEMASK_XY;
-         if (mask == TGSI_WRITEMASK_NONE) {
-            first++;
-            if (first > last) {
-               return;
-            }
-         }
-      }
-
-      interp_method = decl->Interpolation.Interpolate;
-
-      if (mask == TGSI_WRITEMASK_XYZW) {
-         unsigned i, j;
-
-         for (i = first; i <= last; i++) {
-            for (j = 0; j < NUM_CHANNELS; j++) {
-               //interp( mach, i, j );
-               struct gallivm_interpolate interp;
-               interp.type = interp_method;
-               interp.attrib = i;
-               interp.chan = j;
-               add_interpolator(prog, &interp);
-            }
-         }
-      } else {
-         unsigned i, j;
-         for( j = 0; j < NUM_CHANNELS; j++ ) {
-            if( mask & (1 << j) ) {
-               for( i = first; i <= last; i++ ) {
-                  struct gallivm_interpolate interp;
-                  interp.type = interp_method;
-                  interp.attrib = i;
-                  interp.chan = j;
-                  add_interpolator(prog, &interp);
-               }
-            }
-         }
-      }
-   }
-}
-
-
-static void
-translate_immediate(Storage *storage,
-                    struct tgsi_full_immediate *imm)
-{
-   float vec[4];
-   int i;
-   for (i = 0; i < imm->Immediate.Size - 1; ++i) {
-      switch( imm->Immediate.DataType ) {
-      case TGSI_IMM_FLOAT32:
-         vec[i] = imm->u.ImmediateFloat32[i].Float;
-         break;
-      default:
-         assert( 0 );
-      }
-   }
-   storage->addImmediate(vec);
-}
-
-static inline llvm::Value *
-swizzleVector(llvm::Value *val, struct tgsi_full_src_register *src,
-              Storage *storage)
-{
-   int swizzle = 0;
-   int start = 1000;
-   const int NO_SWIZZLE = TGSI_SWIZZLE_X * 1000 + TGSI_SWIZZLE_Y * 100 +
-                          TGSI_SWIZZLE_Z * 10 + TGSI_SWIZZLE_W;
-   for (int k = 0; k < 4; ++k) {
-      swizzle += tgsi_util_get_full_src_register_extswizzle(src, k) * start;
-      start /= 10;
-   }
-   if (swizzle != NO_SWIZZLE) {
-      /*fprintf(stderr, "XXXXXXXX swizzle = %d\n", swizzle);*/
-      val = storage->shuffleVector(val, swizzle);
-   }
-   return val;
-}
-
-static void
-translate_instruction(llvm::Module *module,
-                      Storage *storage,
-                      Instructions *instr,
-                      struct tgsi_full_instruction *inst,
-                      struct tgsi_full_instruction *fi,
-                      unsigned instno)
-{
-   llvm::Value *inputs[4];
-   inputs[0] = 0;
-   inputs[1] = 0;
-   inputs[2] = 0;
-   inputs[3] = 0;
-
-   for (int i = 0; i < inst->Instruction.NumSrcRegs; ++i) {
-      struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i];
-      llvm::Value *val = 0;
-      llvm::Value *indIdx = 0;
-
-      if (src->SrcRegister.Indirect) {
-         indIdx = storage->addrElement(src->SrcRegisterInd.Index);
-         indIdx = storage->extractIndex(indIdx);
-      }
-      if (src->SrcRegister.File == TGSI_FILE_CONSTANT) {
-         val = storage->constElement(src->SrcRegister.Index, indIdx);
-      } else if (src->SrcRegister.File == TGSI_FILE_INPUT) {
-         val = storage->inputElement(src->SrcRegister.Index, indIdx);
-      } else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) {
-         val = storage->tempElement(src->SrcRegister.Index);
-      } else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) {
-         val = storage->outputElement(src->SrcRegister.Index, indIdx);
-      } else if (src->SrcRegister.File == TGSI_FILE_IMMEDIATE) {
-         val = storage->immediateElement(src->SrcRegister.Index);
-      } else {
-         fprintf(stderr, "ERROR: not supported llvm source %d\n", src->SrcRegister.File);
-         return;
-      }
-
-      inputs[i] = swizzleVector(val, src, storage);
-   }
-
-   /*if (inputs[0])
-     instr->printVector(inputs[0]);
-     if (inputs[1])
-     instr->printVector(inputs[1]);*/
-   llvm::Value *out = 0;
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_ARL: {
-      out = instr->arl(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_MOV: {
-      out = inputs[0];
-   }
-      break;
-   case TGSI_OPCODE_LIT: {
-      out = instr->lit(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_RCP: {
-      out = instr->rcp(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_RSQ: {
-      out = instr->rsq(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_EXP:
-      break;
-   case TGSI_OPCODE_LOG:
-      break;
-   case TGSI_OPCODE_MUL: {
-      out = instr->mul(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_ADD: {
-      out = instr->add(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_DP3: {
-      out = instr->dp3(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_DP4: {
-      out = instr->dp4(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_DST: {
-      out = instr->dst(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_MIN: {
-      out = instr->min(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_MAX: {
-      out = instr->max(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_SLT: {
-      out = instr->slt(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_SGE: {
-      out = instr->sge(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_MAD: {
-      out = instr->madd(inputs[0], inputs[1], inputs[2]);
-   }
-      break;
-   case TGSI_OPCODE_SUB: {
-      out = instr->sub(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_LERP: {
-      out = instr->lerp(inputs[0], inputs[1], inputs[2]);
-   }
-      break;
-   case TGSI_OPCODE_CND:
-      break;
-   case TGSI_OPCODE_CND0:
-      break;
-   case TGSI_OPCODE_DOT2ADD:
-      break;
-   case TGSI_OPCODE_INDEX:
-      break;
-   case TGSI_OPCODE_NEGATE:
-      break;
-   case TGSI_OPCODE_FRAC: {
-      out = instr->frc(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_CLAMP:
-      break;
-   case TGSI_OPCODE_FLOOR: {
-      out = instr->floor(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_ROUND:
-      break;
-   case TGSI_OPCODE_EXPBASE2: {
-      out = instr->ex2(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_LOGBASE2: {
-      out = instr->lg2(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_POWER: {
-      out = instr->pow(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_CROSSPRODUCT: {
-      out = instr->cross(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_MULTIPLYMATRIX:
-      break;
-   case TGSI_OPCODE_ABS: {
-      out = instr->abs(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_RCC:
-      break;
-   case TGSI_OPCODE_DPH: {
-      out = instr->dph(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_COS: {
-      out = instr->cos(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_DDX:
-      break;
-   case TGSI_OPCODE_DDY:
-      break;
-   case TGSI_OPCODE_KILP: {
-      out = instr->kilp(inputs[0]);
-      storage->setKilElement(out);
-      return;
-   }
-      break;
-   case TGSI_OPCODE_PK2H:
-      break;
-   case TGSI_OPCODE_PK2US:
-      break;
-   case TGSI_OPCODE_PK4B:
-      break;
-   case TGSI_OPCODE_PK4UB:
-      break;
-   case TGSI_OPCODE_RFL:
-      break;
-   case TGSI_OPCODE_SEQ:
-      break;
-   case TGSI_OPCODE_SFL:
-      break;
-   case TGSI_OPCODE_SGT: {
-      out = instr->sgt(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_SIN: {
-      out = instr->sin(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_SLE:
-      break;
-   case TGSI_OPCODE_SNE:
-      break;
-   case TGSI_OPCODE_STR:
-      break;
-   case TGSI_OPCODE_TEX:
-      break;
-   case TGSI_OPCODE_TXD:
-      break;
-   case TGSI_OPCODE_UP2H:
-      break;
-   case TGSI_OPCODE_UP2US:
-      break;
-   case TGSI_OPCODE_UP4B:
-      break;
-   case TGSI_OPCODE_UP4UB:
-      break;
-   case TGSI_OPCODE_X2D:
-      break;
-   case TGSI_OPCODE_ARA:
-      break;
-   case TGSI_OPCODE_ARR:
-      break;
-   case TGSI_OPCODE_BRA:
-      break;
-   case TGSI_OPCODE_CAL: {
-      instr->cal(inst->InstructionExtLabel.Label, storage->inputPtr());
-      return;
-   }
-      break;
-   case TGSI_OPCODE_RET: {
-      instr->end();
-      return;
-   }
-      break;
-   case TGSI_OPCODE_SSG:
-      break;
-   case TGSI_OPCODE_CMP: {
-      out = instr->cmp(inputs[0], inputs[1], inputs[2]);
-   }
-      break;
-   case TGSI_OPCODE_SCS: {
-      out = instr->scs(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_TXB:
-      break;
-   case TGSI_OPCODE_NRM:
-      break;
-   case TGSI_OPCODE_DIV:
-      break;
-   case TGSI_OPCODE_DP2:
-      break;
-   case TGSI_OPCODE_TXL:
-      break;
-   case TGSI_OPCODE_BRK: {
-      instr->brk();
-      return;
-   }
-      break;
-   case TGSI_OPCODE_IF: {
-      instr->ifop(inputs[0]);
-      storage->setCurrentBlock(instr->currentBlock());
-      return;  //just update the state
-   }
-      break;
-   case TGSI_OPCODE_LOOP:
-      break;
-   case TGSI_OPCODE_REP:
-      break;
-   case TGSI_OPCODE_ELSE: {
-      instr->elseop();
-      storage->setCurrentBlock(instr->currentBlock());
-      return; //only state update
-   }
-      break;
-   case TGSI_OPCODE_ENDIF: {
-      instr->endif();
-      storage->setCurrentBlock(instr->currentBlock());
-      return; //just update the state
-   }
-      break;
-   case TGSI_OPCODE_ENDLOOP:
-      break;
-   case TGSI_OPCODE_ENDREP:
-      break;
-   case TGSI_OPCODE_PUSHA:
-      break;
-   case TGSI_OPCODE_POPA:
-      break;
-   case TGSI_OPCODE_CEIL:
-      break;
-   case TGSI_OPCODE_I2F:
-      break;
-   case TGSI_OPCODE_NOT:
-      break;
-   case TGSI_OPCODE_TRUNC: {
-      out = instr->trunc(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_SHL:
-      break;
-   case TGSI_OPCODE_SHR:
-      break;
-   case TGSI_OPCODE_AND:
-      break;
-   case TGSI_OPCODE_OR:
-      break;
-   case TGSI_OPCODE_MOD:
-      break;
-   case TGSI_OPCODE_XOR:
-      break;
-   case TGSI_OPCODE_SAD:
-      break;
-   case TGSI_OPCODE_TXF:
-      break;
-   case TGSI_OPCODE_TXQ:
-      break;
-   case TGSI_OPCODE_CONT:
-      break;
-   case TGSI_OPCODE_EMIT:
-      break;
-   case TGSI_OPCODE_ENDPRIM:
-      break;
-   case TGSI_OPCODE_BGNLOOP2: {
-      instr->beginLoop();
-      storage->setCurrentBlock(instr->currentBlock());
-      return;
-   }
-      break;
-   case TGSI_OPCODE_BGNSUB: {
-      instr->bgnSub(instno);
-      storage->setCurrentBlock(instr->currentBlock());
-      storage->pushTemps();
-      return;
-   }
-      break;
-   case TGSI_OPCODE_ENDLOOP2: {
-      instr->endLoop();
-      storage->setCurrentBlock(instr->currentBlock());
-      return;
-   }
-      break;
-   case TGSI_OPCODE_ENDSUB: {
-      instr->endSub();
-      storage->setCurrentBlock(instr->currentBlock());
-      storage->popArguments();
-      storage->popTemps();
-      return;
-   }
-      break;
-   case TGSI_OPCODE_NOISE1:
-      break;
-   case TGSI_OPCODE_NOISE2:
-      break;
-   case TGSI_OPCODE_NOISE3:
-      break;
-   case TGSI_OPCODE_NOISE4:
-      break;
-   case TGSI_OPCODE_NOP:
-      break;
-   case TGSI_OPCODE_TEXBEM:
-      break;
-   case TGSI_OPCODE_TEXBEML:
-      break;
-   case TGSI_OPCODE_TEXREG2AR:
-      break;
-   case TGSI_OPCODE_TEXM3X2PAD:
-      break;
-   case TGSI_OPCODE_TEXM3X2TEX:
-      break;
-   case TGSI_OPCODE_TEXM3X3PAD:
-      break;
-   case TGSI_OPCODE_TEXM3X3TEX:
-      break;
-   case TGSI_OPCODE_TEXM3X3SPEC:
-      break;
-   case TGSI_OPCODE_TEXM3X3VSPEC:
-      break;
-   case TGSI_OPCODE_TEXREG2GB:
-      break;
-   case TGSI_OPCODE_TEXREG2RGB:
-      break;
-   case TGSI_OPCODE_TEXDP3TEX:
-      break;
-   case TGSI_OPCODE_TEXDP3:
-      break;
-   case TGSI_OPCODE_TEXM3X3:
-      break;
-   case TGSI_OPCODE_TEXM3X2DEPTH:
-      break;
-   case TGSI_OPCODE_TEXDEPTH:
-      break;
-   case TGSI_OPCODE_BEM:
-      break;
-   case TGSI_OPCODE_M4X3:
-      break;
-   case TGSI_OPCODE_M3X4:
-      break;
-   case TGSI_OPCODE_M3X3:
-      break;
-   case TGSI_OPCODE_M3X2:
-      break;
-   case TGSI_OPCODE_NRM4:
-      break;
-   case TGSI_OPCODE_CALLNZ:
-      break;
-   case TGSI_OPCODE_IFC:
-      break;
-   case TGSI_OPCODE_BREAKC:
-      break;
-   case TGSI_OPCODE_KIL:
-      break;
-   case TGSI_OPCODE_END:
-      instr->end();
-      return;
-      break;
-   default:
-      fprintf(stderr, "ERROR: Unknown opcode %d\n",
-              inst->Instruction.Opcode);
-      assert(0);
-      break;
-   }
-
-   if (!out) {
-      fprintf(stderr, "ERROR: unsupported opcode %d\n",
-              inst->Instruction.Opcode);
-      assert(!"Unsupported opcode");
-   }
-
-   /* # not sure if we need this */
-   switch( inst->Instruction.Saturate ) {
-   case TGSI_SAT_NONE:
-      break;
-   case TGSI_SAT_ZERO_ONE:
-      /*TXT( "_SAT" );*/
-      break;
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      /*TXT( "_SAT[-1,1]" );*/
-      break;
-   default:
-      assert( 0 );
-   }
-
-   /* store results  */
-   for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
-      struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
-
-      if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
-         storage->setOutputElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
-      } else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) {
-         storage->setTempElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
-      } else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) {
-         storage->setAddrElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
-      } else {
-         fprintf(stderr, "ERROR: unsupported LLVM destination!");
-         assert(!"wrong destination");
-      }
-   }
-}
-
-static llvm::Module *
-tgsi_to_llvm(struct gallivm_prog *prog, const struct tgsi_token *tokens)
-{
-   llvm::Module *mod = createBaseShader();
-   struct tgsi_parse_context parse;
-   struct tgsi_full_instruction fi;
-   struct tgsi_full_declaration fd;
-   unsigned instno = 0;
-   Function* shader = mod->getFunction("execute_shader");
-   std::ostringstream stream;
-   if (prog->type == GALLIVM_VS) {
-      stream << "vs_shader";
-   } else {
-      stream << "fs_shader";
-   }
-   stream << prog->id;
-   std::string func_name = stream.str();
-   shader->setName(func_name.c_str());
-
-   Function::arg_iterator args = shader->arg_begin();
-   Value *ptr_INPUT = args++;
-   ptr_INPUT->setName("input");
-
-   BasicBlock *label_entry = new BasicBlock("entry", shader, 0);
-
-   tgsi_parse_init(&parse, tokens);
-
-   fi = tgsi_default_full_instruction();
-   fd = tgsi_default_full_declaration();
-   Storage storage(label_entry, ptr_INPUT);
-   Instructions instr(mod, shader, label_entry, &storage);
-   while(!tgsi_parse_end_of_tokens(&parse)) {
-      tgsi_parse_token(&parse);
-
-      switch (parse.FullToken.Token.Type) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         translate_declaration(prog, mod, &storage,
-                               &parse.FullToken.FullDeclaration,
-                               &fd);
-         break;
-
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         translate_immediate(&storage,
-                             &parse.FullToken.FullImmediate);
-         break;
-
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         translate_instruction(mod, &storage, &instr,
-                               &parse.FullToken.FullInstruction,
-                               &fi, instno);
-         ++instno;
-         break;
-
-      default:
-         assert(0);
-      }
-   }
-
-   tgsi_parse_free(&parse);
-
-   prog->num_consts = storage.numConsts();
-   return mod;
-}
-
-/*!
-  Translates the TGSI tokens into LLVM format. Translated representation
-  is stored in the gallivm_prog and returned.
-  After calling this function the gallivm_prog can either be used with a custom
-  code generator to generate machine code for the GPU which the code generator
-  addresses or it can be jit compiled with gallivm_cpu_jit_compile and executed
-  with gallivm_prog_exec to run the module on the CPU.
- */
-struct gallivm_prog *
-gallivm_from_tgsi(const struct tgsi_token *tokens, enum gallivm_shader_type type)
-{
-   std::cout << "Creating llvm from: " <<std::endl;
-   ++GLOBAL_ID;
-   struct gallivm_prog *gallivm =
-      (struct gallivm_prog *)calloc(1, sizeof(struct gallivm_prog));
-   gallivm->id = GLOBAL_ID;
-   gallivm->type = type;
-   tgsi_dump(tokens, 0);
-
-   llvm::Module *mod = tgsi_to_llvm(gallivm, tokens);
-   gallivm->module = mod;
-   gallivm_prog_dump(gallivm, 0);
-
-   /* Run optimization passes over it */
-   PassManager passes;
-   passes.add(new TargetData(mod));
-   AddStandardCompilePasses(passes);
-   passes.run(*mod);
-
-   gallivm->module = mod;
-
-   gallivm_prog_dump(gallivm, 0);
-
-   return gallivm;
-}
-
-
 void gallivm_prog_delete(struct gallivm_prog *prog)
 {
-   llvm::Module *mod = static_cast<llvm::Module*>(prog->module);
-   delete mod;
+   delete prog->module;
    prog->module = 0;
    prog->function = 0;
    free(prog);
 }
 
-typedef void (*vertex_shader_runner)(float (*ainputs)[PIPE_MAX_SHADER_INPUTS][4],
-                                     float (*dests)[PIPE_MAX_SHADER_INPUTS][4],
+typedef void (*vertex_shader_runner)(void *ainputs,
+                                     void *dests,
                                      float (*aconsts)[4],
                                      int num_vertices,
                                      int num_inputs,
@@ -850,8 +160,8 @@ typedef void (*vertex_shader_runner)(float (*ainputs)[PIPE_MAX_SHADER_INPUTS][4]
   function.
  */
 int gallivm_prog_exec(struct gallivm_prog *prog,
-                      float (*inputs)[PIPE_MAX_SHADER_INPUTS][4],
-                      float (*dests)[PIPE_MAX_SHADER_INPUTS][4],
+                      struct tgsi_exec_vector       *inputs,
+                      struct tgsi_exec_vector       *dests,
                       float (*consts)[4],
                       int num_vertices,
                       int num_inputs,
@@ -943,18 +253,15 @@ int gallivm_fragment_shader_exec(struct gallivm_prog *prog,
                  samplers);
 }
 
-void gallivm_prog_dump(struct gallivm_prog *prog, const char *file_prefix)
+void gallivm_ir_dump(struct gallivm_ir *ir, const char *file_prefix)
 {
-   llvm::Module *mod;
-   if (!prog || !prog->module)
+   if (!ir || !ir->module)
       return;
 
-   mod = static_cast<llvm::Module*>(prog->module);
-
    if (file_prefix) {
       std::ostringstream stream;
       stream << file_prefix;
-      stream << prog->id;
+      stream << ir->id;
       stream << ".ll";
       std::string name = stream.str();
       std::ofstream out(name.c_str());
@@ -962,12 +269,12 @@ void gallivm_prog_dump(struct gallivm_prog *prog, const char *file_prefix)
          std::cerr<<"Can't open file : "<<stream.str()<<std::endl;;
          return;
       }
-      out << (*mod);
+      out << (*ir->module);
       out.close();
    } else {
-      const llvm::Module::FunctionListType &funcs = mod->getFunctionList();
+      const llvm::Module::FunctionListType &funcs = ir->module->getFunctionList();
       llvm::Module::FunctionListType::const_iterator itr;
-      std::cout<<"; ---------- Start shader "<<prog->id<<std::endl;
+      std::cout<<"; ---------- Start shader "<<ir->id<<std::endl;
       for (itr = funcs.begin(); itr != funcs.end(); ++itr) {
          const llvm::Function &func = (*itr);
          std::string name = func.getName();
@@ -980,7 +287,7 @@ void gallivm_prog_dump(struct gallivm_prog *prog, const char *file_prefix)
             std::cout<<*found<<std::endl;
          }
       }
-      std::cout<<"; ---------- End shader "<<prog->id<<std::endl;
+      std::cout<<"; ---------- End shader "<<ir->id<<std::endl;
    }
 }
 
@@ -1086,9 +393,63 @@ void gallivm_prog_inputs_interpolate(struct gallivm_prog *prog,
    }
 }
 
-#endif /* MESA_LLVM */
 
+struct gallivm_ir * gallivm_ir_new(enum gallivm_shader_type type)
+{
+   struct gallivm_ir *ir =
+      (struct gallivm_ir *)calloc(1, sizeof(struct gallivm_ir));
+   ++GLOBAL_ID;
+   ir->id   = GLOBAL_ID;
+   ir->type = type;
 
+   return ir;
+}
 
+void gallivm_ir_set_layout(struct gallivm_ir *ir,
+                           enum gallivm_vector_layout layout)
+{
+   ir->layout = layout;
+}
 
+void gallivm_ir_set_components(struct gallivm_ir *ir, int num)
+{
+   ir->num_components = num;
+}
 
+void gallivm_ir_fill_from_tgsi(struct gallivm_ir *ir,
+                               const struct tgsi_token *tokens)
+{
+   std::cout << "Creating llvm from: " <<std::endl;
+   tgsi_dump(tokens, 0);
+
+   llvm::Module *mod = tgsi_to_llvm(ir, tokens);
+   ir->module = mod;
+   gallivm_ir_dump(ir, 0);
+}
+
+void gallivm_ir_delete(struct gallivm_ir *ir)
+{
+   delete ir->module;
+   free(ir);
+}
+
+struct gallivm_prog * gallivm_ir_compile(struct gallivm_ir *ir)
+{
+   struct gallivm_prog *prog =
+      (struct gallivm_prog *)calloc(1, sizeof(struct gallivm_prog));
+   llvm::Module *mod = llvm::CloneModule(ir->module);
+   prog->num_consts = ir->num_consts;
+   memcpy(prog->interpolators, ir->interpolators, sizeof(prog->interpolators));
+   prog->num_interp = ir->num_interp;
+
+   /* Run optimization passes over it */
+   PassManager passes;
+   passes.add(new TargetData(mod));
+   AddStandardCompilePasses(passes);
+   passes.run(*mod);
+   prog->module = mod;
+
+   return prog;
+}
+
+#endif /* MESA_LLVM */
diff --git a/src/mesa/pipe/llvm/gallivm.h b/src/mesa/pipe/llvm/gallivm.h
index 4695de3127..f9f5d5ee74 100644
--- a/src/mesa/pipe/llvm/gallivm.h
+++ b/src/mesa/pipe/llvm/gallivm.h
@@ -43,21 +43,36 @@ extern "C" {
 
 struct tgsi_token;
 
+struct gallivm_ir;
 struct gallivm_prog;
 struct gallivm_cpu_engine;
 struct tgsi_interp_coef;
 struct tgsi_sampler;
+struct tgsi_exec_vector;
 
 enum gallivm_shader_type {
    GALLIVM_VS,
    GALLIVM_FS
 };
 
-struct gallivm_prog *gallivm_from_tgsi(const struct tgsi_token *tokens, enum gallivm_shader_type type);
-void gallivm_prog_delete(struct gallivm_prog *prog);
+enum gallivm_vector_layout {
+   GALLIVM_AOS,
+   GALLIVM_SOA
+};
+
+struct gallivm_ir *gallivm_ir_new(enum gallivm_shader_type type);
+void gallivm_ir_set_layout(struct gallivm_ir *prog,
+                           enum gallivm_vector_layout layout);
+void gallivm_ir_set_components(struct gallivm_ir *prog, int num);
+void gallivm_ir_fill_from_tgsi(struct gallivm_ir *prog,
+                               const struct tgsi_token *tokens);
+void gallivm_ir_delete(struct gallivm_ir *prog);
+
+struct gallivm_prog *gallivm_ir_compile(struct gallivm_ir *ir);
+
 int gallivm_prog_exec(struct gallivm_prog *prog,
-                      float (*inputs)[PIPE_MAX_SHADER_INPUTS][4],
-                      float (*dests)[PIPE_MAX_SHADER_INPUTS][4],
+                      struct tgsi_exec_vector       *inputs,
+                      struct tgsi_exec_vector       *dests,
                       float (*consts)[4],
                       int num_vertices,
                       int num_inputs,
diff --git a/src/mesa/pipe/llvm/gallivm_p.h b/src/mesa/pipe/llvm/gallivm_p.h
new file mode 100644
index 0000000000..2c6e5e8f5f
--- /dev/null
+++ b/src/mesa/pipe/llvm/gallivm_p.h
@@ -0,0 +1,56 @@
+#ifndef GALLIVM_P_H
+#define GALLIVM_P_H
+
+#ifdef MESA_LLVM
+
+namespace llvm {
+   class Module;
+}
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+enum gallivm_shader_type;
+enum gallivm_vector_layout;
+
+struct gallivm_interpolate {
+   int attrib;
+   int chan;
+   int type;
+};
+
+struct gallivm_ir {
+   llvm::Module *module;
+   int id;
+   enum gallivm_shader_type type;
+   enum gallivm_vector_layout layout;
+   int num_components;
+   int   num_consts;
+
+   //FIXME: this might not be enough for some shaders
+   struct gallivm_interpolate interpolators[32*4];
+   int   num_interp;
+};
+
+struct gallivm_prog {
+   llvm::Module *module;
+   void *function;
+
+   int   id;
+   enum gallivm_shader_type type;
+
+   int   num_consts;
+
+   //FIXME: this might not be enough for some shaders
+   struct gallivm_interpolate interpolators[32*4];
+   int   num_interp;
+};
+
+#endif /* MESA_LLVM */
+
+#if defined __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/src/mesa/pipe/llvm/llvm_base_shader.cpp b/src/mesa/pipe/llvm/llvm_base_shader.cpp
index b574b550ae..90a25a440a 100644
--- a/src/mesa/pipe/llvm/llvm_base_shader.cpp
+++ b/src/mesa/pipe/llvm/llvm_base_shader.cpp
@@ -104,8 +104,8 @@ Module* createBaseShader() {
     /*isVarArg=*/false);
   
   std::vector<const Type*>FuncTy_18_args;
-  FuncTy_18_args.push_back(PointerTy_12);
-  FuncTy_18_args.push_back(PointerTy_12);
+  FuncTy_18_args.push_back(PointerTy_9);
+  FuncTy_18_args.push_back(PointerTy_9);
   FuncTy_18_args.push_back(PointerTy_7);
   FuncTy_18_args.push_back(IntegerType::get(32));
   FuncTy_18_args.push_back(IntegerType::get(32));
@@ -526,14 +526,23 @@ Module* createBaseShader() {
     BasicBlock* label_forbody_preheader_i = new BasicBlock("forbody.preheader.i",func_run_vertex_shader,0);
     BasicBlock* label_forbody_i = new BasicBlock("forbody.i",func_run_vertex_shader,0);
     BasicBlock* label_from_consts_exit = new BasicBlock("from_consts.exit",func_run_vertex_shader,0);
-    BasicBlock* label_forbody_preheader_91 = new BasicBlock("forbody.preheader",func_run_vertex_shader,0);
-    BasicBlock* label_forbody_92 = new BasicBlock("forbody",func_run_vertex_shader,0);
-    BasicBlock* label_afterfor_93 = new BasicBlock("afterfor",func_run_vertex_shader,0);
     
     // Block entry (label_entry_90)
     AllocaInst* ptr_consts = new AllocaInst(ArrayTy_20, "consts", label_entry_90);
     AllocaInst* ptr_temps = new AllocaInst(ArrayTy_22, "temps", label_entry_90);
     AllocaInst* ptr_args = new AllocaInst(StructTy_struct_ShaderInput, "args", label_entry_90);
+    std::vector<Value*> ptr_tmp_indices;
+    ptr_tmp_indices.push_back(const_int32_29);
+    ptr_tmp_indices.push_back(const_int32_29);
+    Instruction* ptr_tmp = new GetElementPtrInst(ptr_args, ptr_tmp_indices.begin(), ptr_tmp_indices.end(), "tmp", label_entry_90);
+    CastInst* ptr_conv = new BitCastInst(ptr_results, PointerTy_0, "conv", label_entry_90);
+    StoreInst* void_91 = new StoreInst(ptr_conv, ptr_tmp, false, label_entry_90);
+    std::vector<Value*> ptr_tmp2_indices;
+    ptr_tmp2_indices.push_back(const_int32_29);
+    ptr_tmp2_indices.push_back(const_int32_31);
+    Instruction* ptr_tmp2 = new GetElementPtrInst(ptr_args, ptr_tmp2_indices.begin(), ptr_tmp2_indices.end(), "tmp2", label_entry_90);
+    CastInst* ptr_conv4 = new BitCastInst(ptr_inputs, PointerTy_0, "conv4", label_entry_90);
+    StoreInst* void_92 = new StoreInst(ptr_conv4, ptr_tmp2, false, label_entry_90);
     ICmpInst* int1_cmp_i = new ICmpInst(ICmpInst::ICMP_SGT, int32_num_consts, const_int32_29, "cmp.i", label_entry_90);
     new BranchInst(label_forbody_preheader_i, label_from_consts_exit, int1_cmp_i, label_entry_90);
     
@@ -544,17 +553,17 @@ Module* createBaseShader() {
     new BranchInst(label_forbody_i, label_forbody_preheader_i);
     
     // Block forbody.i (label_forbody_i)
-    Argument* fwdref_96 = new Argument(IntegerType::get(32));
+    Argument* fwdref_95 = new Argument(IntegerType::get(32));
     PHINode* int32_i_0_reg2mem_0_i = new PHINode(IntegerType::get(32), "i.0.reg2mem.0.i", label_forbody_i);
     int32_i_0_reg2mem_0_i->reserveOperandSpace(2);
     int32_i_0_reg2mem_0_i->addIncoming(const_int32_29, label_forbody_preheader_i);
-    int32_i_0_reg2mem_0_i->addIncoming(fwdref_96, label_forbody_i);
+    int32_i_0_reg2mem_0_i->addIncoming(fwdref_95, label_forbody_i);
     
-    Argument* fwdref_97 = new Argument(VectorTy_1);
+    Argument* fwdref_96 = new Argument(VectorTy_1);
     PHINode* packed_vec_0_reg2mem_0_i = new PHINode(VectorTy_1, "vec.0.reg2mem.0.i", label_forbody_i);
     packed_vec_0_reg2mem_0_i->reserveOperandSpace(2);
     packed_vec_0_reg2mem_0_i->addIncoming(const_packed_32, label_forbody_preheader_i);
-    packed_vec_0_reg2mem_0_i->addIncoming(fwdref_97, label_forbody_i);
+    packed_vec_0_reg2mem_0_i->addIncoming(fwdref_96, label_forbody_i);
     
     std::vector<Value*> ptr_arraydecay_i_indices;
     ptr_arraydecay_i_indices.push_back(int32_i_0_reg2mem_0_i);
@@ -584,80 +593,40 @@ Module* createBaseShader() {
     ptr_arrayidx34_i_indices.push_back(const_int32_29);
     ptr_arrayidx34_i_indices.push_back(int32_i_0_reg2mem_0_i);
     Instruction* ptr_arrayidx34_i = new GetElementPtrInst(ptr_consts, ptr_arrayidx34_i_indices.begin(), ptr_arrayidx34_i_indices.end(), "arrayidx34.i", label_forbody_i);
-    StoreInst* void_98 = new StoreInst(packed_tmp31_i, ptr_arrayidx34_i, false, label_forbody_i);
-    BinaryOperator* int32_indvar_next8 = BinaryOperator::create(Instruction::Add, int32_i_0_reg2mem_0_i, const_int32_31, "indvar.next8", label_forbody_i);
-    ICmpInst* int1_exitcond9 = new ICmpInst(ICmpInst::ICMP_EQ, int32_indvar_next8, int32_tmp10_i, "exitcond9", label_forbody_i);
-    new BranchInst(label_from_consts_exit, label_forbody_i, int1_exitcond9, label_forbody_i);
+    StoreInst* void_97 = new StoreInst(packed_tmp31_i, ptr_arrayidx34_i, false, label_forbody_i);
+    BinaryOperator* int32_indvar_next_98 = BinaryOperator::create(Instruction::Add, int32_i_0_reg2mem_0_i, const_int32_31, "indvar.next", label_forbody_i);
+    ICmpInst* int1_exitcond_99 = new ICmpInst(ICmpInst::ICMP_EQ, int32_indvar_next_98, int32_tmp10_i, "exitcond", label_forbody_i);
+    new BranchInst(label_from_consts_exit, label_forbody_i, int1_exitcond_99, label_forbody_i);
     
     // Block from_consts.exit (label_from_consts_exit)
-    std::vector<Value*> ptr_tmp2_indices;
-    ptr_tmp2_indices.push_back(const_int32_29);
-    ptr_tmp2_indices.push_back(const_int32_34);
-    Instruction* ptr_tmp2 = new GetElementPtrInst(ptr_args, ptr_tmp2_indices.begin(), ptr_tmp2_indices.end(), "tmp2", label_from_consts_exit);
-    std::vector<Value*> ptr_arraydecay3_indices;
-    ptr_arraydecay3_indices.push_back(const_int32_29);
-    ptr_arraydecay3_indices.push_back(const_int32_29);
-    Instruction* ptr_arraydecay3 = new GetElementPtrInst(ptr_consts, ptr_arraydecay3_indices.begin(), ptr_arraydecay3_indices.end(), "arraydecay3", label_from_consts_exit);
-    StoreInst* void_100 = new StoreInst(ptr_arraydecay3, ptr_tmp2, false, label_from_consts_exit);
-    std::vector<Value*> ptr_tmp4_indices;
-    ptr_tmp4_indices.push_back(const_int32_29);
-    ptr_tmp4_indices.push_back(const_int32_33);
-    Instruction* ptr_tmp4 = new GetElementPtrInst(ptr_args, ptr_tmp4_indices.begin(), ptr_tmp4_indices.end(), "tmp4", label_from_consts_exit);
-    std::vector<Value*> ptr_arraydecay5_indices;
-    ptr_arraydecay5_indices.push_back(const_int32_29);
-    ptr_arraydecay5_indices.push_back(const_int32_29);
-    Instruction* ptr_arraydecay5 = new GetElementPtrInst(ptr_temps, ptr_arraydecay5_indices.begin(), ptr_arraydecay5_indices.end(), "arraydecay5", label_from_consts_exit);
-    StoreInst* void_101 = new StoreInst(ptr_arraydecay5, ptr_tmp4, false, label_from_consts_exit);
-    ICmpInst* int1_cmp_102 = new ICmpInst(ICmpInst::ICMP_SGT, int32_num_vertices, const_int32_29, "cmp", label_from_consts_exit);
-    new BranchInst(label_forbody_preheader_91, label_afterfor_93, int1_cmp_102, label_from_consts_exit);
-    
-    // Block forbody.preheader (label_forbody_preheader_91)
-    std::vector<Value*> ptr_tmp8_indices;
-    ptr_tmp8_indices.push_back(const_int32_29);
-    ptr_tmp8_indices.push_back(const_int32_29);
-    Instruction* ptr_tmp8 = new GetElementPtrInst(ptr_args, ptr_tmp8_indices.begin(), ptr_tmp8_indices.end(), "tmp8", label_forbody_preheader_91);
-    std::vector<Value*> ptr_tmp12_indices;
-    ptr_tmp12_indices.push_back(const_int32_29);
-    ptr_tmp12_indices.push_back(const_int32_31);
-    Instruction* ptr_tmp12 = new GetElementPtrInst(ptr_args, ptr_tmp12_indices.begin(), ptr_tmp12_indices.end(), "tmp12", label_forbody_preheader_91);
-    BinaryOperator* int32_tmp_104 = BinaryOperator::create(Instruction::Add, int32_num_vertices, const_int32_30, "tmp", label_forbody_preheader_91);
-    ICmpInst* int1_tmp6 = new ICmpInst(ICmpInst::ICMP_SLT, int32_tmp_104, const_int32_29, "tmp6", label_forbody_preheader_91);
-    SelectInst* int32_tmp7 = new SelectInst(int1_tmp6, const_int32_31, int32_num_vertices, "tmp7", label_forbody_preheader_91);
-    new BranchInst(label_forbody_92, label_forbody_preheader_91);
-    
-    // Block forbody (label_forbody_92)
-    Argument* fwdref_107 = new Argument(IntegerType::get(32));
-    PHINode* int32_i_0_reg2mem_0_106 = new PHINode(IntegerType::get(32), "i.0.reg2mem.0", label_forbody_92);
-    int32_i_0_reg2mem_0_106->reserveOperandSpace(2);
-    int32_i_0_reg2mem_0_106->addIncoming(const_int32_29, label_forbody_preheader_91);
-    int32_i_0_reg2mem_0_106->addIncoming(fwdref_107, label_forbody_92);
-    
-    std::vector<Value*> ptr_arraydecay11_108_indices;
-    ptr_arraydecay11_108_indices.push_back(int32_i_0_reg2mem_0_106);
-    ptr_arraydecay11_108_indices.push_back(const_int32_29);
-    Instruction* ptr_arraydecay11_108 = new GetElementPtrInst(ptr_results, ptr_arraydecay11_108_indices.begin(), ptr_arraydecay11_108_indices.end(), "arraydecay11", label_forbody_92);
-    StoreInst* void_109 = new StoreInst(ptr_arraydecay11_108, ptr_tmp8, false, label_forbody_92);
-    std::vector<Value*> ptr_arraydecay16_indices;
-    ptr_arraydecay16_indices.push_back(int32_i_0_reg2mem_0_106);
-    ptr_arraydecay16_indices.push_back(const_int32_29);
-    Instruction* ptr_arraydecay16 = new GetElementPtrInst(ptr_inputs, ptr_arraydecay16_indices.begin(), ptr_arraydecay16_indices.end(), "arraydecay16", label_forbody_92);
-    StoreInst* void_110 = new StoreInst(ptr_arraydecay16, ptr_tmp12, false, label_forbody_92);
-    CallInst* void_111 = new CallInst(func_execute_shader, ptr_args, "", label_forbody_92);
-    void_111->setCallingConv(CallingConv::C);
-    void_111->setTailCall(false);const ParamAttrsList *void_111_PAL = 0;
-    void_111->setParamAttrs(void_111_PAL);
-    
-    BinaryOperator* int32_indvar_next_112 = BinaryOperator::create(Instruction::Add, int32_i_0_reg2mem_0_106, const_int32_31, "indvar.next", label_forbody_92);
-    ICmpInst* int1_exitcond_113 = new ICmpInst(ICmpInst::ICMP_EQ, int32_indvar_next_112, int32_tmp7, "exitcond", label_forbody_92);
-    new BranchInst(label_afterfor_93, label_forbody_92, int1_exitcond_113, label_forbody_92);
-    
-    // Block afterfor (label_afterfor_93)
-    new ReturnInst(label_afterfor_93);
+    std::vector<Value*> ptr_tmp7_indices;
+    ptr_tmp7_indices.push_back(const_int32_29);
+    ptr_tmp7_indices.push_back(const_int32_34);
+    Instruction* ptr_tmp7 = new GetElementPtrInst(ptr_args, ptr_tmp7_indices.begin(), ptr_tmp7_indices.end(), "tmp7", label_from_consts_exit);
+    std::vector<Value*> ptr_arraydecay8_indices;
+    ptr_arraydecay8_indices.push_back(const_int32_29);
+    ptr_arraydecay8_indices.push_back(const_int32_29);
+    Instruction* ptr_arraydecay8 = new GetElementPtrInst(ptr_consts, ptr_arraydecay8_indices.begin(), ptr_arraydecay8_indices.end(), "arraydecay8", label_from_consts_exit);
+    StoreInst* void_101 = new StoreInst(ptr_arraydecay8, ptr_tmp7, false, label_from_consts_exit);
+    std::vector<Value*> ptr_tmp9_indices;
+    ptr_tmp9_indices.push_back(const_int32_29);
+    ptr_tmp9_indices.push_back(const_int32_33);
+    Instruction* ptr_tmp9 = new GetElementPtrInst(ptr_args, ptr_tmp9_indices.begin(), ptr_tmp9_indices.end(), "tmp9", label_from_consts_exit);
+    std::vector<Value*> ptr_arraydecay10_indices;
+    ptr_arraydecay10_indices.push_back(const_int32_29);
+    ptr_arraydecay10_indices.push_back(const_int32_29);
+    Instruction* ptr_arraydecay10 = new GetElementPtrInst(ptr_temps, ptr_arraydecay10_indices.begin(), ptr_arraydecay10_indices.end(), "arraydecay10", label_from_consts_exit);
+    StoreInst* void_102 = new StoreInst(ptr_arraydecay10, ptr_tmp9, false, label_from_consts_exit);
+    CallInst* void_103 = new CallInst(func_execute_shader, ptr_args, "", label_from_consts_exit);
+    void_103->setCallingConv(CallingConv::C);
+    void_103->setTailCall(false);const ParamAttrsList *void_103_PAL = 0;
+    void_103->setParamAttrs(void_103_PAL);
+    
+    new ReturnInst(label_from_consts_exit);
     
     // Resolve Forward References
-    fwdref_107->replaceAllUsesWith(int32_indvar_next_112); delete fwdref_107;
-    fwdref_97->replaceAllUsesWith(packed_tmp31_i); delete fwdref_97;
-    fwdref_96->replaceAllUsesWith(int32_indvar_next8); delete fwdref_96;
+    fwdref_96->replaceAllUsesWith(packed_tmp31_i); delete fwdref_96;
+    fwdref_95->replaceAllUsesWith(int32_indvar_next_98); delete fwdref_95;
     
   }
   
@@ -668,195 +637,195 @@ Module* createBaseShader() {
     float_x->setName("x");
     Value* float_y = args++;
     float_y->setName("y");
-    Value* ptr_results_116 = args++;
-    ptr_results_116->setName("results");
-    Value* ptr_inputs_117 = args++;
-    ptr_inputs_117->setName("inputs");
-    Value* int32_num_inputs_118 = args++;
-    int32_num_inputs_118->setName("num_inputs");
-    Value* ptr_aconsts_119 = args++;
-    ptr_aconsts_119->setName("aconsts");
-    Value* int32_num_consts_120 = args++;
-    int32_num_consts_120->setName("num_consts");
+    Value* ptr_results_105 = args++;
+    ptr_results_105->setName("results");
+    Value* ptr_inputs_106 = args++;
+    ptr_inputs_106->setName("inputs");
+    Value* int32_num_inputs_107 = args++;
+    int32_num_inputs_107->setName("num_inputs");
+    Value* ptr_aconsts_108 = args++;
+    ptr_aconsts_108->setName("aconsts");
+    Value* int32_num_consts_109 = args++;
+    int32_num_consts_109->setName("num_consts");
     Value* ptr_samplers = args++;
     ptr_samplers->setName("samplers");
     
-    BasicBlock* label_entry_121 = new BasicBlock("entry",func_run_fragment_shader,0);
-    BasicBlock* label_forbody_preheader_i_122 = new BasicBlock("forbody.preheader.i",func_run_fragment_shader,0);
-    BasicBlock* label_forbody_i_123 = new BasicBlock("forbody.i",func_run_fragment_shader,0);
-    BasicBlock* label_from_consts_exit_124 = new BasicBlock("from_consts.exit",func_run_fragment_shader,0);
-    
-    // Block entry (label_entry_121)
-    AllocaInst* ptr_consts_125 = new AllocaInst(ArrayTy_20, "consts", label_entry_121);
-    AllocaInst* ptr_temps_126 = new AllocaInst(ArrayTy_22, "temps", label_entry_121);
-    AllocaInst* ptr_args_127 = new AllocaInst(StructTy_struct_ShaderInput, "args", label_entry_121);
-    std::vector<Value*> ptr_tmp_indices;
-    ptr_tmp_indices.push_back(const_int32_29);
-    ptr_tmp_indices.push_back(const_int32_35);
-    Instruction* ptr_tmp = new GetElementPtrInst(ptr_args_127, ptr_tmp_indices.begin(), ptr_tmp_indices.end(), "tmp", label_entry_121);
-    StoreInst* void_128 = new StoreInst(const_int32_29, ptr_tmp, false, label_entry_121);
-    ICmpInst* int1_cmp_i_129 = new ICmpInst(ICmpInst::ICMP_SGT, int32_num_consts_120, const_int32_29, "cmp.i", label_entry_121);
-    new BranchInst(label_forbody_preheader_i_122, label_from_consts_exit_124, int1_cmp_i_129, label_entry_121);
-    
-    // Block forbody.preheader.i (label_forbody_preheader_i_122)
-    BinaryOperator* int32_tmp_i_131 = BinaryOperator::create(Instruction::Add, int32_num_consts_120, const_int32_30, "tmp.i", label_forbody_preheader_i_122);
-    ICmpInst* int1_tmp9_i_132 = new ICmpInst(ICmpInst::ICMP_SLT, int32_tmp_i_131, const_int32_29, "tmp9.i", label_forbody_preheader_i_122);
-    SelectInst* int32_tmp10_i_133 = new SelectInst(int1_tmp9_i_132, const_int32_31, int32_num_consts_120, "tmp10.i", label_forbody_preheader_i_122);
-    new BranchInst(label_forbody_i_123, label_forbody_preheader_i_122);
-    
-    // Block forbody.i (label_forbody_i_123)
-    Argument* fwdref_136 = new Argument(IntegerType::get(32));
-    PHINode* int32_i_0_reg2mem_0_i_135 = new PHINode(IntegerType::get(32), "i.0.reg2mem.0.i", label_forbody_i_123);
-    int32_i_0_reg2mem_0_i_135->reserveOperandSpace(2);
-    int32_i_0_reg2mem_0_i_135->addIncoming(const_int32_29, label_forbody_preheader_i_122);
-    int32_i_0_reg2mem_0_i_135->addIncoming(fwdref_136, label_forbody_i_123);
-    
-    Argument* fwdref_138 = new Argument(VectorTy_1);
-    PHINode* packed_vec_0_reg2mem_0_i_137 = new PHINode(VectorTy_1, "vec.0.reg2mem.0.i", label_forbody_i_123);
-    packed_vec_0_reg2mem_0_i_137->reserveOperandSpace(2);
-    packed_vec_0_reg2mem_0_i_137->addIncoming(const_packed_32, label_forbody_preheader_i_122);
-    packed_vec_0_reg2mem_0_i_137->addIncoming(fwdref_138, label_forbody_i_123);
-    
-    std::vector<Value*> ptr_arraydecay_i_139_indices;
-    ptr_arraydecay_i_139_indices.push_back(int32_i_0_reg2mem_0_i_135);
-    ptr_arraydecay_i_139_indices.push_back(const_int32_29);
-    Instruction* ptr_arraydecay_i_139 = new GetElementPtrInst(ptr_aconsts_119, ptr_arraydecay_i_139_indices.begin(), ptr_arraydecay_i_139_indices.end(), "arraydecay.i", label_forbody_i_123);
-    LoadInst* float_tmp5_i_140 = new LoadInst(ptr_arraydecay_i_139, "tmp5.i", false, label_forbody_i_123);
-    InsertElementInst* packed_tmp7_i_141 = new InsertElementInst(packed_vec_0_reg2mem_0_i_137, float_tmp5_i_140, const_int32_29, "tmp7.i", label_forbody_i_123);
-    std::vector<Value*> ptr_arrayidx12_i_142_indices;
-    ptr_arrayidx12_i_142_indices.push_back(int32_i_0_reg2mem_0_i_135);
-    ptr_arrayidx12_i_142_indices.push_back(const_int32_31);
-    Instruction* ptr_arrayidx12_i_142 = new GetElementPtrInst(ptr_aconsts_119, ptr_arrayidx12_i_142_indices.begin(), ptr_arrayidx12_i_142_indices.end(), "arrayidx12.i", label_forbody_i_123);
-    LoadInst* float_tmp13_i_143 = new LoadInst(ptr_arrayidx12_i_142, "tmp13.i", false, label_forbody_i_123);
-    InsertElementInst* packed_tmp15_i_144 = new InsertElementInst(packed_tmp7_i_141, float_tmp13_i_143, const_int32_31, "tmp15.i", label_forbody_i_123);
-    std::vector<Value*> ptr_arrayidx20_i_145_indices;
-    ptr_arrayidx20_i_145_indices.push_back(int32_i_0_reg2mem_0_i_135);
-    ptr_arrayidx20_i_145_indices.push_back(const_int32_33);
-    Instruction* ptr_arrayidx20_i_145 = new GetElementPtrInst(ptr_aconsts_119, ptr_arrayidx20_i_145_indices.begin(), ptr_arrayidx20_i_145_indices.end(), "arrayidx20.i", label_forbody_i_123);
-    LoadInst* float_tmp21_i_146 = new LoadInst(ptr_arrayidx20_i_145, "tmp21.i", false, label_forbody_i_123);
-    InsertElementInst* packed_tmp23_i_147 = new InsertElementInst(packed_tmp15_i_144, float_tmp21_i_146, const_int32_33, "tmp23.i", label_forbody_i_123);
-    std::vector<Value*> ptr_arrayidx28_i_148_indices;
-    ptr_arrayidx28_i_148_indices.push_back(int32_i_0_reg2mem_0_i_135);
-    ptr_arrayidx28_i_148_indices.push_back(const_int32_34);
-    Instruction* ptr_arrayidx28_i_148 = new GetElementPtrInst(ptr_aconsts_119, ptr_arrayidx28_i_148_indices.begin(), ptr_arrayidx28_i_148_indices.end(), "arrayidx28.i", label_forbody_i_123);
-    LoadInst* float_tmp29_i_149 = new LoadInst(ptr_arrayidx28_i_148, "tmp29.i", false, label_forbody_i_123);
-    InsertElementInst* packed_tmp31_i_150 = new InsertElementInst(packed_tmp23_i_147, float_tmp29_i_149, const_int32_34, "tmp31.i", label_forbody_i_123);
-    std::vector<Value*> ptr_arrayidx34_i_151_indices;
-    ptr_arrayidx34_i_151_indices.push_back(const_int32_29);
-    ptr_arrayidx34_i_151_indices.push_back(int32_i_0_reg2mem_0_i_135);
-    Instruction* ptr_arrayidx34_i_151 = new GetElementPtrInst(ptr_consts_125, ptr_arrayidx34_i_151_indices.begin(), ptr_arrayidx34_i_151_indices.end(), "arrayidx34.i", label_forbody_i_123);
-    StoreInst* void_152 = new StoreInst(packed_tmp31_i_150, ptr_arrayidx34_i_151, false, label_forbody_i_123);
-    BinaryOperator* int32_indvar_next7 = BinaryOperator::create(Instruction::Add, int32_i_0_reg2mem_0_i_135, const_int32_31, "indvar.next7", label_forbody_i_123);
-    ICmpInst* int1_exitcond8 = new ICmpInst(ICmpInst::ICMP_EQ, int32_indvar_next7, int32_tmp10_i_133, "exitcond8", label_forbody_i_123);
-    new BranchInst(label_from_consts_exit_124, label_forbody_i_123, int1_exitcond8, label_forbody_i_123);
-    
-    // Block from_consts.exit (label_from_consts_exit_124)
+    BasicBlock* label_entry_110 = new BasicBlock("entry",func_run_fragment_shader,0);
+    BasicBlock* label_forbody_preheader_i_111 = new BasicBlock("forbody.preheader.i",func_run_fragment_shader,0);
+    BasicBlock* label_forbody_i_112 = new BasicBlock("forbody.i",func_run_fragment_shader,0);
+    BasicBlock* label_from_consts_exit_113 = new BasicBlock("from_consts.exit",func_run_fragment_shader,0);
+    
+    // Block entry (label_entry_110)
+    AllocaInst* ptr_consts_114 = new AllocaInst(ArrayTy_20, "consts", label_entry_110);
+    AllocaInst* ptr_temps_115 = new AllocaInst(ArrayTy_22, "temps", label_entry_110);
+    AllocaInst* ptr_args_116 = new AllocaInst(StructTy_struct_ShaderInput, "args", label_entry_110);
+    std::vector<Value*> ptr_tmp_117_indices;
+    ptr_tmp_117_indices.push_back(const_int32_29);
+    ptr_tmp_117_indices.push_back(const_int32_35);
+    Instruction* ptr_tmp_117 = new GetElementPtrInst(ptr_args_116, ptr_tmp_117_indices.begin(), ptr_tmp_117_indices.end(), "tmp", label_entry_110);
+    StoreInst* void_118 = new StoreInst(const_int32_29, ptr_tmp_117, false, label_entry_110);
+    ICmpInst* int1_cmp_i_119 = new ICmpInst(ICmpInst::ICMP_SGT, int32_num_consts_109, const_int32_29, "cmp.i", label_entry_110);
+    new BranchInst(label_forbody_preheader_i_111, label_from_consts_exit_113, int1_cmp_i_119, label_entry_110);
+    
+    // Block forbody.preheader.i (label_forbody_preheader_i_111)
+    BinaryOperator* int32_tmp_i_121 = BinaryOperator::create(Instruction::Add, int32_num_consts_109, const_int32_30, "tmp.i", label_forbody_preheader_i_111);
+    ICmpInst* int1_tmp9_i_122 = new ICmpInst(ICmpInst::ICMP_SLT, int32_tmp_i_121, const_int32_29, "tmp9.i", label_forbody_preheader_i_111);
+    SelectInst* int32_tmp10_i_123 = new SelectInst(int1_tmp9_i_122, const_int32_31, int32_num_consts_109, "tmp10.i", label_forbody_preheader_i_111);
+    new BranchInst(label_forbody_i_112, label_forbody_preheader_i_111);
+    
+    // Block forbody.i (label_forbody_i_112)
+    Argument* fwdref_126 = new Argument(IntegerType::get(32));
+    PHINode* int32_i_0_reg2mem_0_i_125 = new PHINode(IntegerType::get(32), "i.0.reg2mem.0.i", label_forbody_i_112);
+    int32_i_0_reg2mem_0_i_125->reserveOperandSpace(2);
+    int32_i_0_reg2mem_0_i_125->addIncoming(const_int32_29, label_forbody_preheader_i_111);
+    int32_i_0_reg2mem_0_i_125->addIncoming(fwdref_126, label_forbody_i_112);
+    
+    Argument* fwdref_128 = new Argument(VectorTy_1);
+    PHINode* packed_vec_0_reg2mem_0_i_127 = new PHINode(VectorTy_1, "vec.0.reg2mem.0.i", label_forbody_i_112);
+    packed_vec_0_reg2mem_0_i_127->reserveOperandSpace(2);
+    packed_vec_0_reg2mem_0_i_127->addIncoming(const_packed_32, label_forbody_preheader_i_111);
+    packed_vec_0_reg2mem_0_i_127->addIncoming(fwdref_128, label_forbody_i_112);
+    
+    std::vector<Value*> ptr_arraydecay_i_129_indices;
+    ptr_arraydecay_i_129_indices.push_back(int32_i_0_reg2mem_0_i_125);
+    ptr_arraydecay_i_129_indices.push_back(const_int32_29);
+    Instruction* ptr_arraydecay_i_129 = new GetElementPtrInst(ptr_aconsts_108, ptr_arraydecay_i_129_indices.begin(), ptr_arraydecay_i_129_indices.end(), "arraydecay.i", label_forbody_i_112);
+    LoadInst* float_tmp5_i_130 = new LoadInst(ptr_arraydecay_i_129, "tmp5.i", false, label_forbody_i_112);
+    InsertElementInst* packed_tmp7_i_131 = new InsertElementInst(packed_vec_0_reg2mem_0_i_127, float_tmp5_i_130, const_int32_29, "tmp7.i", label_forbody_i_112);
+    std::vector<Value*> ptr_arrayidx12_i_132_indices;
+    ptr_arrayidx12_i_132_indices.push_back(int32_i_0_reg2mem_0_i_125);
+    ptr_arrayidx12_i_132_indices.push_back(const_int32_31);
+    Instruction* ptr_arrayidx12_i_132 = new GetElementPtrInst(ptr_aconsts_108, ptr_arrayidx12_i_132_indices.begin(), ptr_arrayidx12_i_132_indices.end(), "arrayidx12.i", label_forbody_i_112);
+    LoadInst* float_tmp13_i_133 = new LoadInst(ptr_arrayidx12_i_132, "tmp13.i", false, label_forbody_i_112);
+    InsertElementInst* packed_tmp15_i_134 = new InsertElementInst(packed_tmp7_i_131, float_tmp13_i_133, const_int32_31, "tmp15.i", label_forbody_i_112);
+    std::vector<Value*> ptr_arrayidx20_i_135_indices;
+    ptr_arrayidx20_i_135_indices.push_back(int32_i_0_reg2mem_0_i_125);
+    ptr_arrayidx20_i_135_indices.push_back(const_int32_33);
+    Instruction* ptr_arrayidx20_i_135 = new GetElementPtrInst(ptr_aconsts_108, ptr_arrayidx20_i_135_indices.begin(), ptr_arrayidx20_i_135_indices.end(), "arrayidx20.i", label_forbody_i_112);
+    LoadInst* float_tmp21_i_136 = new LoadInst(ptr_arrayidx20_i_135, "tmp21.i", false, label_forbody_i_112);
+    InsertElementInst* packed_tmp23_i_137 = new InsertElementInst(packed_tmp15_i_134, float_tmp21_i_136, const_int32_33, "tmp23.i", label_forbody_i_112);
+    std::vector<Value*> ptr_arrayidx28_i_138_indices;
+    ptr_arrayidx28_i_138_indices.push_back(int32_i_0_reg2mem_0_i_125);
+    ptr_arrayidx28_i_138_indices.push_back(const_int32_34);
+    Instruction* ptr_arrayidx28_i_138 = new GetElementPtrInst(ptr_aconsts_108, ptr_arrayidx28_i_138_indices.begin(), ptr_arrayidx28_i_138_indices.end(), "arrayidx28.i", label_forbody_i_112);
+    LoadInst* float_tmp29_i_139 = new LoadInst(ptr_arrayidx28_i_138, "tmp29.i", false, label_forbody_i_112);
+    InsertElementInst* packed_tmp31_i_140 = new InsertElementInst(packed_tmp23_i_137, float_tmp29_i_139, const_int32_34, "tmp31.i", label_forbody_i_112);
+    std::vector<Value*> ptr_arrayidx34_i_141_indices;
+    ptr_arrayidx34_i_141_indices.push_back(const_int32_29);
+    ptr_arrayidx34_i_141_indices.push_back(int32_i_0_reg2mem_0_i_125);
+    Instruction* ptr_arrayidx34_i_141 = new GetElementPtrInst(ptr_consts_114, ptr_arrayidx34_i_141_indices.begin(), ptr_arrayidx34_i_141_indices.end(), "arrayidx34.i", label_forbody_i_112);
+    StoreInst* void_142 = new StoreInst(packed_tmp31_i_140, ptr_arrayidx34_i_141, false, label_forbody_i_112);
+    BinaryOperator* int32_indvar_next7 = BinaryOperator::create(Instruction::Add, int32_i_0_reg2mem_0_i_125, const_int32_31, "indvar.next7", label_forbody_i_112);
+    ICmpInst* int1_exitcond8 = new ICmpInst(ICmpInst::ICMP_EQ, int32_indvar_next7, int32_tmp10_i_123, "exitcond8", label_forbody_i_112);
+    new BranchInst(label_from_consts_exit_113, label_forbody_i_112, int1_exitcond8, label_forbody_i_112);
+    
+    // Block from_consts.exit (label_from_consts_exit_113)
     std::vector<Value*> ptr_tmp3_indices;
     ptr_tmp3_indices.push_back(const_int32_29);
     ptr_tmp3_indices.push_back(const_int32_34);
-    Instruction* ptr_tmp3 = new GetElementPtrInst(ptr_args_127, ptr_tmp3_indices.begin(), ptr_tmp3_indices.end(), "tmp3", label_from_consts_exit_124);
+    Instruction* ptr_tmp3 = new GetElementPtrInst(ptr_args_116, ptr_tmp3_indices.begin(), ptr_tmp3_indices.end(), "tmp3", label_from_consts_exit_113);
     std::vector<Value*> ptr_arraydecay4_indices;
     ptr_arraydecay4_indices.push_back(const_int32_29);
     ptr_arraydecay4_indices.push_back(const_int32_29);
-    Instruction* ptr_arraydecay4 = new GetElementPtrInst(ptr_consts_125, ptr_arraydecay4_indices.begin(), ptr_arraydecay4_indices.end(), "arraydecay4", label_from_consts_exit_124);
-    StoreInst* void_154 = new StoreInst(ptr_arraydecay4, ptr_tmp3, false, label_from_consts_exit_124);
+    Instruction* ptr_arraydecay4 = new GetElementPtrInst(ptr_consts_114, ptr_arraydecay4_indices.begin(), ptr_arraydecay4_indices.end(), "arraydecay4", label_from_consts_exit_113);
+    StoreInst* void_144 = new StoreInst(ptr_arraydecay4, ptr_tmp3, false, label_from_consts_exit_113);
     std::vector<Value*> ptr_tmp5_indices;
     ptr_tmp5_indices.push_back(const_int32_29);
     ptr_tmp5_indices.push_back(const_int32_33);
-    Instruction* ptr_tmp5 = new GetElementPtrInst(ptr_args_127, ptr_tmp5_indices.begin(), ptr_tmp5_indices.end(), "tmp5", label_from_consts_exit_124);
+    Instruction* ptr_tmp5 = new GetElementPtrInst(ptr_args_116, ptr_tmp5_indices.begin(), ptr_tmp5_indices.end(), "tmp5", label_from_consts_exit_113);
     std::vector<Value*> ptr_arraydecay6_indices;
     ptr_arraydecay6_indices.push_back(const_int32_29);
     ptr_arraydecay6_indices.push_back(const_int32_29);
-    Instruction* ptr_arraydecay6 = new GetElementPtrInst(ptr_temps_126, ptr_arraydecay6_indices.begin(), ptr_arraydecay6_indices.end(), "arraydecay6", label_from_consts_exit_124);
-    StoreInst* void_155 = new StoreInst(ptr_arraydecay6, ptr_tmp5, false, label_from_consts_exit_124);
-    std::vector<Value*> ptr_tmp8_156_indices;
-    ptr_tmp8_156_indices.push_back(const_int32_29);
-    ptr_tmp8_156_indices.push_back(const_int32_31);
-    Instruction* ptr_tmp8_156 = new GetElementPtrInst(ptr_args_127, ptr_tmp8_156_indices.begin(), ptr_tmp8_156_indices.end(), "tmp8", label_from_consts_exit_124);
-    std::vector<Value*> ptr_tmp12_157_indices;
-    ptr_tmp12_157_indices.push_back(const_int32_29);
-    ptr_tmp12_157_indices.push_back(const_int32_29);
-    Instruction* ptr_tmp12_157 = new GetElementPtrInst(ptr_args_127, ptr_tmp12_157_indices.begin(), ptr_tmp12_157_indices.end(), "tmp12", label_from_consts_exit_124);
-    std::vector<Value*> ptr_arraydecay11_158_indices;
-    ptr_arraydecay11_158_indices.push_back(const_int32_29);
-    ptr_arraydecay11_158_indices.push_back(const_int32_29);
-    Instruction* ptr_arraydecay11_158 = new GetElementPtrInst(ptr_inputs_117, ptr_arraydecay11_158_indices.begin(), ptr_arraydecay11_158_indices.end(), "arraydecay11", label_from_consts_exit_124);
-    StoreInst* void_159 = new StoreInst(ptr_arraydecay11_158, ptr_tmp8_156, false, label_from_consts_exit_124);
-    std::vector<Value*> ptr_arraydecay16_160_indices;
-    ptr_arraydecay16_160_indices.push_back(const_int32_29);
-    ptr_arraydecay16_160_indices.push_back(const_int32_29);
-    Instruction* ptr_arraydecay16_160 = new GetElementPtrInst(ptr_results_116, ptr_arraydecay16_160_indices.begin(), ptr_arraydecay16_160_indices.end(), "arraydecay16", label_from_consts_exit_124);
-    StoreInst* void_161 = new StoreInst(ptr_arraydecay16_160, ptr_tmp12_157, false, label_from_consts_exit_124);
-    StoreInst* void_162 = new StoreInst(const_int32_29, ptr_tmp, false, label_from_consts_exit_124);
-    CallInst* void_163 = new CallInst(func_execute_shader, ptr_args_127, "", label_from_consts_exit_124);
-    void_163->setCallingConv(CallingConv::C);
-    void_163->setTailCall(false);const ParamAttrsList *void_163_PAL = 0;
-    void_163->setParamAttrs(void_163_PAL);
-    
-    LoadInst* int32_tmp23 = new LoadInst(ptr_tmp, "tmp23", false, label_from_consts_exit_124);
+    Instruction* ptr_arraydecay6 = new GetElementPtrInst(ptr_temps_115, ptr_arraydecay6_indices.begin(), ptr_arraydecay6_indices.end(), "arraydecay6", label_from_consts_exit_113);
+    StoreInst* void_145 = new StoreInst(ptr_arraydecay6, ptr_tmp5, false, label_from_consts_exit_113);
+    std::vector<Value*> ptr_tmp8_indices;
+    ptr_tmp8_indices.push_back(const_int32_29);
+    ptr_tmp8_indices.push_back(const_int32_31);
+    Instruction* ptr_tmp8 = new GetElementPtrInst(ptr_args_116, ptr_tmp8_indices.begin(), ptr_tmp8_indices.end(), "tmp8", label_from_consts_exit_113);
+    std::vector<Value*> ptr_tmp12_indices;
+    ptr_tmp12_indices.push_back(const_int32_29);
+    ptr_tmp12_indices.push_back(const_int32_29);
+    Instruction* ptr_tmp12 = new GetElementPtrInst(ptr_args_116, ptr_tmp12_indices.begin(), ptr_tmp12_indices.end(), "tmp12", label_from_consts_exit_113);
+    std::vector<Value*> ptr_arraydecay11_146_indices;
+    ptr_arraydecay11_146_indices.push_back(const_int32_29);
+    ptr_arraydecay11_146_indices.push_back(const_int32_29);
+    Instruction* ptr_arraydecay11_146 = new GetElementPtrInst(ptr_inputs_106, ptr_arraydecay11_146_indices.begin(), ptr_arraydecay11_146_indices.end(), "arraydecay11", label_from_consts_exit_113);
+    StoreInst* void_147 = new StoreInst(ptr_arraydecay11_146, ptr_tmp8, false, label_from_consts_exit_113);
+    std::vector<Value*> ptr_arraydecay16_indices;
+    ptr_arraydecay16_indices.push_back(const_int32_29);
+    ptr_arraydecay16_indices.push_back(const_int32_29);
+    Instruction* ptr_arraydecay16 = new GetElementPtrInst(ptr_results_105, ptr_arraydecay16_indices.begin(), ptr_arraydecay16_indices.end(), "arraydecay16", label_from_consts_exit_113);
+    StoreInst* void_148 = new StoreInst(ptr_arraydecay16, ptr_tmp12, false, label_from_consts_exit_113);
+    StoreInst* void_149 = new StoreInst(const_int32_29, ptr_tmp_117, false, label_from_consts_exit_113);
+    CallInst* void_150 = new CallInst(func_execute_shader, ptr_args_116, "", label_from_consts_exit_113);
+    void_150->setCallingConv(CallingConv::C);
+    void_150->setTailCall(false);const ParamAttrsList *void_150_PAL = 0;
+    void_150->setParamAttrs(void_150_PAL);
+    
+    LoadInst* int32_tmp23 = new LoadInst(ptr_tmp_117, "tmp23", false, label_from_consts_exit_113);
     std::vector<Value*> ptr_arraydecay11_1_indices;
     ptr_arraydecay11_1_indices.push_back(const_int32_31);
     ptr_arraydecay11_1_indices.push_back(const_int32_29);
-    Instruction* ptr_arraydecay11_1 = new GetElementPtrInst(ptr_inputs_117, ptr_arraydecay11_1_indices.begin(), ptr_arraydecay11_1_indices.end(), "arraydecay11.1", label_from_consts_exit_124);
-    StoreInst* void_164 = new StoreInst(ptr_arraydecay11_1, ptr_tmp8_156, false, label_from_consts_exit_124);
+    Instruction* ptr_arraydecay11_1 = new GetElementPtrInst(ptr_inputs_106, ptr_arraydecay11_1_indices.begin(), ptr_arraydecay11_1_indices.end(), "arraydecay11.1", label_from_consts_exit_113);
+    StoreInst* void_151 = new StoreInst(ptr_arraydecay11_1, ptr_tmp8, false, label_from_consts_exit_113);
     std::vector<Value*> ptr_arraydecay16_1_indices;
     ptr_arraydecay16_1_indices.push_back(const_int32_31);
     ptr_arraydecay16_1_indices.push_back(const_int32_29);
-    Instruction* ptr_arraydecay16_1 = new GetElementPtrInst(ptr_results_116, ptr_arraydecay16_1_indices.begin(), ptr_arraydecay16_1_indices.end(), "arraydecay16.1", label_from_consts_exit_124);
-    StoreInst* void_165 = new StoreInst(ptr_arraydecay16_1, ptr_tmp12_157, false, label_from_consts_exit_124);
-    StoreInst* void_166 = new StoreInst(const_int32_29, ptr_tmp, false, label_from_consts_exit_124);
-    CallInst* void_167 = new CallInst(func_execute_shader, ptr_args_127, "", label_from_consts_exit_124);
-    void_167->setCallingConv(CallingConv::C);
-    void_167->setTailCall(false);const ParamAttrsList *void_167_PAL = 0;
-    void_167->setParamAttrs(void_167_PAL);
-    
-    LoadInst* int32_tmp23_1 = new LoadInst(ptr_tmp, "tmp23.1", false, label_from_consts_exit_124);
-    BinaryOperator* int32_shl_1 = BinaryOperator::create(Instruction::Shl, int32_tmp23_1, const_int32_31, "shl.1", label_from_consts_exit_124);
-    BinaryOperator* int32_or_1 = BinaryOperator::create(Instruction::Or, int32_shl_1, int32_tmp23, "or.1", label_from_consts_exit_124);
+    Instruction* ptr_arraydecay16_1 = new GetElementPtrInst(ptr_results_105, ptr_arraydecay16_1_indices.begin(), ptr_arraydecay16_1_indices.end(), "arraydecay16.1", label_from_consts_exit_113);
+    StoreInst* void_152 = new StoreInst(ptr_arraydecay16_1, ptr_tmp12, false, label_from_consts_exit_113);
+    StoreInst* void_153 = new StoreInst(const_int32_29, ptr_tmp_117, false, label_from_consts_exit_113);
+    CallInst* void_154 = new CallInst(func_execute_shader, ptr_args_116, "", label_from_consts_exit_113);
+    void_154->setCallingConv(CallingConv::C);
+    void_154->setTailCall(false);const ParamAttrsList *void_154_PAL = 0;
+    void_154->setParamAttrs(void_154_PAL);
+    
+    LoadInst* int32_tmp23_1 = new LoadInst(ptr_tmp_117, "tmp23.1", false, label_from_consts_exit_113);
+    BinaryOperator* int32_shl_1 = BinaryOperator::create(Instruction::Shl, int32_tmp23_1, const_int32_31, "shl.1", label_from_consts_exit_113);
+    BinaryOperator* int32_or_1 = BinaryOperator::create(Instruction::Or, int32_shl_1, int32_tmp23, "or.1", label_from_consts_exit_113);
     std::vector<Value*> ptr_arraydecay11_2_indices;
     ptr_arraydecay11_2_indices.push_back(const_int32_33);
     ptr_arraydecay11_2_indices.push_back(const_int32_29);
-    Instruction* ptr_arraydecay11_2 = new GetElementPtrInst(ptr_inputs_117, ptr_arraydecay11_2_indices.begin(), ptr_arraydecay11_2_indices.end(), "arraydecay11.2", label_from_consts_exit_124);
-    StoreInst* void_168 = new StoreInst(ptr_arraydecay11_2, ptr_tmp8_156, false, label_from_consts_exit_124);
+    Instruction* ptr_arraydecay11_2 = new GetElementPtrInst(ptr_inputs_106, ptr_arraydecay11_2_indices.begin(), ptr_arraydecay11_2_indices.end(), "arraydecay11.2", label_from_consts_exit_113);
+    StoreInst* void_155 = new StoreInst(ptr_arraydecay11_2, ptr_tmp8, false, label_from_consts_exit_113);
     std::vector<Value*> ptr_arraydecay16_2_indices;
     ptr_arraydecay16_2_indices.push_back(const_int32_33);
     ptr_arraydecay16_2_indices.push_back(const_int32_29);
-    Instruction* ptr_arraydecay16_2 = new GetElementPtrInst(ptr_results_116, ptr_arraydecay16_2_indices.begin(), ptr_arraydecay16_2_indices.end(), "arraydecay16.2", label_from_consts_exit_124);
-    StoreInst* void_169 = new StoreInst(ptr_arraydecay16_2, ptr_tmp12_157, false, label_from_consts_exit_124);
-    StoreInst* void_170 = new StoreInst(const_int32_29, ptr_tmp, false, label_from_consts_exit_124);
-    CallInst* void_171 = new CallInst(func_execute_shader, ptr_args_127, "", label_from_consts_exit_124);
-    void_171->setCallingConv(CallingConv::C);
-    void_171->setTailCall(false);const ParamAttrsList *void_171_PAL = 0;
-    void_171->setParamAttrs(void_171_PAL);
-    
-    LoadInst* int32_tmp23_2 = new LoadInst(ptr_tmp, "tmp23.2", false, label_from_consts_exit_124);
-    BinaryOperator* int32_shl_2 = BinaryOperator::create(Instruction::Shl, int32_tmp23_2, const_int32_33, "shl.2", label_from_consts_exit_124);
-    BinaryOperator* int32_or_2 = BinaryOperator::create(Instruction::Or, int32_shl_2, int32_or_1, "or.2", label_from_consts_exit_124);
+    Instruction* ptr_arraydecay16_2 = new GetElementPtrInst(ptr_results_105, ptr_arraydecay16_2_indices.begin(), ptr_arraydecay16_2_indices.end(), "arraydecay16.2", label_from_consts_exit_113);
+    StoreInst* void_156 = new StoreInst(ptr_arraydecay16_2, ptr_tmp12, false, label_from_consts_exit_113);
+    StoreInst* void_157 = new StoreInst(const_int32_29, ptr_tmp_117, false, label_from_consts_exit_113);
+    CallInst* void_158 = new CallInst(func_execute_shader, ptr_args_116, "", label_from_consts_exit_113);
+    void_158->setCallingConv(CallingConv::C);
+    void_158->setTailCall(false);const ParamAttrsList *void_158_PAL = 0;
+    void_158->setParamAttrs(void_158_PAL);
+    
+    LoadInst* int32_tmp23_2 = new LoadInst(ptr_tmp_117, "tmp23.2", false, label_from_consts_exit_113);
+    BinaryOperator* int32_shl_2 = BinaryOperator::create(Instruction::Shl, int32_tmp23_2, const_int32_33, "shl.2", label_from_consts_exit_113);
+    BinaryOperator* int32_or_2 = BinaryOperator::create(Instruction::Or, int32_shl_2, int32_or_1, "or.2", label_from_consts_exit_113);
     std::vector<Value*> ptr_arraydecay11_3_indices;
     ptr_arraydecay11_3_indices.push_back(const_int32_34);
     ptr_arraydecay11_3_indices.push_back(const_int32_29);
-    Instruction* ptr_arraydecay11_3 = new GetElementPtrInst(ptr_inputs_117, ptr_arraydecay11_3_indices.begin(), ptr_arraydecay11_3_indices.end(), "arraydecay11.3", label_from_consts_exit_124);
-    StoreInst* void_172 = new StoreInst(ptr_arraydecay11_3, ptr_tmp8_156, false, label_from_consts_exit_124);
+    Instruction* ptr_arraydecay11_3 = new GetElementPtrInst(ptr_inputs_106, ptr_arraydecay11_3_indices.begin(), ptr_arraydecay11_3_indices.end(), "arraydecay11.3", label_from_consts_exit_113);
+    StoreInst* void_159 = new StoreInst(ptr_arraydecay11_3, ptr_tmp8, false, label_from_consts_exit_113);
     std::vector<Value*> ptr_arraydecay16_3_indices;
     ptr_arraydecay16_3_indices.push_back(const_int32_34);
     ptr_arraydecay16_3_indices.push_back(const_int32_29);
-    Instruction* ptr_arraydecay16_3 = new GetElementPtrInst(ptr_results_116, ptr_arraydecay16_3_indices.begin(), ptr_arraydecay16_3_indices.end(), "arraydecay16.3", label_from_consts_exit_124);
-    StoreInst* void_173 = new StoreInst(ptr_arraydecay16_3, ptr_tmp12_157, false, label_from_consts_exit_124);
-    StoreInst* void_174 = new StoreInst(const_int32_29, ptr_tmp, false, label_from_consts_exit_124);
-    CallInst* void_175 = new CallInst(func_execute_shader, ptr_args_127, "", label_from_consts_exit_124);
-    void_175->setCallingConv(CallingConv::C);
-    void_175->setTailCall(false);const ParamAttrsList *void_175_PAL = 0;
-    void_175->setParamAttrs(void_175_PAL);
-    
-    LoadInst* int32_tmp23_3 = new LoadInst(ptr_tmp, "tmp23.3", false, label_from_consts_exit_124);
-    BinaryOperator* int32_shl_3 = BinaryOperator::create(Instruction::Shl, int32_tmp23_3, const_int32_34, "shl.3", label_from_consts_exit_124);
-    BinaryOperator* int32_or_3 = BinaryOperator::create(Instruction::Or, int32_shl_3, int32_or_2, "or.3", label_from_consts_exit_124);
-    BinaryOperator* int32_neg = BinaryOperator::create(Instruction::Xor, int32_or_3, const_int32_30, "neg", label_from_consts_exit_124);
-    new ReturnInst(int32_neg, label_from_consts_exit_124);
+    Instruction* ptr_arraydecay16_3 = new GetElementPtrInst(ptr_results_105, ptr_arraydecay16_3_indices.begin(), ptr_arraydecay16_3_indices.end(), "arraydecay16.3", label_from_consts_exit_113);
+    StoreInst* void_160 = new StoreInst(ptr_arraydecay16_3, ptr_tmp12, false, label_from_consts_exit_113);
+    StoreInst* void_161 = new StoreInst(const_int32_29, ptr_tmp_117, false, label_from_consts_exit_113);
+    CallInst* void_162 = new CallInst(func_execute_shader, ptr_args_116, "", label_from_consts_exit_113);
+    void_162->setCallingConv(CallingConv::C);
+    void_162->setTailCall(false);const ParamAttrsList *void_162_PAL = 0;
+    void_162->setParamAttrs(void_162_PAL);
+    
+    LoadInst* int32_tmp23_3 = new LoadInst(ptr_tmp_117, "tmp23.3", false, label_from_consts_exit_113);
+    BinaryOperator* int32_shl_3 = BinaryOperator::create(Instruction::Shl, int32_tmp23_3, const_int32_34, "shl.3", label_from_consts_exit_113);
+    BinaryOperator* int32_or_3 = BinaryOperator::create(Instruction::Or, int32_shl_3, int32_or_2, "or.3", label_from_consts_exit_113);
+    BinaryOperator* int32_neg = BinaryOperator::create(Instruction::Xor, int32_or_3, const_int32_30, "neg", label_from_consts_exit_113);
+    new ReturnInst(int32_neg, label_from_consts_exit_113);
     
     // Resolve Forward References
-    fwdref_138->replaceAllUsesWith(packed_tmp31_i_150); delete fwdref_138;
-    fwdref_136->replaceAllUsesWith(int32_indvar_next7); delete fwdref_136;
+    fwdref_128->replaceAllUsesWith(packed_tmp31_i_140); delete fwdref_128;
+    fwdref_126->replaceAllUsesWith(int32_indvar_next7); delete fwdref_126;
     
   }
   
diff --git a/src/mesa/pipe/llvm/llvm_entry.c b/src/mesa/pipe/llvm/llvm_entry.c
index c3b34584e1..fa50b60e66 100644
--- a/src/mesa/pipe/llvm/llvm_entry.c
+++ b/src/mesa/pipe/llvm/llvm_entry.c
@@ -86,8 +86,8 @@ struct ShaderInput
 
 extern void execute_shader(struct ShaderInput *input);
 
-void run_vertex_shader(float4 (*inputs)[16],
-                       float4 (*results)[16],
+void run_vertex_shader(void *inputs,
+                       void *results,
                        float (*aconsts)[4],
                        int num_vertices,
                        int num_inputs,
@@ -98,16 +98,16 @@ void run_vertex_shader(float4 (*inputs)[16],
    float4  temps[128];//MAX_PROGRAM_TEMPS
 
    struct ShaderInput args;
+   args.dests  = results;
+   args.inputs = inputs;
+
    /*printf("XXX LLVM run_vertex_shader vertices = %d, inputs = %d, attribs = %d, consts = %d\n",
      num_vertices, num_inputs, num_attribs, num_consts);*/
    from_consts(consts, aconsts, num_consts);
    args.consts = consts;
    args.temps = temps;
-   for (int i = 0; i < num_vertices; ++i) {
-      args.dests  = results[i];
-      args.inputs = inputs[i];
-      execute_shader(&args);
-   }
+
+   execute_shader(&args);
 }
 
 
diff --git a/src/mesa/pipe/llvm/tgsitollvm.cpp b/src/mesa/pipe/llvm/tgsitollvm.cpp
new file mode 100644
index 0000000000..eb9e1196f1
--- /dev/null
+++ b/src/mesa/pipe/llvm/tgsitollvm.cpp
@@ -0,0 +1,682 @@
+#include "tgsitollvm.h"
+
+#include "gallivm.h"
+#include "gallivm_p.h"
+
+#include "storage.h"
+#include "instructions.h"
+
+#include "pipe/p_shader_tokens.h"
+
+#include "pipe/tgsi/util/tgsi_parse.h"
+#include "pipe/tgsi/exec/tgsi_exec.h"
+#include "pipe/tgsi/util/tgsi_util.h"
+#include "pipe/tgsi/util/tgsi_build.h"
+#include "pipe/tgsi/util/tgsi_dump.h"
+
+
+#include <llvm/Module.h>
+#include <llvm/CallingConv.h>
+#include <llvm/Constants.h>
+#include <llvm/DerivedTypes.h>
+#include <llvm/Instructions.h>
+#include <llvm/ModuleProvider.h>
+#include <llvm/Pass.h>
+#include <llvm/PassManager.h>
+#include <llvm/ParameterAttributes.h>
+#include <llvm/Support/PatternMatch.h>
+#include <llvm/ExecutionEngine/JIT.h>
+#include <llvm/ExecutionEngine/Interpreter.h>
+#include <llvm/ExecutionEngine/GenericValue.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/LinkAllPasses.h>
+#include <llvm/Analysis/Verifier.h>
+#include <llvm/Analysis/LoopPass.h>
+#include <llvm/Target/TargetData.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+#include <llvm/Transforms/Utils/Cloning.h>
+
+
+#include <sstream>
+#include <fstream>
+#include <iostream>
+
+using namespace llvm;
+#include "llvm_base_shader.cpp"
+
+static inline void
+add_interpolator(struct gallivm_ir *ir,
+                 struct gallivm_interpolate *interp)
+{
+   /* Copy *interp into the next free slot; NOTE(review): no capacity check. */
+   ir->interpolators[ir->num_interp++] = *interp;
+}
+
+static void
+translate_declaration(struct gallivm_ir *prog,
+                      llvm::Module *module,
+                      Storage *storage,
+                      struct tgsi_full_declaration *decl,
+                      struct tgsi_full_declaration *fd)
+{
+   unsigned reg_first, reg_last, usage, reg, chan;
+   uint method;
+   /* Records one interpolator per enabled channel of each declared input
+    * register.  Declarations for other register files are ignored. */
+   if (decl->Declaration.File != TGSI_FILE_INPUT)
+      return;
+
+   assert(decl->Declaration.Declare == TGSI_DECLARE_RANGE);
+   reg_first = decl->u.DeclarationRange.First;
+   reg_last = decl->u.DeclarationRange.Last;
+   usage = decl->Declaration.UsageMask;
+
+   /* Do not touch WPOS.xy: strip x/y when the range starts at register 0. */
+   if (reg_first == 0) {
+      usage &= ~TGSI_WRITEMASK_XY;
+      if (usage == TGSI_WRITEMASK_NONE) {
+         /* No channel is left enabled; drop register 0 from the range. */
+         ++reg_first;
+         if (reg_first > reg_last)
+            return;
+      }
+   }
+
+   method = decl->Interpolation.Interpolate;
+
+   if (usage == TGSI_WRITEMASK_XYZW) {
+      /* Full mask: register-major order (every channel of i, then i + 1). */
+      for (reg = reg_first; reg <= reg_last; ++reg) {
+         for (chan = 0; chan < NUM_CHANNELS; ++chan) {
+            struct gallivm_interpolate entry;
+            entry.type = method;
+            entry.attrib = reg;
+            entry.chan = chan;
+            add_interpolator(prog, &entry);
+         }
+      }
+      return;
+   }
+
+   /* Partial mask: channel-major order, enabled channels only. */
+   for (chan = 0; chan < NUM_CHANNELS; ++chan) {
+      if (!(usage & (1 << chan)))
+         continue;
+      for (reg = reg_first; reg <= reg_last; ++reg) {
+         struct gallivm_interpolate entry;
+         entry.type = method;
+         entry.attrib = reg;
+         entry.chan = chan;
+         add_interpolator(prog, &entry);
+      }
+   }
+}
+
+
+static void
+translate_immediate(Storage *storage,
+                    struct tgsi_full_immediate *imm)
+{
+   /* Copies the immediate's float values into a 4-wide vector and hands it to storage. */
+   float vec[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; /* lanes the token does not supply stay zero */
+   for (int i = 0; i < 4 && i < imm->Immediate.Size - 1; ++i) { /* never write past vec */
+      switch( imm->Immediate.DataType ) {
+      case TGSI_IMM_FLOAT32:
+         vec[i] = imm->u.ImmediateFloat32[i].Float;
+         break;
+      default:
+         assert( 0 );
+      }
+   }
+   storage->addImmediate(vec);
+}
+
+static inline llvm::Value *
+swizzleVector(llvm::Value *val, struct tgsi_full_src_register *src,
+              Storage *storage)
+{
+   /* Pack the four per-channel selectors into one decimal number, first
+    * channel in the most significant digit, and compare with identity XYZW. */
+   const int identity = ((TGSI_SWIZZLE_X * 10 + TGSI_SWIZZLE_Y) * 10 +
+                         TGSI_SWIZZLE_Z) * 10 + TGSI_SWIZZLE_W;
+   int packed = 0;
+
+   for (int chan = 0; chan < 4; ++chan)
+      packed = packed * 10 + tgsi_util_get_full_src_register_extswizzle(src, chan);
+
+   /* Identity swizzle: hand the value back untouched. */
+   if (packed == identity)
+      return val;
+   return storage->shuffleVector(val, packed);
+}
+
+static void
+translate_instruction(llvm::Module *module,
+                      Storage *storage,
+                      Instructions *instr,
+                      struct tgsi_full_instruction *inst,
+                      struct tgsi_full_instruction *fi,
+                      unsigned instno)
+{
+   /* Translates one TGSI instruction: fetches and swizzles each source
+    * operand, emits the opcode through instr, then writes the result to
+    * every destination register.  module and fi are not used here.
+    * Flow-control opcodes only update builder state and return early. */
+   llvm::Value *inputs[4] = { 0, 0, 0, 0 };
+
+   for (int i = 0; i < inst->Instruction.NumSrcRegs; ++i) {
+      struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i];
+      llvm::Value *val = 0;
+      llvm::Value *indIdx = 0;
+
+      if (src->SrcRegister.Indirect) {
+         indIdx = storage->addrElement(src->SrcRegisterInd.Index);
+         indIdx = storage->extractIndex(indIdx);
+      }
+      if (src->SrcRegister.File == TGSI_FILE_CONSTANT) {
+         val = storage->constElement(src->SrcRegister.Index, indIdx);
+      } else if (src->SrcRegister.File == TGSI_FILE_INPUT) {
+         val = storage->inputElement(src->SrcRegister.Index, indIdx);
+      } else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+         val = storage->tempElement(src->SrcRegister.Index);
+      } else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) {
+         val = storage->outputElement(src->SrcRegister.Index, indIdx);
+      } else if (src->SrcRegister.File == TGSI_FILE_IMMEDIATE) {
+         val = storage->immediateElement(src->SrcRegister.Index);
+      } else {
+         fprintf(stderr, "ERROR: not supported llvm source %d\n", src->SrcRegister.File);
+         return;
+      }
+
+      inputs[i] = swizzleVector(val, src, storage);
+   }
+
+   /*if (inputs[0])
+     instr->printVector(inputs[0]);
+     if (inputs[1])
+     instr->printVector(inputs[1]);*/
+   llvm::Value *out = 0;
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL: {
+      out = instr->arl(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_MOV: {
+      out = inputs[0];
+   }
+      break;
+   case TGSI_OPCODE_LIT: {
+      out = instr->lit(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_RCP: {
+      out = instr->rcp(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_RSQ: {
+      out = instr->rsq(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_EXP: /* bare cases leave out == 0 and hit the unsupported-opcode report below */
+      break;
+   case TGSI_OPCODE_LOG:
+      break;
+   case TGSI_OPCODE_MUL: {
+      out = instr->mul(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_ADD: {
+      out = instr->add(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_DP3: {
+      out = instr->dp3(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_DP4: {
+      out = instr->dp4(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_DST: {
+      out = instr->dst(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_MIN: {
+      out = instr->min(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_MAX: {
+      out = instr->max(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_SLT: {
+      out = instr->slt(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_SGE: {
+      out = instr->sge(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_MAD: {
+      out = instr->madd(inputs[0], inputs[1], inputs[2]);
+   }
+      break;
+   case TGSI_OPCODE_SUB: {
+      out = instr->sub(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_LERP: {
+      out = instr->lerp(inputs[0], inputs[1], inputs[2]);
+   }
+      break;
+   case TGSI_OPCODE_CND:
+      break;
+   case TGSI_OPCODE_CND0:
+      break;
+   case TGSI_OPCODE_DOT2ADD:
+      break;
+   case TGSI_OPCODE_INDEX:
+      break;
+   case TGSI_OPCODE_NEGATE:
+      break;
+   case TGSI_OPCODE_FRAC: {
+      out = instr->frc(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_CLAMP:
+      break;
+   case TGSI_OPCODE_FLOOR: {
+      out = instr->floor(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_ROUND:
+      break;
+   case TGSI_OPCODE_EXPBASE2: {
+      out = instr->ex2(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_LOGBASE2: {
+      out = instr->lg2(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_POWER: {
+      out = instr->pow(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_CROSSPRODUCT: {
+      out = instr->cross(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_MULTIPLYMATRIX:
+      break;
+   case TGSI_OPCODE_ABS: {
+      out = instr->abs(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_RCC:
+      break;
+   case TGSI_OPCODE_DPH: {
+      out = instr->dph(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_COS: {
+      out = instr->cos(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_DDX:
+      break;
+   case TGSI_OPCODE_DDY:
+      break;
+   case TGSI_OPCODE_KILP: {
+      out = instr->kilp(inputs[0]);
+      storage->setKilElement(out);
+      return;
+   }
+      break;
+   case TGSI_OPCODE_PK2H:
+      break;
+   case TGSI_OPCODE_PK2US:
+      break;
+   case TGSI_OPCODE_PK4B:
+      break;
+   case TGSI_OPCODE_PK4UB:
+      break;
+   case TGSI_OPCODE_RFL:
+      break;
+   case TGSI_OPCODE_SEQ:
+      break;
+   case TGSI_OPCODE_SFL:
+      break;
+   case TGSI_OPCODE_SGT: {
+      out = instr->sgt(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_SIN: {
+      out = instr->sin(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_SLE:
+      break;
+   case TGSI_OPCODE_SNE:
+      break;
+   case TGSI_OPCODE_STR:
+      break;
+   case TGSI_OPCODE_TEX:
+      break;
+   case TGSI_OPCODE_TXD:
+      break;
+   case TGSI_OPCODE_UP2H:
+      break;
+   case TGSI_OPCODE_UP2US:
+      break;
+   case TGSI_OPCODE_UP4B:
+      break;
+   case TGSI_OPCODE_UP4UB:
+      break;
+   case TGSI_OPCODE_X2D:
+      break;
+   case TGSI_OPCODE_ARA:
+      break;
+   case TGSI_OPCODE_ARR:
+      break;
+   case TGSI_OPCODE_BRA:
+      break;
+   case TGSI_OPCODE_CAL: {
+      instr->cal(inst->InstructionExtLabel.Label, storage->inputPtr());
+      return;
+   }
+      break;
+   case TGSI_OPCODE_RET: {
+      instr->end();
+      return;
+   }
+      break;
+   case TGSI_OPCODE_SSG:
+      break;
+   case TGSI_OPCODE_CMP: {
+      out = instr->cmp(inputs[0], inputs[1], inputs[2]);
+   }
+      break;
+   case TGSI_OPCODE_SCS: {
+      out = instr->scs(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_TXB:
+      break;
+   case TGSI_OPCODE_NRM:
+      break;
+   case TGSI_OPCODE_DIV:
+      break;
+   case TGSI_OPCODE_DP2:
+      break;
+   case TGSI_OPCODE_TXL:
+      break;
+   case TGSI_OPCODE_BRK: {
+      instr->brk();
+      return;
+   }
+      break;
+   case TGSI_OPCODE_IF: {
+      instr->ifop(inputs[0]);
+      storage->setCurrentBlock(instr->currentBlock());
+      return;  //just update the state
+   }
+      break;
+   case TGSI_OPCODE_LOOP:
+      break;
+   case TGSI_OPCODE_REP:
+      break;
+   case TGSI_OPCODE_ELSE: {
+      instr->elseop();
+      storage->setCurrentBlock(instr->currentBlock());
+      return; //only state update
+   }
+      break;
+   case TGSI_OPCODE_ENDIF: {
+      instr->endif();
+      storage->setCurrentBlock(instr->currentBlock());
+      return; //just update the state
+   }
+      break;
+   case TGSI_OPCODE_ENDLOOP:
+      break;
+   case TGSI_OPCODE_ENDREP:
+      break;
+   case TGSI_OPCODE_PUSHA:
+      break;
+   case TGSI_OPCODE_POPA:
+      break;
+   case TGSI_OPCODE_CEIL:
+      break;
+   case TGSI_OPCODE_I2F:
+      break;
+   case TGSI_OPCODE_NOT:
+      break;
+   case TGSI_OPCODE_TRUNC: {
+      out = instr->trunc(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_SHL:
+      break;
+   case TGSI_OPCODE_SHR:
+      break;
+   case TGSI_OPCODE_AND:
+      break;
+   case TGSI_OPCODE_OR:
+      break;
+   case TGSI_OPCODE_MOD:
+      break;
+   case TGSI_OPCODE_XOR:
+      break;
+   case TGSI_OPCODE_SAD:
+      break;
+   case TGSI_OPCODE_TXF:
+      break;
+   case TGSI_OPCODE_TXQ:
+      break;
+   case TGSI_OPCODE_CONT:
+      break;
+   case TGSI_OPCODE_EMIT:
+      break;
+   case TGSI_OPCODE_ENDPRIM:
+      break;
+   case TGSI_OPCODE_BGNLOOP2: {
+      instr->beginLoop();
+      storage->setCurrentBlock(instr->currentBlock());
+      return;
+   }
+      break;
+   case TGSI_OPCODE_BGNSUB: {
+      instr->bgnSub(instno);
+      storage->setCurrentBlock(instr->currentBlock());
+      storage->pushTemps();
+      return;
+   }
+      break;
+   case TGSI_OPCODE_ENDLOOP2: {
+      instr->endLoop();
+      storage->setCurrentBlock(instr->currentBlock());
+      return;
+   }
+      break;
+   case TGSI_OPCODE_ENDSUB: {
+      instr->endSub();
+      storage->setCurrentBlock(instr->currentBlock());
+      storage->popArguments();
+      storage->popTemps();
+      return;
+   }
+      break;
+   case TGSI_OPCODE_NOISE1:
+      break;
+   case TGSI_OPCODE_NOISE2:
+      break;
+   case TGSI_OPCODE_NOISE3:
+      break;
+   case TGSI_OPCODE_NOISE4:
+      break;
+   case TGSI_OPCODE_NOP:
+      break;
+   case TGSI_OPCODE_TEXBEM:
+      break;
+   case TGSI_OPCODE_TEXBEML:
+      break;
+   case TGSI_OPCODE_TEXREG2AR:
+      break;
+   case TGSI_OPCODE_TEXM3X2PAD:
+      break;
+   case TGSI_OPCODE_TEXM3X2TEX:
+      break;
+   case TGSI_OPCODE_TEXM3X3PAD:
+      break;
+   case TGSI_OPCODE_TEXM3X3TEX:
+      break;
+   case TGSI_OPCODE_TEXM3X3SPEC:
+      break;
+   case TGSI_OPCODE_TEXM3X3VSPEC:
+      break;
+   case TGSI_OPCODE_TEXREG2GB:
+      break;
+   case TGSI_OPCODE_TEXREG2RGB:
+      break;
+   case TGSI_OPCODE_TEXDP3TEX:
+      break;
+   case TGSI_OPCODE_TEXDP3:
+      break;
+   case TGSI_OPCODE_TEXM3X3:
+      break;
+   case TGSI_OPCODE_TEXM3X2DEPTH:
+      break;
+   case TGSI_OPCODE_TEXDEPTH:
+      break;
+   case TGSI_OPCODE_BEM:
+      break;
+   case TGSI_OPCODE_M4X3:
+      break;
+   case TGSI_OPCODE_M3X4:
+      break;
+   case TGSI_OPCODE_M3X3:
+      break;
+   case TGSI_OPCODE_M3X2:
+      break;
+   case TGSI_OPCODE_NRM4:
+      break;
+   case TGSI_OPCODE_CALLNZ:
+      break;
+   case TGSI_OPCODE_IFC:
+      break;
+   case TGSI_OPCODE_BREAKC:
+      break;
+   case TGSI_OPCODE_KIL:
+      break;
+   case TGSI_OPCODE_END:
+      instr->end();
+      return;
+      break;
+   default:
+      fprintf(stderr, "ERROR: Unknown opcode %d\n",
+              inst->Instruction.Opcode);
+      assert(0);
+      break;
+   }
+
+   if (!out) {
+      fprintf(stderr, "ERROR: unsupported opcode %d\n", inst->Instruction.Opcode);
+      assert(!"Unsupported opcode");
+      return; /* with asserts compiled out, never pass a null value to the stores below */
+   }
+
+   /* # not sure if we need this */
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+   case TGSI_SAT_ZERO_ONE:
+      /*TXT( "_SAT" );*/
+      break;
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      /*TXT( "_SAT[-1,1]" );*/
+      break;
+   default:
+      assert( 0 );
+   }
+
+   /* store results  */
+   for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
+      struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
+
+      if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
+         storage->setOutputElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+      } else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) {
+         storage->setTempElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+      } else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) {
+         storage->setAddrElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+      } else {
+         fprintf(stderr, "ERROR: unsupported LLVM destination!");
+         assert(!"wrong destination");
+      }
+   }
+}
+
+llvm::Module *
+tgsi_to_llvm(struct gallivm_ir *ir, const struct tgsi_token *tokens)
+{
+   /*
+    * Starts from the module returned by createBaseShader(), renames its
+    * "execute_shader" function to "vs_shader<id>" or "fs_shader<id>",
+    * then translates every TGSI token into a new "entry" block of that
+    * function.  ir->num_consts is set from storage.numConsts() and the
+    * module is returned.
+    */
+   llvm::Module *mod = createBaseShader();
+   Function *shader = mod->getFunction("execute_shader");
+   struct tgsi_parse_context parse;
+   struct tgsi_full_instruction fi;
+   struct tgsi_full_declaration fd;
+   unsigned instno = 0;
+
+   /* Function name = stage prefix followed by the IR's id. */
+   std::ostringstream name;
+   name << (ir->type == GALLIVM_VS ? "vs_shader" : "fs_shader") << ir->id;
+   shader->setName(name.str().c_str());
+
+   /* The function's first argument is named "input" and given to Storage. */
+   Function::arg_iterator first_arg = shader->arg_begin();
+   Value *input_arg = first_arg;
+   input_arg->setName("input");
+
+   BasicBlock *entry = new BasicBlock("entry", shader, 0);
+
+   tgsi_parse_init(&parse, tokens);
+   fi = tgsi_default_full_instruction();
+   fd = tgsi_default_full_declaration();
+
+   Storage storage(entry, input_arg);
+   Instructions instr(mod, shader, entry, &storage);
+
+   while (!tgsi_parse_end_of_tokens(&parse)) {
+      tgsi_parse_token(&parse);
+      const unsigned kind = parse.FullToken.Token.Type;
+
+      if (kind == TGSI_TOKEN_TYPE_DECLARATION) {
+         translate_declaration(ir, mod, &storage,
+                               &parse.FullToken.FullDeclaration,
+                               &fd);
+      } else if (kind == TGSI_TOKEN_TYPE_IMMEDIATE) {
+         translate_immediate(&storage,
+                             &parse.FullToken.FullImmediate);
+      } else if (kind == TGSI_TOKEN_TYPE_INSTRUCTION) {
+         /* instno counts instruction tokens only. */
+         translate_instruction(mod, &storage, &instr,
+                               &parse.FullToken.FullInstruction,
+                               &fi, instno);
+         ++instno;
+      } else {
+         assert(0);
+      }
+   }
+
+   tgsi_parse_free(&parse);
+
+   ir->num_consts = storage.numConsts();
+   return mod;
+}
diff --git a/src/mesa/pipe/llvm/tgsitollvm.h b/src/mesa/pipe/llvm/tgsitollvm.h
new file mode 100644
index 0000000000..073ffb5749
--- /dev/null
+++ b/src/mesa/pipe/llvm/tgsitollvm.h
@@ -0,0 +1,16 @@
+#ifndef TGSITOLLVM_H
+#define TGSITOLLVM_H
+
+
+namespace llvm {
+   class Module;
+}
+
+struct gallivm_ir;
+struct tgsi_token;
+
+
+llvm::Module * tgsi_to_llvm(struct gallivm_ir *ir,
+                            const struct tgsi_token *tokens);
+
+#endif
diff --git a/src/mesa/pipe/softpipe/sp_state_fs.c b/src/mesa/pipe/softpipe/sp_state_fs.c
index 1430be7869..598a70c827 100644
--- a/src/mesa/pipe/softpipe/sp_state_fs.c
+++ b/src/mesa/pipe/softpipe/sp_state_fs.c
@@ -61,7 +61,7 @@ softpipe_create_fs_state(struct pipe_context *pipe,
    }
 
 #ifdef MESA_LLVM
-   state->llvm_prog = gallivm_from_tgsi(state->shader.tokens, GALLIVM_FS);
+   state->llvm_prog = 0;
    if (!gallivm_global_cpu_engine()) {
       gallivm_cpu_engine_create(state->llvm_prog);
    }
-- 
cgit v1.2.3


From 8f15140943a65e3e1488c8b43b2dffb1cd0a299c Mon Sep 17 00:00:00 2001
From: Zack Rusin <zack@tungstengraphics.com>
Date: Tue, 5 Feb 2008 03:09:24 -0500
Subject: rewrite the way cpu engine is handled

---
 src/mesa/pipe/draw/draw_vertex_shader.c |   9 +-
 src/mesa/pipe/llvm/Makefile             |   1 +
 src/mesa/pipe/llvm/gallivm.cpp          | 134 ---------------------
 src/mesa/pipe/llvm/gallivm.h            |  37 +++---
 src/mesa/pipe/llvm/gallivm_cpu.cpp      | 204 ++++++++++++++++++++++++++++++++
 5 files changed, 227 insertions(+), 158 deletions(-)
 create mode 100644 src/mesa/pipe/llvm/gallivm_cpu.cpp

diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index 9567283ff5..574ac67057 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -116,11 +116,10 @@ run_vertex_program(struct draw_context *draw,
 #ifdef MESA_LLVM
    if (1) {
    struct gallivm_prog  *prog  = draw->vertex_shader->llvm_prog;
-   gallivm_prog_exec(prog,
-                     machine->Inputs,
-                     machine->Outputs,
-                     machine->Consts,
-                     12, 12, 12);
+   gallivm_cpu_vs_exec(prog,
+                       machine->Inputs,
+                       machine->Outputs,
+                       machine->Consts);
    } else
 #elif defined(__i386__) || defined(__386__)
    if (draw->use_sse) {
diff --git a/src/mesa/pipe/llvm/Makefile b/src/mesa/pipe/llvm/Makefile
index 10ff7aacae..b333661c6f 100644
--- a/src/mesa/pipe/llvm/Makefile
+++ b/src/mesa/pipe/llvm/Makefile
@@ -7,6 +7,7 @@ LIBNAME = gallivm
 
 GALLIVM_SOURCES = \
         gallivm.cpp  \
+        gallivm_cpu.cpp \
         instructions.cpp  \
         loweringpass.cpp \
         tgsitollvm.cpp \
diff --git a/src/mesa/pipe/llvm/gallivm.cpp b/src/mesa/pipe/llvm/gallivm.cpp
index cf9b0f6406..cb9a88f0b8 100644
--- a/src/mesa/pipe/llvm/gallivm.cpp
+++ b/src/mesa/pipe/llvm/gallivm.cpp
@@ -70,10 +70,6 @@
 #include <fstream>
 #include <iostream>
 
-struct gallivm_cpu_engine {
-   llvm::ExecutionEngine *engine;
-};
-
 static int GLOBAL_ID = 0;
 
 using namespace llvm;
@@ -145,38 +141,6 @@ void gallivm_prog_delete(struct gallivm_prog *prog)
    free(prog);
 }
 
-typedef void (*vertex_shader_runner)(void *ainputs,
-                                     void *dests,
-                                     float (*aconsts)[4],
-                                     int num_vertices,
-                                     int num_inputs,
-                                     int num_attribs,
-                                     int num_consts);
-
-
-/*!
-  This function is used to execute the gallivm_prog in software. Before calling
-  this function the gallivm_prog has to be JIT compiled with the gallivm_cpu_jit_compile
-  function.
- */
-int gallivm_prog_exec(struct gallivm_prog *prog,
-                      struct tgsi_exec_vector       *inputs,
-                      struct tgsi_exec_vector       *dests,
-                      float (*consts)[4],
-                      int num_vertices,
-                      int num_inputs,
-                      int num_attribs)
-{
-   vertex_shader_runner runner = reinterpret_cast<vertex_shader_runner>(prog->function);
-   assert(runner);
-   runner(inputs, dests, consts, num_vertices, num_inputs,
-          num_attribs, prog->num_consts);
-
-   return 0;
-}
-
-
-
 static inline void
 constant_interpolation(float (*inputs)[16][4],
                        const struct tgsi_interp_coef *coefs,
@@ -231,28 +195,6 @@ perspective_interpolation(float (*inputs)[16][4],
    }
 }
 
-typedef int (*fragment_shader_runner)(float x, float y,
-                                      float (*dests)[16][4],
-                                      float (*inputs)[16][4],
-                                      int num_attribs,
-                                      float (*consts)[4], int num_consts,
-                                      struct tgsi_sampler *samplers);
-
-int gallivm_fragment_shader_exec(struct gallivm_prog *prog,
-                                 float fx, float fy,
-                                 float (*dests)[16][4],
-                                 float (*inputs)[16][4],
-                                 float (*consts)[4],
-                                 struct tgsi_sampler *samplers)
-{
-   fragment_shader_runner runner = reinterpret_cast<fragment_shader_runner>(prog->function);
-   assert(runner);
-
-   return runner(fx, fy, dests, inputs, prog->num_interp,
-                 consts, prog->num_consts,
-                 samplers);
-}
-
 void gallivm_ir_dump(struct gallivm_ir *ir, const char *file_prefix)
 {
    if (!ir || !ir->module)
@@ -292,82 +234,6 @@ void gallivm_ir_dump(struct gallivm_ir *ir, const char *file_prefix)
 }
 
 
-static struct gallivm_cpu_engine *CPU = 0;
-
-static inline llvm::Function *func_for_shader(struct gallivm_prog *prog)
-{
-   llvm::Module *mod = prog->module;
-   llvm::Function *func = 0;
-
-   switch (prog->type) {
-   case GALLIVM_VS:
-      func = mod->getFunction("run_vertex_shader");
-      break;
-   case GALLIVM_FS:
-      func = mod->getFunction("run_fragment_shader");
-      break;
-   default:
-      assert(!"Unknown shader type!");
-      break;
-   }
-   return func;
-}
-
-/*!
-  This function creates a CPU based execution engine for the given gallivm_prog.
-  gallivm_cpu_engine should be used as a singleton throughout the library. Before
-  executing gallivm_prog_exec one needs to call gallivm_cpu_jit_compile.
-  The gallivm_prog instance which is being passed to the constructor is being
-  automatically JIT compiled so one shouldn't call gallivm_cpu_jit_compile
-  with it again.
- */
-struct gallivm_cpu_engine * gallivm_cpu_engine_create(struct gallivm_prog *prog)
-{
-   struct gallivm_cpu_engine *cpu = (struct gallivm_cpu_engine *)
-                                    calloc(1, sizeof(struct gallivm_cpu_engine));
-   llvm::Module *mod = static_cast<llvm::Module*>(prog->module);
-   llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod);
-   llvm::ExecutionEngine *ee = llvm::ExecutionEngine::create(mp, false);
-   ee->DisableLazyCompilation();
-   cpu->engine = ee;
-
-   llvm::Function *func = func_for_shader(prog);
-
-   prog->function = ee->getPointerToFunction(func);
-   CPU = cpu;
-   return cpu;
-}
-
-
-/*!
-  This function JIT compiles the given gallivm_prog with the given cpu based execution engine.
-  The reference to the generated machine code entry point will be stored
-  in the gallivm_prog program. After executing this function one can call gallivm_prog_exec
-  in order to execute the gallivm_prog on the CPU.
- */
-void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *cpu, struct gallivm_prog *prog)
-{
-   llvm::Module *mod = static_cast<llvm::Module*>(prog->module);
-   llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod);
-   llvm::ExecutionEngine *ee = cpu->engine;
-   assert(ee);
-   ee->DisableLazyCompilation();
-   ee->addModuleProvider(mp);
-
-   llvm::Function *func = func_for_shader(prog);
-   prog->function = ee->getPointerToFunction(func);
-}
-
-void gallivm_cpu_engine_delete(struct gallivm_cpu_engine *cpu)
-{
-   free(cpu);
-}
-
-struct gallivm_cpu_engine * gallivm_global_cpu_engine()
-{
-   return CPU;
-}
-
 void gallivm_prog_inputs_interpolate(struct gallivm_prog *prog,
                                      float (*inputs)[16][4],
                                      const struct tgsi_interp_coef *coef)
diff --git a/src/mesa/pipe/llvm/gallivm.h b/src/mesa/pipe/llvm/gallivm.h
index f9f5d5ee74..98eda56f81 100644
--- a/src/mesa/pipe/llvm/gallivm.h
+++ b/src/mesa/pipe/llvm/gallivm.h
@@ -61,28 +61,16 @@ enum gallivm_vector_layout {
 };
 
 struct gallivm_ir *gallivm_ir_new(enum gallivm_shader_type type);
-void gallivm_ir_set_layout(struct gallivm_ir *prog,
-                           enum gallivm_vector_layout layout);
-void gallivm_ir_set_components(struct gallivm_ir *prog, int num);
-void gallivm_ir_fill_from_tgsi(struct gallivm_ir *prog,
-                               const struct tgsi_token *tokens);
-void gallivm_ir_delete(struct gallivm_ir *prog);
+void               gallivm_ir_set_layout(struct gallivm_ir *prog,
+                                         enum gallivm_vector_layout layout);
+void               gallivm_ir_set_components(struct gallivm_ir *prog, int num);
+void               gallivm_ir_fill_from_tgsi(struct gallivm_ir *prog,
+                                             const struct tgsi_token *tokens);
+void               gallivm_ir_delete(struct gallivm_ir *prog);
+
 
 struct gallivm_prog *gallivm_ir_compile(struct gallivm_ir *ir);
 
-int gallivm_prog_exec(struct gallivm_prog *prog,
-                      struct tgsi_exec_vector       *inputs,
-                      struct tgsi_exec_vector       *dests,
-                      float (*consts)[4],
-                      int num_vertices,
-                      int num_inputs,
-                      int num_attribs);
-int gallivm_fragment_shader_exec(struct gallivm_prog *prog,
-                                 float x, float y,
-                                 float (*dests)[PIPE_MAX_SHADER_INPUTS][4],
-                                 float (*inputs)[PIPE_MAX_SHADER_INPUTS][4],
-                                 float (*consts)[4],
-                                 struct tgsi_sampler *samplers);
 void gallivm_prog_inputs_interpolate(struct gallivm_prog *prog,
                                      float (*inputs)[PIPE_MAX_SHADER_INPUTS][4],
                                      const struct tgsi_interp_coef *coefs);
@@ -91,9 +79,20 @@ void gallivm_prog_dump(struct gallivm_prog *prog, const char *file_prefix);
 
 struct gallivm_cpu_engine *gallivm_cpu_engine_create(struct gallivm_prog *prog);
 struct gallivm_cpu_engine *gallivm_global_cpu_engine();
+int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
+                        struct tgsi_exec_vector       *inputs,
+                        struct tgsi_exec_vector       *dests,
+                        float (*consts)[4]);
+int gallivm_cpu_fs_exec(struct gallivm_prog *prog,
+                        float x, float y,
+                        float (*dests)[PIPE_MAX_SHADER_INPUTS][4],
+                        float (*inputs)[PIPE_MAX_SHADER_INPUTS][4],
+                        float (*consts)[4],
+                        struct tgsi_sampler *samplers);
 void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *ee, struct gallivm_prog *prog);
 void gallivm_cpu_engine_delete(struct gallivm_cpu_engine *ee);
 
+
 #endif /* MESA_LLVM */
 
 #if defined __cplusplus
diff --git a/src/mesa/pipe/llvm/gallivm_cpu.cpp b/src/mesa/pipe/llvm/gallivm_cpu.cpp
new file mode 100644
index 0000000000..5f1268bf4f
--- /dev/null
+++ b/src/mesa/pipe/llvm/gallivm_cpu.cpp
@@ -0,0 +1,204 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Zack Rusin zack@tungstengraphics.com
+  */
+#ifdef MESA_LLVM
+
+#include "gallivm.h"
+#include "gallivm_p.h"
+
+#include "instructions.h"
+#include "loweringpass.h"
+#include "storage.h"
+#include "tgsitollvm.h"
+
+#include "pipe/p_context.h"
+#include "pipe/p_shader_tokens.h"
+
+#include "pipe/tgsi/exec/tgsi_exec.h"
+#include "pipe/tgsi/util/tgsi_dump.h"
+
+#include <llvm/Module.h>
+#include <llvm/CallingConv.h>
+#include <llvm/Constants.h>
+#include <llvm/DerivedTypes.h>
+#include <llvm/Instructions.h>
+#include <llvm/ModuleProvider.h>
+#include <llvm/Pass.h>
+#include <llvm/PassManager.h>
+#include <llvm/ParameterAttributes.h>
+#include <llvm/Support/PatternMatch.h>
+#include <llvm/ExecutionEngine/JIT.h>
+#include <llvm/ExecutionEngine/Interpreter.h>
+#include <llvm/ExecutionEngine/GenericValue.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/LinkAllPasses.h>
+#include <llvm/Analysis/Verifier.h>
+#include <llvm/Analysis/LoopPass.h>
+#include <llvm/Target/TargetData.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+#include <llvm/Transforms/Utils/Cloning.h>
+
+#include <sstream>
+#include <fstream>
+#include <iostream>
+
+struct gallivm_cpu_engine {
+   llvm::ExecutionEngine *engine; /* assigned in gallivm_cpu_engine_create(), read in gallivm_cpu_jit_compile() */
+};
+
+static struct gallivm_cpu_engine *CPU = 0; /* last engine built by gallivm_cpu_engine_create(); 0 until then */
+
+typedef int (*fragment_shader_runner)(float x, float y,
+                                      float (*dests)[16][4],
+                                      float (*inputs)[16][4],
+                                      int num_attribs,
+                                      float (*consts)[4], int num_consts,
+                                      struct tgsi_sampler *samplers); /* type prog->function is cast to in gallivm_cpu_fs_exec() */
+
+int gallivm_cpu_fs_exec(struct gallivm_prog *prog,
+                        float fx, float fy,
+                        float (*dests)[16][4], /* NOTE(review): gallivm.h declares these with PIPE_MAX_SHADER_INPUTS, not a literal 16 - confirm they agree */
+                        float (*inputs)[16][4],
+                        float (*consts)[4],
+                        struct tgsi_sampler *samplers)
+{
+   fragment_shader_runner runner = reinterpret_cast<fragment_shader_runner>(prog->function);
+   assert(runner); /* prog->function is assigned by gallivm_cpu_engine_create()/gallivm_cpu_jit_compile() */
+   /* Call the stored entry point; attribute and constant counts come from prog, not from the caller. */
+   return runner(fx, fy, dests, inputs, prog->num_interp,
+                 consts, prog->num_consts,
+                 samplers);
+}
+
+static inline llvm::Function *func_for_shader(struct gallivm_prog *prog)
+{
+   llvm::Module *mod = prog->module;
+   llvm::Function *func = 0; /* stays 0 if prog->type is neither GALLIVM_VS nor GALLIVM_FS */
+   /* Look up the entry point in prog->module by the fixed name used for the shader type. */
+   switch (prog->type) {
+   case GALLIVM_VS:
+      func = mod->getFunction("run_vertex_shader");
+      break;
+   case GALLIVM_FS:
+      func = mod->getFunction("run_fragment_shader");
+      break;
+   default:
+      assert(!"Unknown shader type!");
+      break;
+   }
+   return func; /* NOTE(review): both callers hand this to getPointerToFunction() without a null check */
+}
+
+/*!
+  Creates a CPU based execution engine for the given gallivm_prog and records it
+  as the global engine returned by gallivm_global_cpu_engine(). Before running
+  any other program with gallivm_cpu_vs_exec/gallivm_cpu_fs_exec, JIT compile it
+  with gallivm_cpu_jit_compile. The gallivm_prog passed here is JIT compiled
+  automatically (prog->function is set), so gallivm_cpu_jit_compile should not
+  be called with it again.
+ */
+struct gallivm_cpu_engine * gallivm_cpu_engine_create(struct gallivm_prog *prog)
+{
+   struct gallivm_cpu_engine *cpu = (struct gallivm_cpu_engine *)
+                                    calloc(1, sizeof(struct gallivm_cpu_engine)); /* NOTE(review): result is not checked before cpu->engine is written */
+   llvm::Module *mod = static_cast<llvm::Module*>(prog->module);
+   llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod);
+   llvm::ExecutionEngine *ee = llvm::ExecutionEngine::create(mp, false);
+   ee->DisableLazyCompilation();
+   cpu->engine = ee;
+
+   llvm::Function *func = func_for_shader(prog);
+
+   prog->function = ee->getPointerToFunction(func);
+   CPU = cpu; /* replaces any previously stored engine; the old one is not freed here */
+   return cpu;
+}
+
+
+/*!
+  JIT compiles the given gallivm_prog with the given cpu based execution engine.
+  The pointer to the generated machine code entry point is stored in
+  prog->function. Afterwards gallivm_cpu_vs_exec or gallivm_cpu_fs_exec
+  (matching prog->type) can be called to execute the gallivm_prog on the CPU.
+ */
+void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *cpu, struct gallivm_prog *prog)
+{
+   llvm::Module *mod = static_cast<llvm::Module*>(prog->module);
+   llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod);
+   llvm::ExecutionEngine *ee = cpu->engine; /* cpu itself is dereferenced unchecked; only the engine is asserted */
+   assert(ee);
+   /*FIXME : remove */
+   ee->DisableLazyCompilation();
+   ee->addModuleProvider(mp); /* the program's module is added to the already existing engine */
+
+   llvm::Function *func = func_for_shader(prog);
+   prog->function = ee->getPointerToFunction(func);
+}
+
+void gallivm_cpu_engine_delete(struct gallivm_cpu_engine *cpu)
+{
+   free(cpu); /* frees only the wrapper: cpu->engine is not destroyed and the global CPU pointer is not cleared */
+}
+
+struct gallivm_cpu_engine * gallivm_global_cpu_engine()
+{
+   return CPU; /* 0 until gallivm_cpu_engine_create() has stored an engine */
+}
+
+
+typedef void (*vertex_shader_runner)(void *ainputs,
+                                     void *dests,
+                                     float (*aconsts)[4],
+                                     int num_vertices,
+                                     int num_inputs,
+                                     int num_attribs,
+                                     int num_consts); /* type prog->function is cast to in gallivm_cpu_vs_exec() */
+
+
+/*!
+  Executes a vertex shader gallivm_prog in software. Before calling this
+  function the gallivm_prog has to be JIT compiled, either by
+  gallivm_cpu_jit_compile or by being passed to gallivm_cpu_engine_create.
+ */
+int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
+                        struct tgsi_exec_vector       *inputs,
+                        struct tgsi_exec_vector       *dests,
+                         float (*consts)[4])
+{
+   vertex_shader_runner runner = reinterpret_cast<vertex_shader_runner>(prog->function);
+   assert(runner);
+   /*FIXME: num_vertices, num_inputs and num_attribs are hard-coded to 4 */
+   runner(inputs, dests, consts, 4, 4, 4, prog->num_consts);
+
+   return 0; /* always 0: the runner itself returns void */
+}
+
+#endif
-- 
cgit v1.2.3


From 6f04f0f8aa652595d7c53bda5eeb304145fd02f2 Mon Sep 17 00:00:00 2001
From: Zack Rusin <zack@tungstengraphics.com>
Date: Wed, 6 Feb 2008 06:12:34 -0500
Subject: disable llvm for fragment shaders for now

---
 src/mesa/pipe/softpipe/sp_state_fs.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/mesa/pipe/softpipe/sp_state_fs.c b/src/mesa/pipe/softpipe/sp_state_fs.c
index 598a70c827..0b814fc284 100644
--- a/src/mesa/pipe/softpipe/sp_state_fs.c
+++ b/src/mesa/pipe/softpipe/sp_state_fs.c
@@ -62,11 +62,15 @@ softpipe_create_fs_state(struct pipe_context *pipe,
 
 #ifdef MESA_LLVM
    state->llvm_prog = 0;
+
+#if 0
    if (!gallivm_global_cpu_engine()) {
       gallivm_cpu_engine_create(state->llvm_prog);
    }
    else
       gallivm_cpu_jit_compile(gallivm_global_cpu_engine(), state->llvm_prog);
+#endif
+
 #elif defined(__i386__) || defined(__386__)
    if (softpipe->use_sse) {
       x86_init_func( &state->sse2_program );
-- 
cgit v1.2.3


From 74295558492060694910892d843e89708272a271 Mon Sep 17 00:00:00 2001
From: Zack Rusin <zack@tungstengraphics.com>
Date: Mon, 11 Feb 2008 09:43:59 -0500
Subject: start generating soa type code in llvm paths

---
 src/mesa/pipe/llvm/Makefile            |   4 +-
 src/mesa/pipe/llvm/gallivm.cpp         |   3 +
 src/mesa/pipe/llvm/gallivm.h           |   8 +-
 src/mesa/pipe/llvm/instructionssoa.cpp |  26 ++
 src/mesa/pipe/llvm/instructionssoa.h   |  55 ++++
 src/mesa/pipe/llvm/storagesoa.cpp      | 117 ++++++++
 src/mesa/pipe/llvm/storagesoa.h        |  71 +++++
 src/mesa/pipe/llvm/tgsitollvm.cpp      | 506 ++++++++++++++++++++++++++++++++-
 src/mesa/pipe/llvm/tgsitollvm.h        |   4 +
 9 files changed, 782 insertions(+), 12 deletions(-)
 create mode 100644 src/mesa/pipe/llvm/instructionssoa.cpp
 create mode 100644 src/mesa/pipe/llvm/instructionssoa.h
 create mode 100644 src/mesa/pipe/llvm/storagesoa.cpp
 create mode 100644 src/mesa/pipe/llvm/storagesoa.h

diff --git a/src/mesa/pipe/llvm/Makefile b/src/mesa/pipe/llvm/Makefile
index b333661c6f..a0494ba966 100644
--- a/src/mesa/pipe/llvm/Makefile
+++ b/src/mesa/pipe/llvm/Makefile
@@ -11,7 +11,9 @@ GALLIVM_SOURCES = \
         instructions.cpp  \
         loweringpass.cpp \
         tgsitollvm.cpp \
-        storage.cpp
+        storage.cpp \
+        storagesoa.cpp \
+        instructionssoa.cpp
 
 INC_SOURCES = gallivm_builtins.cpp llvm_base_shader.cpp
 
diff --git a/src/mesa/pipe/llvm/gallivm.cpp b/src/mesa/pipe/llvm/gallivm.cpp
index cb9a88f0b8..b99dc6db5b 100644
--- a/src/mesa/pipe/llvm/gallivm.cpp
+++ b/src/mesa/pipe/llvm/gallivm.cpp
@@ -288,6 +288,9 @@ void gallivm_ir_fill_from_tgsi(struct gallivm_ir *ir,
    std::cout << "Creating llvm from: " <<std::endl;
    tgsi_dump(tokens, 0);
 
+
+   llvm::Module *irmod = tgsi_to_llvmir(ir, tokens);
+
    llvm::Module *mod = tgsi_to_llvm(ir, tokens);
    ir->module = mod;
    gallivm_ir_dump(ir, 0);
diff --git a/src/mesa/pipe/llvm/gallivm.h b/src/mesa/pipe/llvm/gallivm.h
index 98eda56f81..b104520cb7 100644
--- a/src/mesa/pipe/llvm/gallivm.h
+++ b/src/mesa/pipe/llvm/gallivm.h
@@ -61,12 +61,12 @@ enum gallivm_vector_layout {
 };
 
 struct gallivm_ir *gallivm_ir_new(enum gallivm_shader_type type);
-void               gallivm_ir_set_layout(struct gallivm_ir *prog,
+void               gallivm_ir_set_layout(struct gallivm_ir *ir,
                                          enum gallivm_vector_layout layout);
-void               gallivm_ir_set_components(struct gallivm_ir *prog, int num);
-void               gallivm_ir_fill_from_tgsi(struct gallivm_ir *prog,
+void               gallivm_ir_set_components(struct gallivm_ir *ir, int num);
+void               gallivm_ir_fill_from_tgsi(struct gallivm_ir *ir,
                                              const struct tgsi_token *tokens);
-void               gallivm_ir_delete(struct gallivm_ir *prog);
+void               gallivm_ir_delete(struct gallivm_ir *ir);
 
 
 struct gallivm_prog *gallivm_ir_compile(struct gallivm_ir *ir);
diff --git a/src/mesa/pipe/llvm/instructionssoa.cpp b/src/mesa/pipe/llvm/instructionssoa.cpp
new file mode 100644
index 0000000000..9ac4d8fbc7
--- /dev/null
+++ b/src/mesa/pipe/llvm/instructionssoa.cpp
@@ -0,0 +1,26 @@
+#include "instructionssoa.h"
+
+InstructionsSoa::InstructionsSoa(llvm::Module *mod, llvm::Function *func,
+                                 llvm::BasicBlock *block, StorageSoa *storage)
+{ /* stub: none of the arguments are stored or used */
+}
+
+std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   std::vector<llvm::Value*> res(4);
+   /* stub: in1 and in2 are ignored; res holds four null pointers */
+   return res;
+}
+
+std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   std::vector<llvm::Value*> res(4);
+   /* stub: in1 and in2 are ignored; res holds four null pointers */
+   return res;
+}
+
+void InstructionsSoa::end()
+{ /* stub: does nothing */
+}
diff --git a/src/mesa/pipe/llvm/instructionssoa.h b/src/mesa/pipe/llvm/instructionssoa.h
new file mode 100644
index 0000000000..0b6b41cf05
--- /dev/null
+++ b/src/mesa/pipe/llvm/instructionssoa.h
@@ -0,0 +1,55 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef INSTRUCTIONSSOA_H
+#define INSTRUCTIONSSOA_H
+
+#include <vector>
+
+namespace llvm {
+   class Module;
+   class Function;
+   class BasicBlock;
+   class Value;
+}
+class StorageSoa;
+
+class InstructionsSoa
+{ /* Declares no data members; the definitions in instructionssoa.cpp are stubs. */
+public:
+   InstructionsSoa(llvm::Module *mod, llvm::Function *func,
+                   llvm::BasicBlock *block, StorageSoa *storage);
+   /* Both operations take their operand vectors by value and return a new vector. */
+   std::vector<llvm::Value*> add(const std::vector<llvm::Value*> in1,
+                                 const std::vector<llvm::Value*> in2);
+   std::vector<llvm::Value*> mul(const std::vector<llvm::Value*> in1,
+                                 const std::vector<llvm::Value*> in2);
+   void         end();
+};
+
+
+#endif
diff --git a/src/mesa/pipe/llvm/storagesoa.cpp b/src/mesa/pipe/llvm/storagesoa.cpp
new file mode 100644
index 0000000000..b2aca3557a
--- /dev/null
+++ b/src/mesa/pipe/llvm/storagesoa.cpp
@@ -0,0 +1,117 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "storagesoa.h"
+
+
+#include "pipe/p_shader_tokens.h"
+#include <llvm/BasicBlock.h>
+#include <llvm/Module.h>
+#include <llvm/Value.h>
+
+#include <llvm/CallingConv.h>
+#include <llvm/Constants.h>
+#include <llvm/DerivedTypes.h>
+#include <llvm/InstrTypes.h>
+#include <llvm/Instructions.h>
+
+using namespace llvm;
+
+StorageSoa::StorageSoa(llvm::BasicBlock *block,
+                       llvm::Value *input,
+                       llvm::Value *output,
+                       llvm::Value *consts)
+{ /* stub: none of the four arguments are stored or used */
+}
+
+void StorageSoa::addImmediate(float *vec)
+{ /* stub: vec is ignored */
+}
+
+llvm::Value *StorageSoa::addrElement(int idx) const
+{
+   return 0; /* stub: always a null pointer; idx is ignored */
+}
+
+std::vector<llvm::Value*> StorageSoa::inputElement(int idx, int swizzle,
+                                                   llvm::Value *indIdx)
+{
+   std::vector<llvm::Value*> res(4);
+   /* stub: idx, swizzle and indIdx are ignored; res holds four null pointers */
+   return res;
+}
+
+std::vector<llvm::Value*> StorageSoa::constElement(int idx, int swizzle,
+                                                   llvm::Value *indIdx)
+{
+   std::vector<llvm::Value*> res(4);
+   /* stub: idx, swizzle and indIdx are ignored; res holds four null pointers */
+   return res;
+}
+
+std::vector<llvm::Value*> StorageSoa::outputElement(int idx, int swizzle,
+                                                    llvm::Value *indIdx)
+{
+   std::vector<llvm::Value*> res(4);
+   /* stub: idx, swizzle and indIdx are ignored; res holds four null pointers */
+   return res;
+}
+
+std::vector<llvm::Value*> StorageSoa::tempElement(int idx, int swizzle,
+                                                  llvm::Value *indIdx)
+{
+   std::vector<llvm::Value*> res(4);
+   /* stub: idx, swizzle and indIdx are ignored; res holds four null pointers */
+   return res;
+}
+
+std::vector<llvm::Value*> StorageSoa::immediateElement(int idx, int swizzle)
+{
+   std::vector<llvm::Value*> res(4);
+   /* stub: idx and swizzle are ignored; res holds four null pointers */
+   return res;
+}
+
+llvm::Value * StorageSoa::extractIndex(llvm::Value *vec)
+{
+   return 0; /* stub: always a null pointer; vec is ignored */
+}
+
+void StorageSoa::storeOutput(int dstIdx, const std::vector<llvm::Value*> &val,
+                             int mask)
+{ /* stub: nothing is stored; dstIdx, val and mask are ignored */
+}
+
+void StorageSoa::storeTemp(int idx, const std::vector<llvm::Value*> &val,
+                           int mask)
+{ /* stub: nothing is stored; idx, val and mask are ignored */
+}
+
+void StorageSoa::storeAddress(int idx, const std::vector<llvm::Value*> &val,
+                              int mask)
+{ /* stub: nothing is stored; idx, val and mask are ignored */
+}
diff --git a/src/mesa/pipe/llvm/storagesoa.h b/src/mesa/pipe/llvm/storagesoa.h
new file mode 100644
index 0000000000..551b0b9734
--- /dev/null
+++ b/src/mesa/pipe/llvm/storagesoa.h
@@ -0,0 +1,71 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef STORAGESOA_H
+#define STORAGESOA_H
+
+#include <vector>
+
+namespace llvm {
+   class BasicBlock;
+   class Constant;
+   class ConstantInt;
+   class LoadInst;
+   class Value;
+   class VectorType;
+}
+
+class StorageSoa
+{ /* Declares no data members; the definitions in storagesoa.cpp are stubs. */
+public:
+   StorageSoa(llvm::BasicBlock *block,
+              llvm::Value *input,
+              llvm::Value *output,
+              llvm::Value *consts);
+
+   void addImmediate(float *vec);
+
+   llvm::Value  * addrElement(int idx) const;
+   /* Element accessors: each returns a vector of llvm::Value pointers; indIdx defaults to 0. */
+   std::vector<llvm::Value*> inputElement(int idx, int swizzle, llvm::Value *indIdx =0);
+   std::vector<llvm::Value*> constElement(int idx, int swizzle, llvm::Value *indIdx =0);
+   std::vector<llvm::Value*> outputElement(int idx, int swizzle, llvm::Value *indIdx =0);
+   std::vector<llvm::Value*> tempElement(int idx, int swizzle, llvm::Value *indIdx =0);
+   std::vector<llvm::Value*> immediateElement(int idx, int swizzle);
+
+   llvm::Value *extractIndex(llvm::Value *vec);
+   /* Store operations take the values by const reference together with an integer mask. */
+   void storeOutput(int dstIdx, const std::vector<llvm::Value*> &val,
+                         int mask);
+   void storeTemp(int idx, const std::vector<llvm::Value*> &val,
+                  int mask);
+   void storeAddress(int idx, const std::vector<llvm::Value*> &val,
+                     int mask);
+
+};
+
+#endif
diff --git a/src/mesa/pipe/llvm/tgsitollvm.cpp b/src/mesa/pipe/llvm/tgsitollvm.cpp
index eb9e1196f1..bc4df61071 100644
--- a/src/mesa/pipe/llvm/tgsitollvm.cpp
+++ b/src/mesa/pipe/llvm/tgsitollvm.cpp
@@ -5,6 +5,8 @@
 
 #include "storage.h"
 #include "instructions.h"
+#include "storagesoa.h"
+#include "instructionssoa.h"
 
 #include "pipe/p_shader_tokens.h"
 
@@ -112,6 +114,14 @@ translate_declaration(struct gallivm_ir *prog,
    }
 }
 
+static void
+translate_declarationir(struct gallivm_ir *,
+                      llvm::Module *,
+                      StorageSoa *,
+                      struct tgsi_full_declaration *,
+                      struct tgsi_full_declaration *)
+{
+}
 
 static void
 translate_immediate(Storage *storage,
@@ -120,29 +130,56 @@ translate_immediate(Storage *storage,
    float vec[4];
    int i;
    for (i = 0; i < imm->Immediate.Size - 1; ++i) {
-      switch( imm->Immediate.DataType ) {
+      switch (imm->Immediate.DataType) {
       case TGSI_IMM_FLOAT32:
          vec[i] = imm->u.ImmediateFloat32[i].Float;
          break;
       default:
-         assert( 0 );
+         assert(0);
       }
    }
    storage->addImmediate(vec);
 }
 
-static inline llvm::Value *
-swizzleVector(llvm::Value *val, struct tgsi_full_src_register *src,
-              Storage *storage)
+
+static void
+translate_immediateir(StorageSoa *storage,
+                      struct tgsi_full_immediate *imm)
+{
+   float vec[4];
+   int i;
+   for (i = 0; i < imm->Immediate.Size - 1; ++i) {
+      switch (imm->Immediate.DataType) {
+      case TGSI_IMM_FLOAT32:
+         vec[i] = imm->u.ImmediateFloat32[i].Float;
+         break;
+      default:
+         assert(0);
+      }
+   }
+   storage->addImmediate(vec);
+}
+
+static inline int
+swizzleInt(struct tgsi_full_src_register *src)
 {
    int swizzle = 0;
    int start = 1000;
-   const int NO_SWIZZLE = TGSI_SWIZZLE_X * 1000 + TGSI_SWIZZLE_Y * 100 +
-                          TGSI_SWIZZLE_Z * 10 + TGSI_SWIZZLE_W;
+
    for (int k = 0; k < 4; ++k) {
       swizzle += tgsi_util_get_full_src_register_extswizzle(src, k) * start;
       start /= 10;
    }
+   return swizzle;
+}
+
+static inline llvm::Value *
+swizzleVector(llvm::Value *val, struct tgsi_full_src_register *src,
+              Storage *storage)
+{
+   int swizzle = swizzleInt(src);
+   const int NO_SWIZZLE = TGSI_SWIZZLE_X * 1000 + TGSI_SWIZZLE_Y * 100 +
+                          TGSI_SWIZZLE_Z * 10 + TGSI_SWIZZLE_W;
    if (swizzle != NO_SWIZZLE) {
       /*fprintf(stderr, "XXXXXXXX swizzle = %d\n", swizzle);*/
       val = storage->shuffleVector(val, swizzle);
@@ -617,6 +654,392 @@ translate_instruction(llvm::Module *module,
    }
 }
 
+
+static void
+translate_instructionir(llvm::Module *module,
+                        StorageSoa *storage,
+                        InstructionsSoa *instr,
+                        struct tgsi_full_instruction *inst,
+                        struct tgsi_full_instruction *fi,
+                        unsigned instno)
+{
+   std::vector< std::vector<llvm::Value*> > inputs(inst->Instruction.NumSrcRegs);
+
+   for (int i = 0; i < inst->Instruction.NumSrcRegs; ++i) {
+      struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i];
+      std::vector<llvm::Value*> val;
+      llvm::Value *indIdx = 0;
+      int swizzle = swizzleInt(src);
+
+      if (src->SrcRegister.Indirect) {
+         indIdx = storage->addrElement(src->SrcRegisterInd.Index);
+         indIdx = storage->extractIndex(indIdx);
+      }
+      if (src->SrcRegister.File == TGSI_FILE_CONSTANT) {
+         val = storage->constElement(src->SrcRegister.Index, swizzle, indIdx);
+      } else if (src->SrcRegister.File == TGSI_FILE_INPUT) {
+         val = storage->inputElement(src->SrcRegister.Index, swizzle, indIdx);
+      } else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+         val = storage->tempElement(src->SrcRegister.Index, swizzle);
+      } else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) {
+         val = storage->outputElement(src->SrcRegister.Index, swizzle, indIdx);
+      } else if (src->SrcRegister.File == TGSI_FILE_IMMEDIATE) {
+         val = storage->immediateElement(src->SrcRegister.Index, swizzle);
+      } else {
+         fprintf(stderr, "ERROR: not supported llvm source %d\n", src->SrcRegister.File);
+         return;
+      }
+
+      inputs[i] = val;
+   }
+
+   std::vector<llvm::Value*> out(4);
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL: {
+   }
+      break;
+   case TGSI_OPCODE_MOV: {
+      out = inputs[0];
+   }
+      break;
+   case TGSI_OPCODE_LIT: {
+   }
+      break;
+   case TGSI_OPCODE_RCP: {
+   }
+      break;
+   case TGSI_OPCODE_RSQ: {
+   }
+      break;
+   case TGSI_OPCODE_EXP:
+      break;
+   case TGSI_OPCODE_LOG:
+      break;
+   case TGSI_OPCODE_MUL: {
+      out = instr->mul(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_ADD: {
+      out = instr->add(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_DP3: {
+   }
+      break;
+   case TGSI_OPCODE_DP4: {
+   }
+      break;
+   case TGSI_OPCODE_DST: {
+   }
+      break;
+   case TGSI_OPCODE_MIN: {
+   }
+      break;
+   case TGSI_OPCODE_MAX: {
+   }
+      break;
+   case TGSI_OPCODE_SLT: {
+   }
+      break;
+   case TGSI_OPCODE_SGE: {
+   }
+      break;
+   case TGSI_OPCODE_MAD: {
+   }
+      break;
+   case TGSI_OPCODE_SUB: {
+   }
+      break;
+   case TGSI_OPCODE_LERP: {
+   }
+      break;
+   case TGSI_OPCODE_CND:
+      break;
+   case TGSI_OPCODE_CND0:
+      break;
+   case TGSI_OPCODE_DOT2ADD:
+      break;
+   case TGSI_OPCODE_INDEX:
+      break;
+   case TGSI_OPCODE_NEGATE:
+      break;
+   case TGSI_OPCODE_FRAC: {
+   }
+      break;
+   case TGSI_OPCODE_CLAMP:
+      break;
+   case TGSI_OPCODE_FLOOR: {
+   }
+      break;
+   case TGSI_OPCODE_ROUND:
+      break;
+   case TGSI_OPCODE_EXPBASE2: {
+   }
+      break;
+   case TGSI_OPCODE_LOGBASE2: {
+   }
+      break;
+   case TGSI_OPCODE_POWER: {
+   }
+      break;
+   case TGSI_OPCODE_CROSSPRODUCT: {
+   }
+      break;
+   case TGSI_OPCODE_MULTIPLYMATRIX:
+      break;
+   case TGSI_OPCODE_ABS: {
+   }
+      break;
+   case TGSI_OPCODE_RCC:
+      break;
+   case TGSI_OPCODE_DPH: {
+   }
+      break;
+   case TGSI_OPCODE_COS: {
+   }
+      break;
+   case TGSI_OPCODE_DDX:
+      break;
+   case TGSI_OPCODE_DDY:
+      break;
+   case TGSI_OPCODE_KILP: {
+   }
+      break;
+   case TGSI_OPCODE_PK2H:
+      break;
+   case TGSI_OPCODE_PK2US:
+      break;
+   case TGSI_OPCODE_PK4B:
+      break;
+   case TGSI_OPCODE_PK4UB:
+      break;
+   case TGSI_OPCODE_RFL:
+      break;
+   case TGSI_OPCODE_SEQ:
+      break;
+   case TGSI_OPCODE_SFL:
+      break;
+   case TGSI_OPCODE_SGT: {
+   }
+      break;
+   case TGSI_OPCODE_SIN: {
+   }
+      break;
+   case TGSI_OPCODE_SLE:
+      break;
+   case TGSI_OPCODE_SNE:
+      break;
+   case TGSI_OPCODE_STR:
+      break;
+   case TGSI_OPCODE_TEX:
+      break;
+   case TGSI_OPCODE_TXD:
+      break;
+   case TGSI_OPCODE_UP2H:
+      break;
+   case TGSI_OPCODE_UP2US:
+      break;
+   case TGSI_OPCODE_UP4B:
+      break;
+   case TGSI_OPCODE_UP4UB:
+      break;
+   case TGSI_OPCODE_X2D:
+      break;
+   case TGSI_OPCODE_ARA:
+      break;
+   case TGSI_OPCODE_ARR:
+      break;
+   case TGSI_OPCODE_BRA:
+      break;
+   case TGSI_OPCODE_CAL: {
+   }
+      break;
+   case TGSI_OPCODE_RET: {
+   }
+      break;
+   case TGSI_OPCODE_SSG:
+      break;
+   case TGSI_OPCODE_CMP: {
+   }
+      break;
+   case TGSI_OPCODE_SCS: {
+   }
+      break;
+   case TGSI_OPCODE_TXB:
+      break;
+   case TGSI_OPCODE_NRM:
+      break;
+   case TGSI_OPCODE_DIV:
+      break;
+   case TGSI_OPCODE_DP2:
+      break;
+   case TGSI_OPCODE_TXL:
+      break;
+   case TGSI_OPCODE_BRK: {
+   }
+      break;
+   case TGSI_OPCODE_IF: {
+   }
+      break;
+   case TGSI_OPCODE_LOOP:
+      break;
+   case TGSI_OPCODE_REP:
+      break;
+   case TGSI_OPCODE_ELSE: {
+   }
+      break;
+   case TGSI_OPCODE_ENDIF: {
+   }
+      break;
+   case TGSI_OPCODE_ENDLOOP:
+      break;
+   case TGSI_OPCODE_ENDREP:
+      break;
+   case TGSI_OPCODE_PUSHA:
+      break;
+   case TGSI_OPCODE_POPA:
+      break;
+   case TGSI_OPCODE_CEIL:
+      break;
+   case TGSI_OPCODE_I2F:
+      break;
+   case TGSI_OPCODE_NOT:
+      break;
+   case TGSI_OPCODE_TRUNC: {
+   }
+      break;
+   case TGSI_OPCODE_SHL:
+      break;
+   case TGSI_OPCODE_SHR:
+      break;
+   case TGSI_OPCODE_AND:
+      break;
+   case TGSI_OPCODE_OR:
+      break;
+   case TGSI_OPCODE_MOD:
+      break;
+   case TGSI_OPCODE_XOR:
+      break;
+   case TGSI_OPCODE_SAD:
+      break;
+   case TGSI_OPCODE_TXF:
+      break;
+   case TGSI_OPCODE_TXQ:
+      break;
+   case TGSI_OPCODE_CONT:
+      break;
+   case TGSI_OPCODE_EMIT:
+      break;
+   case TGSI_OPCODE_ENDPRIM:
+      break;
+   case TGSI_OPCODE_BGNLOOP2: {
+   }
+      break;
+   case TGSI_OPCODE_BGNSUB: {
+   }
+      break;
+   case TGSI_OPCODE_ENDLOOP2: {
+   }
+      break;
+   case TGSI_OPCODE_ENDSUB: {
+   }
+      break;
+   case TGSI_OPCODE_NOISE1:
+      break;
+   case TGSI_OPCODE_NOISE2:
+      break;
+   case TGSI_OPCODE_NOISE3:
+      break;
+   case TGSI_OPCODE_NOISE4:
+      break;
+   case TGSI_OPCODE_NOP:
+      break;
+   case TGSI_OPCODE_TEXBEM:
+      break;
+   case TGSI_OPCODE_TEXBEML:
+      break;
+   case TGSI_OPCODE_TEXREG2AR:
+      break;
+   case TGSI_OPCODE_TEXM3X2PAD:
+      break;
+   case TGSI_OPCODE_TEXM3X2TEX:
+      break;
+   case TGSI_OPCODE_TEXM3X3PAD:
+      break;
+   case TGSI_OPCODE_TEXM3X3TEX:
+      break;
+   case TGSI_OPCODE_TEXM3X3SPEC:
+      break;
+   case TGSI_OPCODE_TEXM3X3VSPEC:
+      break;
+   case TGSI_OPCODE_TEXREG2GB:
+      break;
+   case TGSI_OPCODE_TEXREG2RGB:
+      break;
+   case TGSI_OPCODE_TEXDP3TEX:
+      break;
+   case TGSI_OPCODE_TEXDP3:
+      break;
+   case TGSI_OPCODE_TEXM3X3:
+      break;
+   case TGSI_OPCODE_TEXM3X2DEPTH:
+      break;
+   case TGSI_OPCODE_TEXDEPTH:
+      break;
+   case TGSI_OPCODE_BEM:
+      break;
+   case TGSI_OPCODE_M4X3:
+      break;
+   case TGSI_OPCODE_M3X4:
+      break;
+   case TGSI_OPCODE_M3X3:
+      break;
+   case TGSI_OPCODE_M3X2:
+      break;
+   case TGSI_OPCODE_NRM4:
+      break;
+   case TGSI_OPCODE_CALLNZ:
+      break;
+   case TGSI_OPCODE_IFC:
+      break;
+   case TGSI_OPCODE_BREAKC:
+      break;
+   case TGSI_OPCODE_KIL:
+      break;
+   case TGSI_OPCODE_END:
+      instr->end();
+      return;
+      break;
+   default:
+      fprintf(stderr, "ERROR: Unknown opcode %d\n",
+              inst->Instruction.Opcode);
+      assert(0);
+      break;
+   }
+
+   if (!out[0]) {
+      fprintf(stderr, "ERROR: unsupported opcode %d\n",
+              inst->Instruction.Opcode);
+      assert(!"Unsupported opcode");
+   }
+
+   /* store results  */
+   for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
+      struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
+
+      if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
+         storage->storeOutput(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+      } else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) {
+         storage->storeTemp(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+      } else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) {
+         storage->storeAddress(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+      } else {
+         fprintf(stderr, "ERROR: unsupported LLVM destination!");
+         assert(!"wrong destination");
+      }
+   }
+}
+
 llvm::Module *
 tgsi_to_llvm(struct gallivm_ir *ir, const struct tgsi_token *tokens)
 {
@@ -680,3 +1103,72 @@ tgsi_to_llvm(struct gallivm_ir *ir, const struct tgsi_token *tokens)
    ir->num_consts = storage.numConsts();
    return mod;
 }
+
+llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
+                              const struct tgsi_token *tokens)
+{
+   llvm::Module *mod = createBaseShader();
+   struct tgsi_parse_context parse;
+   struct tgsi_full_instruction fi;
+   struct tgsi_full_declaration fd;
+   unsigned instno = 0;
+   Function* shader = mod->getFunction("execute_shader");
+   std::ostringstream stream;
+   if (ir->type == GALLIVM_VS) {
+      stream << "vs_shader";
+   } else {
+      stream << "fs_shader";
+   }
+   stream << ir->id;
+   std::string func_name = stream.str();
+   shader->setName(func_name.c_str());
+
+   Function::arg_iterator args = shader->arg_begin();
+   Value *input = args++;
+   input->setName("input");
+   Value *output = args++;
+   output->setName("output");
+   Value *consts = args++;
+   consts->setName("consts");
+
+   BasicBlock *label_entry = new BasicBlock("entry", shader, 0);
+
+   tgsi_parse_init(&parse, tokens);
+
+   fi = tgsi_default_full_instruction();
+   fd = tgsi_default_full_declaration();
+
+   StorageSoa storage(label_entry, input, output, consts);
+   InstructionsSoa instr(mod, shader, label_entry, &storage);
+
+   while(!tgsi_parse_end_of_tokens(&parse)) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         translate_declarationir(ir, mod, &storage,
+                                 &parse.FullToken.FullDeclaration,
+                                 &fd);
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         translate_immediateir(&storage,
+                             &parse.FullToken.FullImmediate);
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         translate_instructionir(mod, &storage, &instr,
+                                 &parse.FullToken.FullInstruction,
+                                 &fi, instno);
+         ++instno;
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+
+   tgsi_parse_free(&parse);
+
+   return mod;
+}
diff --git a/src/mesa/pipe/llvm/tgsitollvm.h b/src/mesa/pipe/llvm/tgsitollvm.h
index 073ffb5749..7ada04d629 100644
--- a/src/mesa/pipe/llvm/tgsitollvm.h
+++ b/src/mesa/pipe/llvm/tgsitollvm.h
@@ -13,4 +13,8 @@ struct tgsi_token;
 llvm::Module * tgsi_to_llvm(struct gallivm_ir *ir,
                             const struct tgsi_token *tokens);
 
+
+llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
+                              const struct tgsi_token *tokens);
+
 #endif
-- 
cgit v1.2.3


From a8637fb991e0b897a3574e0dc7d0ce15d7cbe8bc Mon Sep 17 00:00:00 2001
From: Zack Rusin <zack@tungstengraphics.com>
Date: Mon, 11 Feb 2008 10:47:28 -0500
Subject: remove separate llvm vs entry points

They're not necessary anymore; we use the same paths
as the sse and tgsi code.
---
 src/mesa/pipe/draw/draw_vertex_shader_llvm.c | 194 ---------------------------
 src/mesa/sources                             |   1 -
 2 files changed, 195 deletions(-)
 delete mode 100644 src/mesa/pipe/draw/draw_vertex_shader_llvm.c

diff --git a/src/mesa/pipe/draw/draw_vertex_shader_llvm.c b/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
deleted file mode 100644
index 63551c993e..0000000000
--- a/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
+++ /dev/null
@@ -1,194 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Zack Rusin zack@tungstengraphics.com
-  */
-
-#include "pipe/p_util.h"
-#include "draw_private.h"
-#include "draw_context.h"
-
-#ifdef MESA_LLVM
-
-#include "pipe/llvm/gallivm.h"
-#include "pipe/p_shader_tokens.h"
-
-#define DBG 0
-
-static INLINE void
-fetch_attrib4(const void *ptr, enum pipe_format format, float attrib[4])
-{
-   /* defaults */
-   attrib[1] = 0.0;
-   attrib[2] = 0.0;
-   attrib[3] = 1.0;
-   switch (format) {
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      attrib[3] = ((float *) ptr)[3];
-      /* fall-through */
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      attrib[2] = ((float *) ptr)[2];
-      /* fall-through */
-   case PIPE_FORMAT_R32G32_FLOAT:
-      attrib[1] = ((float *) ptr)[1];
-      /* fall-through */
-   case PIPE_FORMAT_R32_FLOAT:
-      attrib[0] = ((float *) ptr)[0];
-      break;
-   default:
-      assert(0);
-   }
-}
-
-
-/**
- * Fetch vertex attributes for 'count' vertices.
- */
-static INLINE
-void vertex_fetch(struct draw_context *draw,
-                  const unsigned elt,
-                  float (*inputs)[4])
-{
-   uint attr;
-
-   /* loop over vertex attributes (vertex shader inputs) */
-   for (attr = 0; attr < draw->vertex_shader->state->num_inputs; attr++) {
-
-      unsigned buf = draw->vertex_element[attr].vertex_buffer_index;
-      const void *src
-         = (const void *) ((const ubyte *) draw->user.vbuffer[buf]
-                           + draw->vertex_buffer[buf].buffer_offset
-                           + draw->vertex_element[attr].src_offset
-                           + elt * draw->vertex_buffer[buf].pitch);
-      fetch_attrib4(src, draw->vertex_element[attr].src_format, inputs[attr]);
-   }
-}
-
-static INLINE unsigned
-compute_clipmask(const float *clip, const float (*plane)[4], unsigned nr)
-{
-   unsigned mask = 0;
-   unsigned i;
-
-   for (i = 0; i < nr; i++) {
-      if (dot4(clip, plane[i]) < 0)
-         mask |= (1<<i);
-   }
-
-   return mask;
-}
-
-
-/**
- * Called by the draw module when the vertx cache needs to be flushed.
- * This involves running the vertex shader.
- */
-void draw_vertex_shader_queue_flush_llvm(struct draw_context *draw)
-{
-   unsigned i;
-
-   struct vertex_header *dests[VS_QUEUE_LENGTH];
-   float                 inputs[VS_QUEUE_LENGTH][PIPE_MAX_SHADER_INPUTS][4] ALIGN16_ATTRIB;
-   float                 outputs[VS_QUEUE_LENGTH][PIPE_MAX_SHADER_INPUTS][4] ALIGN16_ATTRIB;
-   float (*consts)[4]          = (float (*)[4]) draw->user.constants;
-   struct gallivm_prog  *prog  = draw->vertex_shader->llvm_prog;
-   const float          *scale = draw->viewport.scale;
-   const float          *trans = draw->viewport.translate;
-   /* fetch the inputs */
-   for (i = 0; i < draw->vs.queue_nr; ++i) {
-      unsigned elt = draw->vs.queue[i].elt;
-      dests[i] = draw->vs.queue[i].dest;
-      vertex_fetch(draw, elt, inputs[i]);
-   }
-
-   /* batch execute the shaders on all the vertices */
-   gallivm_prog_exec(prog, inputs, outputs, consts,
-                     draw->vs.queue_nr,
-                     draw->vertex_shader->state->num_inputs,
-                     draw->vertex_shader->state->num_outputs);
-
-
-   /* store machine results */
-   for (int i = 0; i < draw->vs.queue_nr; ++i) {
-      unsigned slot;
-      float x, y, z, w;
-      struct vertex_header *vOut = draw->vs.queue[i].dest;
-      float (*dests)[4] = outputs[i];
-
-      /* Handle attr[0] (position) specially:
-       *
-       * XXX: Computing the clipmask should be done in the vertex
-       * program as a set of DP4 instructions appended to the
-       * user-provided code.
-       */
-      x = vOut->clip[0] = dests[0][0];
-      y = vOut->clip[1] = dests[0][1];
-      z = vOut->clip[2] = dests[0][2];
-      w = vOut->clip[3] = dests[0][3];
-#if DBG
-      debug_printf("output %d: %f %f %f %f\n", 0, x, y, z, w);
-#endif
-
-      vOut->clipmask = compute_clipmask(vOut->clip, draw->plane, draw->nr_planes);
-      vOut->edgeflag = 1;
-      /* divide by w */
-      w = 1.0f / w;
-      x *= w;
-      y *= w;
-      z *= w;
-
-      /* Viewport mapping */
-      vOut->data[0][0] = x * scale[0] + trans[0];
-      vOut->data[0][1] = y * scale[1] + trans[1];
-      vOut->data[0][2] = z * scale[2] + trans[2];
-      vOut->data[0][3] = w;
-
-      /* Remaining attributes are packed into sequential post-transform
-       * vertex attrib slots.
-       */
-      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
-         vOut->data[slot][0] = dests[slot][0];
-         vOut->data[slot][1] = dests[slot][1];
-         vOut->data[slot][2] = dests[slot][2];
-         vOut->data[slot][3] = dests[slot][3];
-
-#if DBG
-         debug_printf("output %d: %f %f %f %f\n", slot,
-                vOut->data[slot][0],
-                vOut->data[slot][1],
-                vOut->data[slot][2],
-                vOut->data[slot][3]);
-#endif
-      }
-   } /* loop over vertices */
-
-   draw->vs.queue_nr = 0;
-}
-
-#endif /* MESA_LLVM */
diff --git a/src/mesa/sources b/src/mesa/sources
index 84492c91ac..96ae3dbca0 100644
--- a/src/mesa/sources
+++ b/src/mesa/sources
@@ -174,7 +174,6 @@ DRAW_SOURCES = \
 	pipe/draw/draw_vertex_cache.c \
 	pipe/draw/draw_vertex_fetch.c \
 	pipe/draw/draw_vertex_shader.c \
-	pipe/draw/draw_vertex_shader_llvm.c \
 	pipe/draw/draw_vf.c \
 	pipe/draw/draw_vf_generic.c \
 	pipe/draw/draw_vf_sse.c \
-- 
cgit v1.2.3


From f554db1893749043fe5b2906f7075588be178884 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 16:35:44 -0700
Subject: checkpoint- remove dependencies on gl_texture_format to make code
 re-usable by state tracker

---
 src/mesa/main/mipmap.c | 424 +++++++++++++++++++++++++++++++------------------
 1 file changed, 266 insertions(+), 158 deletions(-)

diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 1e61829e8f..013dc3752e 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -36,27 +36,205 @@
 
 
+static GLint
+bytes_per_pixel(GLenum datatype, GLuint comps)
+{
+   GLint b = _mesa_sizeof_packed_type(datatype);
+   assert(b >= 0);
+   return b * comps;
+}
+
+
+static void
+mesa_format_to_type_and_comps(const struct gl_texture_format *format,
+                              GLenum *datatype, GLuint *comps)
+{
+   switch (format->MesaFormat) {
+   case MESA_FORMAT_RGBA8888:
+   case MESA_FORMAT_RGBA8888_REV:
+   case MESA_FORMAT_ARGB8888:
+   case MESA_FORMAT_ARGB8888_REV:
+      *datatype = CHAN_TYPE;
+      *comps = 4;
+      return;
+   case MESA_FORMAT_RGB888:
+   case MESA_FORMAT_BGR888:
+      *datatype = GL_UNSIGNED_BYTE;
+      *comps = 3;
+      return;
+   case MESA_FORMAT_RGB565:
+   case MESA_FORMAT_RGB565_REV:
+      *datatype = GL_UNSIGNED_SHORT_5_6_5;
+      *comps = 3;
+      return;
+
+   case MESA_FORMAT_ARGB4444:
+   case MESA_FORMAT_ARGB4444_REV:
+      *datatype = GL_UNSIGNED_SHORT_4_4_4_4;
+      *comps = 4;
+      return;
+
+   case MESA_FORMAT_ARGB1555:
+   case MESA_FORMAT_ARGB1555_REV:
+      *datatype = GL_UNSIGNED_SHORT_1_5_5_5_REV;
+      *comps = 3;
+      return;
+
+   case MESA_FORMAT_AL88:
+   case MESA_FORMAT_AL88_REV:
+      *datatype = GL_UNSIGNED_BYTE;
+      *comps = 2;
+      return;
+   case MESA_FORMAT_RGB332:
+      *datatype = GL_UNSIGNED_BYTE_3_3_2;
+      *comps = 3;
+      return;
+
+   case MESA_FORMAT_A8:
+   case MESA_FORMAT_L8:
+   case MESA_FORMAT_I8:
+   case MESA_FORMAT_CI8:
+      *datatype = GL_UNSIGNED_BYTE;
+      *comps = 1;
+      return;
+
+   case MESA_FORMAT_YCBCR:
+   case MESA_FORMAT_YCBCR_REV:
+      *datatype = GL_UNSIGNED_SHORT;
+      *comps = 2;
+      return;
+
+   case MESA_FORMAT_Z24_S8:
+      *datatype = GL_UNSIGNED_INT;
+      *comps = 1; /* XXX OK? */
+      return;
+
+   case MESA_FORMAT_Z16:
+      *datatype = GL_UNSIGNED_SHORT;
+      *comps = 1;
+      return;
+
+   case MESA_FORMAT_Z32:
+      *datatype = GL_UNSIGNED_INT;
+      *comps = 1;
+      return;
+
+   case MESA_FORMAT_SRGB8:
+      *datatype = GL_UNSIGNED_BYTE;
+      *comps = 3;
+      return;
+   case MESA_FORMAT_SRGBA8:
+      *datatype = GL_UNSIGNED_BYTE;
+      *comps = 4;
+      return;
+   case MESA_FORMAT_SL8:
+      *datatype = GL_UNSIGNED_BYTE;
+      *comps = 1;
+      return;
+   case MESA_FORMAT_SLA8:
+      *datatype = GL_UNSIGNED_BYTE;
+      *comps = 2;
+      return;
+
+   case MESA_FORMAT_RGB_FXT1:
+   case MESA_FORMAT_RGBA_FXT1:
+   case MESA_FORMAT_RGB_DXT1:
+   case MESA_FORMAT_RGBA_DXT1:
+   case MESA_FORMAT_RGBA_DXT3:
+   case MESA_FORMAT_RGBA_DXT5:
+      /* XXX generate error instead? */
+      *datatype = GL_UNSIGNED_BYTE;
+      *comps = 0;
+      return;
+
+   case MESA_FORMAT_RGBA:
+      *datatype = CHAN_TYPE;
+      *comps = 4;
+      return;
+   case MESA_FORMAT_RGB:
+      *datatype = CHAN_TYPE;
+      *comps = 3;
+      return;
+   case MESA_FORMAT_LUMINANCE_ALPHA:
+      *datatype = CHAN_TYPE;
+      *comps = 2;
+      return;
+   case MESA_FORMAT_ALPHA:
+   case MESA_FORMAT_LUMINANCE:
+   case MESA_FORMAT_INTENSITY:
+      *datatype = CHAN_TYPE;
+      *comps = 1;
+      return;
+
+   case MESA_FORMAT_RGBA_FLOAT32:
+      *datatype = GL_FLOAT;
+      *comps = 4;
+      return;
+   case MESA_FORMAT_RGBA_FLOAT16:
+      *datatype = GL_HALF_FLOAT_ARB;
+      *comps = 4;
+      return;
+   case MESA_FORMAT_RGB_FLOAT32:
+      *datatype = GL_FLOAT;
+      *comps = 3;
+      return;
+   case MESA_FORMAT_RGB_FLOAT16:
+      *datatype = GL_HALF_FLOAT_ARB;
+      *comps = 3;
+      return;
+   case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT32:
+      *datatype = GL_FLOAT;
+      *comps = 2;
+      return;
+   case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT16:
+      *datatype = GL_HALF_FLOAT_ARB;
+      *comps = 2;
+      return;
+   case MESA_FORMAT_ALPHA_FLOAT32:
+   case MESA_FORMAT_LUMINANCE_FLOAT32:
+   case MESA_FORMAT_INTENSITY_FLOAT32:
+      *datatype = GL_FLOAT;
+      *comps = 1;
+      return;
+   case MESA_FORMAT_ALPHA_FLOAT16:
+   case MESA_FORMAT_LUMINANCE_FLOAT16:
+   case MESA_FORMAT_INTENSITY_FLOAT16:
+      *datatype = GL_HALF_FLOAT_ARB;
+      *comps = 1;
+      return;
+
+   default:
+      _mesa_problem(NULL, "bad texture format in mesa_format_to_type_and_comps");
+      *datatype = 0;
+      *comps = 1;
+   }
+}
+
+
 /**
  * Average together two rows of a source image to produce a single new
  * row in the dest image.  It's legal for the two source rows to point
  * to the same data.  The source width must be equal to either the
  * dest width or two times the dest width.
+ * \param datatype  GL_UNSIGNED_BYTE, GL_UNSIGNED_SHORT, GL_FLOAT, etc.
+ * \param comps  number of components per pixel (1..4)
  */
 static void
-do_row(const struct gl_texture_format *format, GLint srcWidth,
+do_row(GLenum datatype, GLuint comps, GLint srcWidth,
        const GLvoid *srcRowA, const GLvoid *srcRowB,
        GLint dstWidth, GLvoid *dstRow)
 {
    const GLuint k0 = (srcWidth == dstWidth) ? 0 : 1;
    const GLuint colStride = (srcWidth == dstWidth) ? 1 : 2;
 
+   ASSERT(comps >= 1);
+   ASSERT(comps <= 4);
+
    /* This assertion is no longer valid with non-power-of-2 textures
    assert(srcWidth == dstWidth || srcWidth == 2 * dstWidth);
    */
 
-   switch (format->MesaFormat) {
-   case MESA_FORMAT_RGBA:
-      {
+   if (datatype == CHAN_TYPE && comps == 4) {
          GLuint i, j, k;
          const GLchan (*rowA)[4] = (const GLchan (*)[4]) srcRowA;
          const GLchan (*rowB)[4] = (const GLchan (*)[4]) srcRowB;
@@ -72,10 +250,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
             dst[i][3] = (rowA[j][3] + rowA[k][3] +
                          rowB[j][3] + rowB[k][3]) / 4;
          }
-      }
-      return;
-   case MESA_FORMAT_RGB:
-      {
+   }
+   else if (datatype == CHAN_TYPE && comps == 3) {
          GLuint i, j, k;
          const GLchan (*rowA)[3] = (const GLchan (*)[3]) srcRowA;
          const GLchan (*rowB)[3] = (const GLchan (*)[3]) srcRowB;
@@ -89,12 +265,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
             dst[i][2] = (rowA[j][2] + rowA[k][2] +
                          rowB[j][2] + rowB[k][2]) / 4;
          }
-      }
-      return;
-   case MESA_FORMAT_ALPHA:
-   case MESA_FORMAT_LUMINANCE:
-   case MESA_FORMAT_INTENSITY:
-      {
+   }
+   else if (datatype == CHAN_TYPE && comps == 1) {
          GLuint i, j, k;
          const GLchan *rowA = (const GLchan *) srcRowA;
          const GLchan *rowB = (const GLchan *) srcRowB;
@@ -103,10 +275,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
               i++, j += colStride, k += colStride) {
             dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) / 4;
          }
-      }
-      return;
-   case MESA_FORMAT_LUMINANCE_ALPHA:
-      {
+   }
+   else if (datatype == CHAN_TYPE && comps == 2) {
          GLuint i, j, k;
          const GLchan (*rowA)[2] = (const GLchan (*)[2]) srcRowA;
          const GLchan (*rowB)[2] = (const GLchan (*)[2]) srcRowB;
@@ -118,10 +288,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
             dst[i][1] = (rowA[j][1] + rowA[k][1] +
                          rowB[j][1] + rowB[k][1]) / 4;
          }
-      }
-      return;
-   case MESA_FORMAT_Z32:
-      {
+   }
+   else if (datatype == GL_UNSIGNED_INT && comps == 1) {
          GLuint i, j, k;
          const GLuint *rowA = (const GLuint *) srcRowA;
          const GLuint *rowB = (const GLuint *) srcRowB;
@@ -130,10 +298,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
               i++, j += colStride, k += colStride) {
             dst[i] = rowA[j] / 4 + rowA[k] / 4 + rowB[j] / 4 + rowB[k] / 4;
          }
-      }
-      return;
-   case MESA_FORMAT_Z16:
-      {
+   }
+   else if (datatype == GL_UNSIGNED_SHORT && comps == 1) {
          GLuint i, j, k;
          const GLushort *rowA = (const GLushort *) srcRowA;
          const GLushort *rowB = (const GLushort *) srcRowB;
@@ -142,17 +308,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
               i++, j += colStride, k += colStride) {
             dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) / 4;
          }
-      }
-      return;
-   /* Begin hardware formats */
-   case MESA_FORMAT_RGBA8888:
-   case MESA_FORMAT_RGBA8888_REV:
-   case MESA_FORMAT_ARGB8888:
-   case MESA_FORMAT_ARGB8888_REV:
-#if FEATURE_EXT_texture_sRGB
-   case MESA_FORMAT_SRGBA8:
-#endif
-      {
+   }
+   else if (datatype == GL_UNSIGNED_BYTE && comps == 4) {
          GLuint i, j, k;
          const GLubyte (*rowA)[4] = (const GLubyte (*)[4]) srcRowA;
          const GLubyte (*rowB)[4] = (const GLubyte (*)[4]) srcRowB;
@@ -168,14 +325,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
             dst[i][3] = (rowA[j][3] + rowA[k][3] +
                          rowB[j][3] + rowB[k][3]) / 4;
          }
-      }
-      return;
-   case MESA_FORMAT_RGB888:
-   case MESA_FORMAT_BGR888:
-#if FEATURE_EXT_texture_sRGB
-   case MESA_FORMAT_SRGB8:
-#endif
-      {
+   }
+   else if (datatype == GL_UNSIGNED_BYTE && comps == 3) {
          GLuint i, j, k;
          const GLubyte (*rowA)[3] = (const GLubyte (*)[3]) srcRowA;
          const GLubyte (*rowB)[3] = (const GLubyte (*)[3]) srcRowB;
@@ -189,11 +340,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
             dst[i][2] = (rowA[j][2] + rowA[k][2] +
                          rowB[j][2] + rowB[k][2]) / 4;
          }
-      }
-      return;
-   case MESA_FORMAT_RGB565:
-   case MESA_FORMAT_RGB565_REV:
-      {
+   }
+   else if (datatype == GL_UNSIGNED_SHORT_5_6_5 && comps == 3) {
          GLuint i, j, k;
          const GLushort *rowA = (const GLushort *) srcRowA;
          const GLushort *rowB = (const GLushort *) srcRowB;
@@ -217,11 +365,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
             const GLint blue  = (rowAb0 + rowAb1 + rowBb0 + rowBb1) >> 2;
             dst[i] = (blue << 11) | (green << 5) | red;
          }
-      }
-      return;
-   case MESA_FORMAT_ARGB4444:
-   case MESA_FORMAT_ARGB4444_REV:
-      {
+   }
+   else if (datatype == GL_UNSIGNED_SHORT_4_4_4_4 && comps == 4) {
          GLuint i, j, k;
          const GLushort *rowA = (const GLushort *) srcRowA;
          const GLushort *rowB = (const GLushort *) srcRowB;
@@ -250,11 +395,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
             const GLint alpha = (rowAa0 + rowAa1 + rowBa0 + rowBa1) >> 2;
             dst[i] = (alpha << 12) | (blue << 8) | (green << 4) | red;
          }
-      }
-      return;
-   case MESA_FORMAT_ARGB1555:
-   case MESA_FORMAT_ARGB1555_REV: /* XXX broken? */
-      {
+   }
+   else if (datatype == GL_UNSIGNED_SHORT_1_5_5_5_REV && comps == 4) {
          GLuint i, j, k;
          const GLushort *rowA = (const GLushort *) srcRowA;
          const GLushort *rowB = (const GLushort *) srcRowB;
@@ -283,14 +425,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
             const GLint alpha = (rowAa0 + rowAa1 + rowBa0 + rowBa1) >> 2;
             dst[i] = (alpha << 15) | (blue << 10) | (green << 5) | red;
          }
-      }
-      return;
-   case MESA_FORMAT_AL88:
-   case MESA_FORMAT_AL88_REV:
-#if FEATURE_EXT_texture_sRGB
-   case MESA_FORMAT_SLA8:
-#endif
-      {
+   }
+   else if (datatype == GL_UNSIGNED_BYTE && comps == 2) {
          GLuint i, j, k;
          const GLubyte (*rowA)[2] = (const GLubyte (*)[2]) srcRowA;
          const GLubyte (*rowB)[2] = (const GLubyte (*)[2]) srcRowB;
@@ -302,10 +438,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
             dst[i][1] = (rowA[j][1] + rowA[k][1] +
                          rowB[j][1] + rowB[k][1]) >> 2;
          }
-      }
-      return;
-   case MESA_FORMAT_RGB332:
-      {
+   }
+   else if (datatype == GL_UNSIGNED_BYTE_3_3_2 && comps == 3) {
          GLuint i, j, k;
          const GLubyte *rowA = (const GLubyte *) srcRowA;
          const GLubyte *rowB = (const GLubyte *) srcRowB;
@@ -329,16 +463,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
             const GLint blue  = (rowAb0 + rowAb1 + rowBb0 + rowBb1) >> 2;
             dst[i] = (blue << 5) | (green << 2) | red;
          }
-      }
-      return;
-   case MESA_FORMAT_A8:
-   case MESA_FORMAT_L8:
-   case MESA_FORMAT_I8:
-   case MESA_FORMAT_CI8:
-#if FEATURE_EXT_texture_sRGB
-   case MESA_FORMAT_SL8:
-#endif
-      {
+   }
+   else if (datatype == GL_UNSIGNED_BYTE && comps == 1) {
          GLuint i, j, k;
          const GLubyte *rowA = (const GLubyte *) srcRowA;
          const GLubyte *rowB = (const GLubyte *) srcRowB;
@@ -347,10 +473,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
               i++, j += colStride, k += colStride) {
             dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) >> 2;
          }
-      }
-      return;
-   case MESA_FORMAT_RGBA_FLOAT32:
-      {
+   }
+   else if (datatype == GL_FLOAT && comps == 4) {
          GLuint i, j, k;
          const GLfloat (*rowA)[4] = (const GLfloat (*)[4]) srcRowA;
          const GLfloat (*rowB)[4] = (const GLfloat (*)[4]) srcRowB;
@@ -366,10 +490,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
             dst[i][3] = (rowA[j][3] + rowA[k][3] +
                          rowB[j][3] + rowB[k][3]) * 0.25F;
          }
-      }
-      return;
-   case MESA_FORMAT_RGBA_FLOAT16:
-      {
+   }
+   else if (datatype == GL_HALF_FLOAT_ARB && comps == 4) {
          GLuint i, j, k, comp;
          const GLhalfARB (*rowA)[4] = (const GLhalfARB (*)[4]) srcRowA;
          const GLhalfARB (*rowB)[4] = (const GLhalfARB (*)[4]) srcRowB;
@@ -385,10 +507,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
                dst[i][comp] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
             }
          }
-      }
-      return;
-   case MESA_FORMAT_RGB_FLOAT32:
-      {
+   }
+   else if (datatype == GL_FLOAT && comps == 3) {
          GLuint i, j, k;
          const GLfloat (*rowA)[3] = (const GLfloat (*)[3]) srcRowA;
          const GLfloat (*rowB)[3] = (const GLfloat (*)[3]) srcRowB;
@@ -402,10 +522,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
             dst[i][2] = (rowA[j][2] + rowA[k][2] +
                          rowB[j][2] + rowB[k][2]) * 0.25F;
          }
-      }
-      return;
-   case MESA_FORMAT_RGB_FLOAT16:
-      {
+   }
+   else if (datatype == GL_HALF_FLOAT_ARB && comps == 3) {
          GLuint i, j, k, comp;
          const GLhalfARB (*rowA)[3] = (const GLhalfARB (*)[3]) srcRowA;
          const GLhalfARB (*rowB)[3] = (const GLhalfARB (*)[3]) srcRowB;
@@ -421,10 +539,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
                dst[i][comp] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
             }
          }
-      }
-      return;
-   case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT32:
-      {
+   }
+   else if (datatype == GL_FLOAT && comps == 2) {
          GLuint i, j, k;
          const GLfloat (*rowA)[2] = (const GLfloat (*)[2]) srcRowA;
          const GLfloat (*rowB)[2] = (const GLfloat (*)[2]) srcRowB;
@@ -436,10 +552,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
             dst[i][1] = (rowA[j][1] + rowA[k][1] +
                          rowB[j][1] + rowB[k][1]) * 0.25F;
          }
-      }
-      return;
-   case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT16:
-      {
+   }
+   else if (datatype == GL_HALF_FLOAT_ARB && comps == 2) {
          GLuint i, j, k, comp;
          const GLhalfARB (*rowA)[2] = (const GLhalfARB (*)[2]) srcRowA;
          const GLhalfARB (*rowB)[2] = (const GLhalfARB (*)[2]) srcRowB;
@@ -455,12 +569,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
                dst[i][comp] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
             }
          }
-      }
-      return;
-   case MESA_FORMAT_ALPHA_FLOAT32:
-   case MESA_FORMAT_LUMINANCE_FLOAT32:
-   case MESA_FORMAT_INTENSITY_FLOAT32:
-      {
+   }
+   else if (datatype == GL_FLOAT && comps == 1) {
          GLuint i, j, k;
          const GLfloat *rowA = (const GLfloat *) srcRowA;
          const GLfloat *rowB = (const GLfloat *) srcRowB;
@@ -469,12 +579,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
               i++, j += colStride, k += colStride) {
             dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) * 0.25F;
          }
-      }
-      return;
-   case MESA_FORMAT_ALPHA_FLOAT16:
-   case MESA_FORMAT_LUMINANCE_FLOAT16:
-   case MESA_FORMAT_INTENSITY_FLOAT16:
-      {
+   }
+   else if (datatype == GL_HALF_FLOAT_ARB && comps == 1) {
          GLuint i, j, k;
          const GLhalfARB *rowA = (const GLhalfARB *) srcRowA;
          const GLhalfARB *rowB = (const GLhalfARB *) srcRowB;
@@ -488,10 +594,8 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
             bk = _mesa_half_to_float(rowB[k]);
             dst[i] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
          }
-      }
-      return;
-
-   default:
+   }
+   else {
       _mesa_problem(NULL, "bad format in do_row()");
    }
 }
@@ -504,11 +608,11 @@ do_row(const struct gl_texture_format *format, GLint srcWidth,
  */
 
 static void
-make_1d_mipmap(const struct gl_texture_format *format, GLint border,
+make_1d_mipmap(GLenum datatype, GLuint comps, GLint border,
                GLint srcWidth, const GLubyte *srcPtr,
                GLint dstWidth, GLubyte *dstPtr)
 {
-   const GLint bpt = format->TexelBytes;
+   const GLint bpt = bytes_per_pixel(datatype, comps);
    const GLubyte *src;
    GLubyte *dst;
 
@@ -517,7 +621,7 @@ make_1d_mipmap(const struct gl_texture_format *format, GLint border,
    dst = dstPtr + border * bpt;
 
    /* we just duplicate the input row, kind of hack, saves code */
-   do_row(format, srcWidth - 2 * border, src, src,
+   do_row(datatype, comps, srcWidth - 2 * border, src, src,
           dstWidth - 2 * border, dst);
 
    if (border) {
@@ -535,11 +639,11 @@ make_1d_mipmap(const struct gl_texture_format *format, GLint border,
  * XXX need to use the tex image's row stride!
  */
 static void
-make_2d_mipmap(const struct gl_texture_format *format, GLint border,
+make_2d_mipmap(GLenum datatype, GLuint comps, GLint border,
                GLint srcWidth, GLint srcHeight, const GLubyte *srcPtr,
                GLint dstWidth, GLint dstHeight, GLubyte *dstPtr)
 {
-   const GLint bpt = format->TexelBytes;
+   const GLint bpt = bytes_per_pixel(datatype, comps);
    const GLint srcWidthNB = srcWidth - 2 * border;  /* sizes w/out border */
    const GLint dstWidthNB = dstWidth - 2 * border;
    const GLint dstHeightNB = dstHeight - 2 * border;
@@ -558,7 +662,7 @@ make_2d_mipmap(const struct gl_texture_format *format, GLint border,
    dst = dstPtr + border * ((dstWidth + 1) * bpt);
 
    for (row = 0; row < dstHeightNB; row++) {
-      do_row(format, srcWidthNB, srcA, srcB,
+      do_row(datatype, comps, srcWidthNB, srcA, srcB,
              dstWidthNB, dst);
       srcA += 2 * srcRowStride;
       srcB += 2 * srcRowStride;
@@ -580,12 +684,12 @@ make_2d_mipmap(const struct gl_texture_format *format, GLint border,
       MEMCPY(dstPtr + (dstWidth * dstHeight - 1) * bpt,
              srcPtr + (srcWidth * srcHeight - 1) * bpt, bpt);
       /* lower border */
-      do_row(format, srcWidthNB,
+      do_row(datatype, comps, srcWidthNB,
              srcPtr + bpt,
              srcPtr + bpt,
              dstWidthNB, dstPtr + bpt);
       /* upper border */
-      do_row(format, srcWidthNB,
+      do_row(datatype, comps, srcWidthNB,
              srcPtr + (srcWidth * (srcHeight - 1) + 1) * bpt,
              srcPtr + (srcWidth * (srcHeight - 1) + 1) * bpt,
              dstWidthNB,
@@ -603,11 +707,11 @@ make_2d_mipmap(const struct gl_texture_format *format, GLint border,
       else {
          /* average two src pixels each dest pixel */
          for (row = 0; row < dstHeightNB; row += 2) {
-            do_row(format, 1,
+            do_row(datatype, comps, 1,
                    srcPtr + (srcWidth * (row * 2 + 1)) * bpt,
                    srcPtr + (srcWidth * (row * 2 + 2)) * bpt,
                    1, dstPtr + (dstWidth * row + 1) * bpt);
-            do_row(format, 1,
+            do_row(datatype, comps, 1,
                    srcPtr + (srcWidth * (row * 2 + 1) + srcWidth - 1) * bpt,
                    srcPtr + (srcWidth * (row * 2 + 2) + srcWidth - 1) * bpt,
                    1, dstPtr + (dstWidth * row + 1 + dstWidth - 1) * bpt);
@@ -618,13 +722,13 @@ make_2d_mipmap(const struct gl_texture_format *format, GLint border,
 
 
 static void
-make_3d_mipmap(const struct gl_texture_format *format, GLint border,
+make_3d_mipmap(GLenum datatype, GLuint comps, GLint border,
                GLint srcWidth, GLint srcHeight, GLint srcDepth,
                const GLubyte *srcPtr,
                GLint dstWidth, GLint dstHeight, GLint dstDepth,
                GLubyte *dstPtr)
 {
-   const GLint bpt = format->TexelBytes;
+   const GLint bpt = bytes_per_pixel(datatype, comps);
    const GLint srcWidthNB = srcWidth - 2 * border;  /* sizes w/out border */
    const GLint srcDepthNB = srcDepth - 2 * border;
    const GLint dstWidthNB = dstWidth - 2 * border;
@@ -694,13 +798,13 @@ make_3d_mipmap(const struct gl_texture_format *format, GLint border,
 
       for (row = 0; row < dstHeightNB; row++) {
          /* Average together two rows from first src image */
-         do_row(format, srcWidthNB, srcImgARowA, srcImgARowB,
+         do_row(datatype, comps, srcWidthNB, srcImgARowA, srcImgARowB,
                 srcWidthNB, tmpRowA);
          /* Average together two rows from second src image */
-         do_row(format, srcWidthNB, srcImgBRowA, srcImgBRowB,
+         do_row(datatype, comps, srcWidthNB, srcImgBRowA, srcImgBRowB,
                 srcWidthNB, tmpRowB);
          /* Average together the temp rows to make the final row */
-         do_row(format, srcWidthNB, tmpRowA, tmpRowB,
+         do_row(datatype, comps, srcWidthNB, tmpRowA, tmpRowB,
                 dstWidthNB, dstImgRow);
          /* advance to next rows */
          srcImgARowA += bytesPerSrcRow + srcRowOffset;
@@ -717,10 +821,10 @@ make_3d_mipmap(const struct gl_texture_format *format, GLint border,
    /* Luckily we can leverage the make_2d_mipmap() function here! */
    if (border > 0) {
       /* do front border image */
-      make_2d_mipmap(format, 1, srcWidth, srcHeight, srcPtr,
+      make_2d_mipmap(datatype, comps, 1, srcWidth, srcHeight, srcPtr,
                      dstWidth, dstHeight, dstPtr);
       /* do back border image */
-      make_2d_mipmap(format, 1, srcWidth, srcHeight,
+      make_2d_mipmap(datatype, comps, 1, srcWidth, srcHeight,
                      srcPtr + bytesPerSrcImage * (srcDepth - 1),
                      dstWidth, dstHeight,
                      dstPtr + bytesPerDstImage * (dstDepth - 1));
@@ -768,28 +872,28 @@ make_3d_mipmap(const struct gl_texture_format *format, GLint border,
             /* do border along [img][row=0][col=0] */
             src = srcPtr + (img * 2 + 1) * bytesPerSrcImage;
             dst = dstPtr + (img + 1) * bytesPerDstImage;
-            do_row(format, 1, src, src + srcImageOffset, 1, dst);
+            do_row(datatype, comps, 1, src, src + srcImageOffset, 1, dst);
 
             /* do border along [img][row=dstHeight-1][col=0] */
             src = srcPtr + (img * 2 + 1) * bytesPerSrcImage
                          + (srcHeight - 1) * bytesPerSrcRow;
             dst = dstPtr + (img + 1) * bytesPerDstImage
                          + (dstHeight - 1) * bytesPerDstRow;
-            do_row(format, 1, src, src + srcImageOffset, 1, dst);
+            do_row(datatype, comps, 1, src, src + srcImageOffset, 1, dst);
 
             /* do border along [img][row=0][col=dstWidth-1] */
             src = srcPtr + (img * 2 + 1) * bytesPerSrcImage
                          + (srcWidth - 1) * bpt;
             dst = dstPtr + (img + 1) * bytesPerDstImage
                          + (dstWidth - 1) * bpt;
-            do_row(format, 1, src, src + srcImageOffset, 1, dst);
+            do_row(datatype, comps, 1, src, src + srcImageOffset, 1, dst);
 
             /* do border along [img][row=dstHeight-1][col=dstWidth-1] */
             src = srcPtr + (img * 2 + 1) * bytesPerSrcImage
                          + (bytesPerSrcImage - bpt);
             dst = dstPtr + (img + 1) * bytesPerDstImage
                          + (bytesPerDstImage - bpt);
-            do_row(format, 1, src, src + srcImageOffset, 1, dst);
+            do_row(datatype, comps, 1, src, src + srcImageOffset, 1, dst);
          }
       }
    }
@@ -797,11 +901,11 @@ make_3d_mipmap(const struct gl_texture_format *format, GLint border,
 
 
 static void
-make_1d_stack_mipmap(const struct gl_texture_format *format, GLint border,
+make_1d_stack_mipmap(GLenum datatype, GLuint comps, GLint border,
                      GLint srcWidth, const GLubyte *srcPtr,
                      GLint dstWidth, GLint dstHeight, GLubyte *dstPtr)
 {
-   const GLint bpt = format->TexelBytes;
+   const GLint bpt = bytes_per_pixel(datatype, comps);
    const GLint srcWidthNB = srcWidth - 2 * border;  /* sizes w/out border */
    const GLint dstWidthNB = dstWidth - 2 * border;
    const GLint dstHeightNB = dstHeight - 2 * border;
@@ -816,7 +920,7 @@ make_1d_stack_mipmap(const struct gl_texture_format *format, GLint border,
    dst = dstPtr + border * ((dstWidth + 1) * bpt);
 
    for (row = 0; row < dstHeightNB; row++) {
-      do_row(format, srcWidthNB, src, src,
+      do_row(datatype, comps, srcWidthNB, src, src,
              dstWidthNB, dst);
       src += srcRowStride;
       dst += dstRowStride;
@@ -839,12 +943,12 @@ make_1d_stack_mipmap(const struct gl_texture_format *format, GLint border,
  * and \c make_2d_mipmap.
  */
 static void
-make_2d_stack_mipmap(const struct gl_texture_format *format, GLint border,
+make_2d_stack_mipmap(GLenum datatype, GLuint comps, GLint border,
                      GLint srcWidth, GLint srcHeight, const GLubyte *srcPtr,
                      GLint dstWidth, GLint dstHeight, GLint dstDepth,
                      GLubyte *dstPtr)
 {
-   const GLint bpt = format->TexelBytes;
+   const GLint bpt = bytes_per_pixel(datatype, comps);
    const GLint srcWidthNB = srcWidth - 2 * border;  /* sizes w/out border */
    const GLint dstWidthNB = dstWidth - 2 * border;
    const GLint dstHeightNB = dstHeight - 2 * border;
@@ -866,7 +970,7 @@ make_2d_stack_mipmap(const struct gl_texture_format *format, GLint border,
 
    for (layer = 0; layer < dstDepthNB; layer++) {
       for (row = 0; row < dstHeightNB; row++) {
-         do_row(format, srcWidthNB, srcA, srcB,
+         do_row(datatype, comps, srcWidthNB, srcA, srcB,
                 dstWidthNB, dst);
          srcA += 2 * srcRowStride;
          srcB += 2 * srcRowStride;
@@ -888,12 +992,12 @@ make_2d_stack_mipmap(const struct gl_texture_format *format, GLint border,
          MEMCPY(dstPtr + (dstWidth * dstHeight - 1) * bpt,
                 srcPtr + (srcWidth * srcHeight - 1) * bpt, bpt);
          /* lower border */
-         do_row(format, srcWidthNB,
+         do_row(datatype, comps, srcWidthNB,
                 srcPtr + bpt,
                 srcPtr + bpt,
                 dstWidthNB, dstPtr + bpt);
          /* upper border */
-         do_row(format, srcWidthNB,
+         do_row(datatype, comps, srcWidthNB,
                 srcPtr + (srcWidth * (srcHeight - 1) + 1) * bpt,
                 srcPtr + (srcWidth * (srcHeight - 1) + 1) * bpt,
                 dstWidthNB,
@@ -911,11 +1015,11 @@ make_2d_stack_mipmap(const struct gl_texture_format *format, GLint border,
          else {
             /* average two src pixels each dest pixel */
             for (row = 0; row < dstHeightNB; row += 2) {
-               do_row(format, 1,
+               do_row(datatype, comps, 1,
                       srcPtr + (srcWidth * (row * 2 + 1)) * bpt,
                       srcPtr + (srcWidth * (row * 2 + 2)) * bpt,
                       1, dstPtr + (dstWidth * row + 1) * bpt);
-               do_row(format, 1,
+               do_row(datatype, comps, 1,
                       srcPtr + (srcWidth * (row * 2 + 1) + srcWidth - 1) * bpt,
                       srcPtr + (srcWidth * (row * 2 + 2) + srcWidth - 1) * bpt,
                       1, dstPtr + (dstWidth * row + 1 + dstWidth - 1) * bpt);
@@ -940,6 +1044,8 @@ _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
    const GLubyte *srcData = NULL;
    GLubyte *dstData = NULL;
    GLint level, maxLevels;
+   GLenum datatype;
+   GLuint comps;
 
    ASSERT(texObj);
    /* XXX choose cube map face here??? */
@@ -1002,6 +1108,8 @@ _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
       convertFormat = srcImage->TexFormat;
    }
 
+   mesa_format_to_type_and_comps(convertFormat, &datatype, &comps);
+
    for (level = texObj->BaseLevel; level < texObj->MaxLevel
            && level < maxLevels - 1; level++) {
       /* generate image[level+1] from image[level] */
@@ -1118,7 +1226,7 @@ _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
        */
       switch (target) {
          case GL_TEXTURE_1D:
-            make_1d_mipmap(convertFormat, border,
+            make_1d_mipmap(datatype, comps, border,
                            srcWidth, srcData,
                            dstWidth, dstData);
             break;
@@ -1129,22 +1237,22 @@ _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
          case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y_ARB:
          case GL_TEXTURE_CUBE_MAP_POSITIVE_Z_ARB:
          case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB:
-            make_2d_mipmap(convertFormat, border,
+            make_2d_mipmap(datatype, comps, border,
                            srcWidth, srcHeight, srcData,
                            dstWidth, dstHeight, dstData);
             break;
          case GL_TEXTURE_3D:
-            make_3d_mipmap(convertFormat, border,
+            make_3d_mipmap(datatype, comps, border,
                            srcWidth, srcHeight, srcDepth, srcData,
                            dstWidth, dstHeight, dstDepth, dstData);
             break;
          case GL_TEXTURE_1D_ARRAY_EXT:
-            make_1d_stack_mipmap(convertFormat, border,
+            make_1d_stack_mipmap(datatype, comps, border,
                                  srcWidth, srcData,
                                  dstWidth, dstHeight, dstData);
             break;
          case GL_TEXTURE_2D_ARRAY_EXT:
-            make_2d_stack_mipmap(convertFormat, border,
+            make_2d_stack_mipmap(datatype, comps, border,
                                  srcWidth, srcHeight, srcData,
                                  dstWidth, dstHeight, dstDepth, dstData);
             break;
-- 
cgit v1.2.3


From 9b3b230bf6b5833df65d706b68e887ae3bdcf950 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 16:38:28 -0700
Subject: checkpoint- consolidation in do_row()

---
 src/mesa/main/mipmap.c | 42 ++++++++++++++++--------------------------
 1 file changed, 16 insertions(+), 26 deletions(-)

diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 013dc3752e..c5f1c5bcbe 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -234,11 +234,11 @@ do_row(GLenum datatype, GLuint comps, GLint srcWidth,
    assert(srcWidth == dstWidth || srcWidth == 2 * dstWidth);
    */
 
-   if (datatype == CHAN_TYPE && comps == 4) {
+   if (datatype == GL_UNSIGNED_SHORT && comps == 4) {
          GLuint i, j, k;
-         const GLchan (*rowA)[4] = (const GLchan (*)[4]) srcRowA;
-         const GLchan (*rowB)[4] = (const GLchan (*)[4]) srcRowB;
-         GLchan (*dst)[4] = (GLchan (*)[4]) dstRow;
+         const GLushort (*rowA)[4] = (const GLushort (*)[4]) srcRowA;
+         const GLushort (*rowB)[4] = (const GLushort (*)[4]) srcRowB;
+         GLushort (*dst)[4] = (GLushort (*)[4]) dstRow;
          for (i = j = 0, k = k0; i < (GLuint) dstWidth;
               i++, j += colStride, k += colStride) {
             dst[i][0] = (rowA[j][0] + rowA[k][0] +
@@ -251,11 +251,11 @@ do_row(GLenum datatype, GLuint comps, GLint srcWidth,
                          rowB[j][3] + rowB[k][3]) / 4;
          }
    }
-   else if (datatype == CHAN_TYPE && comps == 3) {
+   else if (datatype == GL_UNSIGNED_SHORT && comps == 3) {
          GLuint i, j, k;
-         const GLchan (*rowA)[3] = (const GLchan (*)[3]) srcRowA;
-         const GLchan (*rowB)[3] = (const GLchan (*)[3]) srcRowB;
-         GLchan (*dst)[3] = (GLchan (*)[3]) dstRow;
+         const GLushort (*rowA)[3] = (const GLushort (*)[3]) srcRowA;
+         const GLushort (*rowB)[3] = (const GLushort (*)[3]) srcRowB;
+         GLushort (*dst)[3] = (GLushort (*)[3]) dstRow;
          for (i = j = 0, k = k0; i < (GLuint) dstWidth;
               i++, j += colStride, k += colStride) {
             dst[i][0] = (rowA[j][0] + rowA[k][0] +
@@ -266,21 +266,21 @@ do_row(GLenum datatype, GLuint comps, GLint srcWidth,
                          rowB[j][2] + rowB[k][2]) / 4;
          }
    }
-   else if (datatype == CHAN_TYPE && comps == 1) {
+   else if (datatype == GL_UNSIGNED_SHORT && comps == 1) {
          GLuint i, j, k;
-         const GLchan *rowA = (const GLchan *) srcRowA;
-         const GLchan *rowB = (const GLchan *) srcRowB;
-         GLchan *dst = (GLchan *) dstRow;
+         const GLushort *rowA = (const GLushort *) srcRowA;
+         const GLushort *rowB = (const GLushort *) srcRowB;
+         GLushort *dst = (GLushort *) dstRow;
          for (i = j = 0, k = k0; i < (GLuint) dstWidth;
               i++, j += colStride, k += colStride) {
             dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) / 4;
          }
    }
-   else if (datatype == CHAN_TYPE && comps == 2) {
+   else if (datatype == GL_UNSIGNED_SHORT && comps == 2) {
          GLuint i, j, k;
-         const GLchan (*rowA)[2] = (const GLchan (*)[2]) srcRowA;
-         const GLchan (*rowB)[2] = (const GLchan (*)[2]) srcRowB;
-         GLchan (*dst)[2] = (GLchan (*)[2]) dstRow;
+         const GLushort (*rowA)[2] = (const GLushort (*)[2]) srcRowA;
+         const GLushort (*rowB)[2] = (const GLushort (*)[2]) srcRowB;
+         GLushort (*dst)[2] = (GLushort (*)[2]) dstRow;
          for (i = j = 0, k = k0; i < (GLuint) dstWidth;
               i++, j += colStride, k += colStride) {
             dst[i][0] = (rowA[j][0] + rowA[k][0] +
@@ -299,16 +299,6 @@ do_row(GLenum datatype, GLuint comps, GLint srcWidth,
             dst[i] = rowA[j] / 4 + rowA[k] / 4 + rowB[j] / 4 + rowB[k] / 4;
          }
    }
-   else if (datatype == GL_UNSIGNED_SHORT && comps == 1) {
-         GLuint i, j, k;
-         const GLushort *rowA = (const GLushort *) srcRowA;
-         const GLushort *rowB = (const GLushort *) srcRowB;
-         GLushort *dst = (GLushort *) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) / 4;
-         }
-   }
    else if (datatype == GL_UNSIGNED_BYTE && comps == 4) {
          GLuint i, j, k;
          const GLubyte (*rowA)[4] = (const GLubyte (*)[4]) srcRowA;
-- 
cgit v1.2.3


From 4bf4f6e029b85d9eb90f5649fc5635cf274e1fe1 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 16:40:39 -0700
Subject: re-indent do_row()

---
 src/mesa/main/mipmap.c | 594 ++++++++++++++++++++++++-------------------------
 1 file changed, 288 insertions(+), 306 deletions(-)

diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index c5f1c5bcbe..22c7530e83 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -235,355 +235,337 @@ do_row(GLenum datatype, GLuint comps, GLint srcWidth,
    */
 
    if (datatype == GL_UNSIGNED_SHORT && comps == 4) {
-         GLuint i, j, k;
-         const GLushort (*rowA)[4] = (const GLushort (*)[4]) srcRowA;
-         const GLushort (*rowB)[4] = (const GLushort (*)[4]) srcRowB;
-         GLushort (*dst)[4] = (GLushort (*)[4]) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            dst[i][0] = (rowA[j][0] + rowA[k][0] +
-                         rowB[j][0] + rowB[k][0]) / 4;
-            dst[i][1] = (rowA[j][1] + rowA[k][1] +
-                         rowB[j][1] + rowB[k][1]) / 4;
-            dst[i][2] = (rowA[j][2] + rowA[k][2] +
-                         rowB[j][2] + rowB[k][2]) / 4;
-            dst[i][3] = (rowA[j][3] + rowA[k][3] +
-                         rowB[j][3] + rowB[k][3]) / 4;
-         }
+      GLuint i, j, k;
+      const GLushort(*rowA)[4] = (const GLushort(*)[4]) srcRowA;
+      const GLushort(*rowB)[4] = (const GLushort(*)[4]) srcRowB;
+      GLushort(*dst)[4] = (GLushort(*)[4]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) / 4;
+         dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) / 4;
+         dst[i][2] = (rowA[j][2] + rowA[k][2] + rowB[j][2] + rowB[k][2]) / 4;
+         dst[i][3] = (rowA[j][3] + rowA[k][3] + rowB[j][3] + rowB[k][3]) / 4;
+      }
    }
    else if (datatype == GL_UNSIGNED_SHORT && comps == 3) {
-         GLuint i, j, k;
-         const GLushort (*rowA)[3] = (const GLushort (*)[3]) srcRowA;
-         const GLushort (*rowB)[3] = (const GLushort (*)[3]) srcRowB;
-         GLushort (*dst)[3] = (GLushort (*)[3]) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            dst[i][0] = (rowA[j][0] + rowA[k][0] +
-                         rowB[j][0] + rowB[k][0]) / 4;
-            dst[i][1] = (rowA[j][1] + rowA[k][1] +
-                         rowB[j][1] + rowB[k][1]) / 4;
-            dst[i][2] = (rowA[j][2] + rowA[k][2] +
-                         rowB[j][2] + rowB[k][2]) / 4;
-         }
+      GLuint i, j, k;
+      const GLushort(*rowA)[3] = (const GLushort(*)[3]) srcRowA;
+      const GLushort(*rowB)[3] = (const GLushort(*)[3]) srcRowB;
+      GLushort(*dst)[3] = (GLushort(*)[3]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) / 4;
+         dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) / 4;
+         dst[i][2] = (rowA[j][2] + rowA[k][2] + rowB[j][2] + rowB[k][2]) / 4;
+      }
    }
    else if (datatype == GL_UNSIGNED_SHORT && comps == 1) {
-         GLuint i, j, k;
-         const GLushort *rowA = (const GLushort *) srcRowA;
-         const GLushort *rowB = (const GLushort *) srcRowB;
-         GLushort *dst = (GLushort *) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) / 4;
-         }
+      GLuint i, j, k;
+      const GLushort *rowA = (const GLushort *) srcRowA;
+      const GLushort *rowB = (const GLushort *) srcRowB;
+      GLushort *dst = (GLushort *) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) / 4;
+      }
    }
    else if (datatype == GL_UNSIGNED_SHORT && comps == 2) {
-         GLuint i, j, k;
-         const GLushort (*rowA)[2] = (const GLushort (*)[2]) srcRowA;
-         const GLushort (*rowB)[2] = (const GLushort (*)[2]) srcRowB;
-         GLushort (*dst)[2] = (GLushort (*)[2]) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            dst[i][0] = (rowA[j][0] + rowA[k][0] +
-                         rowB[j][0] + rowB[k][0]) / 4;
-            dst[i][1] = (rowA[j][1] + rowA[k][1] +
-                         rowB[j][1] + rowB[k][1]) / 4;
-         }
+      GLuint i, j, k;
+      const GLushort(*rowA)[2] = (const GLushort(*)[2]) srcRowA;
+      const GLushort(*rowB)[2] = (const GLushort(*)[2]) srcRowB;
+      GLushort(*dst)[2] = (GLushort(*)[2]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) / 4;
+         dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) / 4;
+      }
    }
    else if (datatype == GL_UNSIGNED_INT && comps == 1) {
-         GLuint i, j, k;
-         const GLuint *rowA = (const GLuint *) srcRowA;
-         const GLuint *rowB = (const GLuint *) srcRowB;
-         GLfloat *dst = (GLfloat *) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            dst[i] = rowA[j] / 4 + rowA[k] / 4 + rowB[j] / 4 + rowB[k] / 4;
-         }
+      GLuint i, j, k;
+      const GLuint *rowA = (const GLuint *) srcRowA;
+      const GLuint *rowB = (const GLuint *) srcRowB;
+      GLfloat *dst = (GLfloat *) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i] = rowA[j] / 4 + rowA[k] / 4 + rowB[j] / 4 + rowB[k] / 4;
+      }
    }
    else if (datatype == GL_UNSIGNED_BYTE && comps == 4) {
-         GLuint i, j, k;
-         const GLubyte (*rowA)[4] = (const GLubyte (*)[4]) srcRowA;
-         const GLubyte (*rowB)[4] = (const GLubyte (*)[4]) srcRowB;
-         GLubyte (*dst)[4] = (GLubyte (*)[4]) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            dst[i][0] = (rowA[j][0] + rowA[k][0] +
-                         rowB[j][0] + rowB[k][0]) / 4;
-            dst[i][1] = (rowA[j][1] + rowA[k][1] +
-                         rowB[j][1] + rowB[k][1]) / 4;
-            dst[i][2] = (rowA[j][2] + rowA[k][2] +
-                         rowB[j][2] + rowB[k][2]) / 4;
-            dst[i][3] = (rowA[j][3] + rowA[k][3] +
-                         rowB[j][3] + rowB[k][3]) / 4;
-         }
+      GLuint i, j, k;
+      const GLubyte(*rowA)[4] = (const GLubyte(*)[4]) srcRowA;
+      const GLubyte(*rowB)[4] = (const GLubyte(*)[4]) srcRowB;
+      GLubyte(*dst)[4] = (GLubyte(*)[4]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) / 4;
+         dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) / 4;
+         dst[i][2] = (rowA[j][2] + rowA[k][2] + rowB[j][2] + rowB[k][2]) / 4;
+         dst[i][3] = (rowA[j][3] + rowA[k][3] + rowB[j][3] + rowB[k][3]) / 4;
+      }
    }
    else if (datatype == GL_UNSIGNED_BYTE && comps == 3) {
-         GLuint i, j, k;
-         const GLubyte (*rowA)[3] = (const GLubyte (*)[3]) srcRowA;
-         const GLubyte (*rowB)[3] = (const GLubyte (*)[3]) srcRowB;
-         GLubyte (*dst)[3] = (GLubyte (*)[3]) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            dst[i][0] = (rowA[j][0] + rowA[k][0] +
-                         rowB[j][0] + rowB[k][0]) / 4;
-            dst[i][1] = (rowA[j][1] + rowA[k][1] +
-                         rowB[j][1] + rowB[k][1]) / 4;
-            dst[i][2] = (rowA[j][2] + rowA[k][2] +
-                         rowB[j][2] + rowB[k][2]) / 4;
-         }
+      GLuint i, j, k;
+      const GLubyte(*rowA)[3] = (const GLubyte(*)[3]) srcRowA;
+      const GLubyte(*rowB)[3] = (const GLubyte(*)[3]) srcRowB;
+      GLubyte(*dst)[3] = (GLubyte(*)[3]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) / 4;
+         dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) / 4;
+         dst[i][2] = (rowA[j][2] + rowA[k][2] + rowB[j][2] + rowB[k][2]) / 4;
+      }
    }
    else if (datatype == GL_UNSIGNED_SHORT_5_6_5 && comps == 3) {
-         GLuint i, j, k;
-         const GLushort *rowA = (const GLushort *) srcRowA;
-         const GLushort *rowB = (const GLushort *) srcRowB;
-         GLushort *dst = (GLushort *) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            const GLint rowAr0 = rowA[j] & 0x1f;
-            const GLint rowAr1 = rowA[k] & 0x1f;
-            const GLint rowBr0 = rowB[j] & 0x1f;
-            const GLint rowBr1 = rowB[k] & 0x1f;
-            const GLint rowAg0 = (rowA[j] >> 5) & 0x3f;
-            const GLint rowAg1 = (rowA[k] >> 5) & 0x3f;
-            const GLint rowBg0 = (rowB[j] >> 5) & 0x3f;
-            const GLint rowBg1 = (rowB[k] >> 5) & 0x3f;
-            const GLint rowAb0 = (rowA[j] >> 11) & 0x1f;
-            const GLint rowAb1 = (rowA[k] >> 11) & 0x1f;
-            const GLint rowBb0 = (rowB[j] >> 11) & 0x1f;
-            const GLint rowBb1 = (rowB[k] >> 11) & 0x1f;
-            const GLint red   = (rowAr0 + rowAr1 + rowBr0 + rowBr1) >> 2;
-            const GLint green = (rowAg0 + rowAg1 + rowBg0 + rowBg1) >> 2;
-            const GLint blue  = (rowAb0 + rowAb1 + rowBb0 + rowBb1) >> 2;
-            dst[i] = (blue << 11) | (green << 5) | red;
-         }
+      GLuint i, j, k;
+      const GLushort *rowA = (const GLushort *) srcRowA;
+      const GLushort *rowB = (const GLushort *) srcRowB;
+      GLushort *dst = (GLushort *) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         const GLint rowAr0 = rowA[j] & 0x1f;
+         const GLint rowAr1 = rowA[k] & 0x1f;
+         const GLint rowBr0 = rowB[j] & 0x1f;
+         const GLint rowBr1 = rowB[k] & 0x1f;
+         const GLint rowAg0 = (rowA[j] >> 5) & 0x3f;
+         const GLint rowAg1 = (rowA[k] >> 5) & 0x3f;
+         const GLint rowBg0 = (rowB[j] >> 5) & 0x3f;
+         const GLint rowBg1 = (rowB[k] >> 5) & 0x3f;
+         const GLint rowAb0 = (rowA[j] >> 11) & 0x1f;
+         const GLint rowAb1 = (rowA[k] >> 11) & 0x1f;
+         const GLint rowBb0 = (rowB[j] >> 11) & 0x1f;
+         const GLint rowBb1 = (rowB[k] >> 11) & 0x1f;
+         const GLint red = (rowAr0 + rowAr1 + rowBr0 + rowBr1) >> 2;
+         const GLint green = (rowAg0 + rowAg1 + rowBg0 + rowBg1) >> 2;
+         const GLint blue = (rowAb0 + rowAb1 + rowBb0 + rowBb1) >> 2;
+         dst[i] = (blue << 11) | (green << 5) | red;
+      }
    }
    else if (datatype == GL_UNSIGNED_SHORT_4_4_4_4 && comps == 4) {
-         GLuint i, j, k;
-         const GLushort *rowA = (const GLushort *) srcRowA;
-         const GLushort *rowB = (const GLushort *) srcRowB;
-         GLushort *dst = (GLushort *) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            const GLint rowAr0 = rowA[j] & 0xf;
-            const GLint rowAr1 = rowA[k] & 0xf;
-            const GLint rowBr0 = rowB[j] & 0xf;
-            const GLint rowBr1 = rowB[k] & 0xf;
-            const GLint rowAg0 = (rowA[j] >> 4) & 0xf;
-            const GLint rowAg1 = (rowA[k] >> 4) & 0xf;
-            const GLint rowBg0 = (rowB[j] >> 4) & 0xf;
-            const GLint rowBg1 = (rowB[k] >> 4) & 0xf;
-            const GLint rowAb0 = (rowA[j] >> 8) & 0xf;
-            const GLint rowAb1 = (rowA[k] >> 8) & 0xf;
-            const GLint rowBb0 = (rowB[j] >> 8) & 0xf;
-            const GLint rowBb1 = (rowB[k] >> 8) & 0xf;
-            const GLint rowAa0 = (rowA[j] >> 12) & 0xf;
-            const GLint rowAa1 = (rowA[k] >> 12) & 0xf;
-            const GLint rowBa0 = (rowB[j] >> 12) & 0xf;
-            const GLint rowBa1 = (rowB[k] >> 12) & 0xf;
-            const GLint red   = (rowAr0 + rowAr1 + rowBr0 + rowBr1) >> 2;
-            const GLint green = (rowAg0 + rowAg1 + rowBg0 + rowBg1) >> 2;
-            const GLint blue  = (rowAb0 + rowAb1 + rowBb0 + rowBb1) >> 2;
-            const GLint alpha = (rowAa0 + rowAa1 + rowBa0 + rowBa1) >> 2;
-            dst[i] = (alpha << 12) | (blue << 8) | (green << 4) | red;
-         }
+      GLuint i, j, k;
+      const GLushort *rowA = (const GLushort *) srcRowA;
+      const GLushort *rowB = (const GLushort *) srcRowB;
+      GLushort *dst = (GLushort *) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         const GLint rowAr0 = rowA[j] & 0xf;
+         const GLint rowAr1 = rowA[k] & 0xf;
+         const GLint rowBr0 = rowB[j] & 0xf;
+         const GLint rowBr1 = rowB[k] & 0xf;
+         const GLint rowAg0 = (rowA[j] >> 4) & 0xf;
+         const GLint rowAg1 = (rowA[k] >> 4) & 0xf;
+         const GLint rowBg0 = (rowB[j] >> 4) & 0xf;
+         const GLint rowBg1 = (rowB[k] >> 4) & 0xf;
+         const GLint rowAb0 = (rowA[j] >> 8) & 0xf;
+         const GLint rowAb1 = (rowA[k] >> 8) & 0xf;
+         const GLint rowBb0 = (rowB[j] >> 8) & 0xf;
+         const GLint rowBb1 = (rowB[k] >> 8) & 0xf;
+         const GLint rowAa0 = (rowA[j] >> 12) & 0xf;
+         const GLint rowAa1 = (rowA[k] >> 12) & 0xf;
+         const GLint rowBa0 = (rowB[j] >> 12) & 0xf;
+         const GLint rowBa1 = (rowB[k] >> 12) & 0xf;
+         const GLint red = (rowAr0 + rowAr1 + rowBr0 + rowBr1) >> 2;
+         const GLint green = (rowAg0 + rowAg1 + rowBg0 + rowBg1) >> 2;
+         const GLint blue = (rowAb0 + rowAb1 + rowBb0 + rowBb1) >> 2;
+         const GLint alpha = (rowAa0 + rowAa1 + rowBa0 + rowBa1) >> 2;
+         dst[i] = (alpha << 12) | (blue << 8) | (green << 4) | red;
+      }
    }
    else if (datatype == GL_UNSIGNED_SHORT_1_5_5_5_REV && comps == 4) {
-         GLuint i, j, k;
-         const GLushort *rowA = (const GLushort *) srcRowA;
-         const GLushort *rowB = (const GLushort *) srcRowB;
-         GLushort *dst = (GLushort *) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            const GLint rowAr0 = rowA[j] & 0x1f;
-            const GLint rowAr1 = rowA[k] & 0x1f;
-            const GLint rowBr0 = rowB[j] & 0x1f;
-            const GLint rowBr1 = rowB[k] & 0xf;
-            const GLint rowAg0 = (rowA[j] >> 5) & 0x1f;
-            const GLint rowAg1 = (rowA[k] >> 5) & 0x1f;
-            const GLint rowBg0 = (rowB[j] >> 5) & 0x1f;
-            const GLint rowBg1 = (rowB[k] >> 5) & 0x1f;
-            const GLint rowAb0 = (rowA[j] >> 10) & 0x1f;
-            const GLint rowAb1 = (rowA[k] >> 10) & 0x1f;
-            const GLint rowBb0 = (rowB[j] >> 10) & 0x1f;
-            const GLint rowBb1 = (rowB[k] >> 10) & 0x1f;
-            const GLint rowAa0 = (rowA[j] >> 15) & 0x1;
-            const GLint rowAa1 = (rowA[k] >> 15) & 0x1;
-            const GLint rowBa0 = (rowB[j] >> 15) & 0x1;
-            const GLint rowBa1 = (rowB[k] >> 15) & 0x1;
-            const GLint red   = (rowAr0 + rowAr1 + rowBr0 + rowBr1) >> 2;
-            const GLint green = (rowAg0 + rowAg1 + rowBg0 + rowBg1) >> 2;
-            const GLint blue  = (rowAb0 + rowAb1 + rowBb0 + rowBb1) >> 2;
-            const GLint alpha = (rowAa0 + rowAa1 + rowBa0 + rowBa1) >> 2;
-            dst[i] = (alpha << 15) | (blue << 10) | (green << 5) | red;
-         }
+      GLuint i, j, k;
+      const GLushort *rowA = (const GLushort *) srcRowA;
+      const GLushort *rowB = (const GLushort *) srcRowB;
+      GLushort *dst = (GLushort *) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         const GLint rowAr0 = rowA[j] & 0x1f;
+         const GLint rowAr1 = rowA[k] & 0x1f;
+         const GLint rowBr0 = rowB[j] & 0x1f;
+         const GLint rowBr1 = rowB[k] & 0xf;
+         const GLint rowAg0 = (rowA[j] >> 5) & 0x1f;
+         const GLint rowAg1 = (rowA[k] >> 5) & 0x1f;
+         const GLint rowBg0 = (rowB[j] >> 5) & 0x1f;
+         const GLint rowBg1 = (rowB[k] >> 5) & 0x1f;
+         const GLint rowAb0 = (rowA[j] >> 10) & 0x1f;
+         const GLint rowAb1 = (rowA[k] >> 10) & 0x1f;
+         const GLint rowBb0 = (rowB[j] >> 10) & 0x1f;
+         const GLint rowBb1 = (rowB[k] >> 10) & 0x1f;
+         const GLint rowAa0 = (rowA[j] >> 15) & 0x1;
+         const GLint rowAa1 = (rowA[k] >> 15) & 0x1;
+         const GLint rowBa0 = (rowB[j] >> 15) & 0x1;
+         const GLint rowBa1 = (rowB[k] >> 15) & 0x1;
+         const GLint red = (rowAr0 + rowAr1 + rowBr0 + rowBr1) >> 2;
+         const GLint green = (rowAg0 + rowAg1 + rowBg0 + rowBg1) >> 2;
+         const GLint blue = (rowAb0 + rowAb1 + rowBb0 + rowBb1) >> 2;
+         const GLint alpha = (rowAa0 + rowAa1 + rowBa0 + rowBa1) >> 2;
+         dst[i] = (alpha << 15) | (blue << 10) | (green << 5) | red;
+      }
    }
    else if (datatype == GL_UNSIGNED_BYTE && comps == 2) {
-         GLuint i, j, k;
-         const GLubyte (*rowA)[2] = (const GLubyte (*)[2]) srcRowA;
-         const GLubyte (*rowB)[2] = (const GLubyte (*)[2]) srcRowB;
-         GLubyte (*dst)[2] = (GLubyte (*)[2]) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            dst[i][0] = (rowA[j][0] + rowA[k][0] +
-                         rowB[j][0] + rowB[k][0]) >> 2;
-            dst[i][1] = (rowA[j][1] + rowA[k][1] +
-                         rowB[j][1] + rowB[k][1]) >> 2;
-         }
+      GLuint i, j, k;
+      const GLubyte(*rowA)[2] = (const GLubyte(*)[2]) srcRowA;
+      const GLubyte(*rowB)[2] = (const GLubyte(*)[2]) srcRowB;
+      GLubyte(*dst)[2] = (GLubyte(*)[2]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) >> 2;
+         dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) >> 2;
+      }
    }
    else if (datatype == GL_UNSIGNED_BYTE_3_3_2 && comps == 3) {
-         GLuint i, j, k;
-         const GLubyte *rowA = (const GLubyte *) srcRowA;
-         const GLubyte *rowB = (const GLubyte *) srcRowB;
-         GLubyte *dst = (GLubyte *) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            const GLint rowAr0 = rowA[j] & 0x3;
-            const GLint rowAr1 = rowA[k] & 0x3;
-            const GLint rowBr0 = rowB[j] & 0x3;
-            const GLint rowBr1 = rowB[k] & 0x3;
-            const GLint rowAg0 = (rowA[j] >> 2) & 0x7;
-            const GLint rowAg1 = (rowA[k] >> 2) & 0x7;
-            const GLint rowBg0 = (rowB[j] >> 2) & 0x7;
-            const GLint rowBg1 = (rowB[k] >> 2) & 0x7;
-            const GLint rowAb0 = (rowA[j] >> 5) & 0x7;
-            const GLint rowAb1 = (rowA[k] >> 5) & 0x7;
-            const GLint rowBb0 = (rowB[j] >> 5) & 0x7;
-            const GLint rowBb1 = (rowB[k] >> 5) & 0x7;
-            const GLint red   = (rowAr0 + rowAr1 + rowBr0 + rowBr1) >> 2;
-            const GLint green = (rowAg0 + rowAg1 + rowBg0 + rowBg1) >> 2;
-            const GLint blue  = (rowAb0 + rowAb1 + rowBb0 + rowBb1) >> 2;
-            dst[i] = (blue << 5) | (green << 2) | red;
-         }
+      GLuint i, j, k;
+      const GLubyte *rowA = (const GLubyte *) srcRowA;
+      const GLubyte *rowB = (const GLubyte *) srcRowB;
+      GLubyte *dst = (GLubyte *) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         const GLint rowAr0 = rowA[j] & 0x3;
+         const GLint rowAr1 = rowA[k] & 0x3;
+         const GLint rowBr0 = rowB[j] & 0x3;
+         const GLint rowBr1 = rowB[k] & 0x3;
+         const GLint rowAg0 = (rowA[j] >> 2) & 0x7;
+         const GLint rowAg1 = (rowA[k] >> 2) & 0x7;
+         const GLint rowBg0 = (rowB[j] >> 2) & 0x7;
+         const GLint rowBg1 = (rowB[k] >> 2) & 0x7;
+         const GLint rowAb0 = (rowA[j] >> 5) & 0x7;
+         const GLint rowAb1 = (rowA[k] >> 5) & 0x7;
+         const GLint rowBb0 = (rowB[j] >> 5) & 0x7;
+         const GLint rowBb1 = (rowB[k] >> 5) & 0x7;
+         const GLint red = (rowAr0 + rowAr1 + rowBr0 + rowBr1) >> 2;
+         const GLint green = (rowAg0 + rowAg1 + rowBg0 + rowBg1) >> 2;
+         const GLint blue = (rowAb0 + rowAb1 + rowBb0 + rowBb1) >> 2;
+         dst[i] = (blue << 5) | (green << 2) | red;
+      }
    }
    else if (datatype == GL_UNSIGNED_BYTE && comps == 1) {
-         GLuint i, j, k;
-         const GLubyte *rowA = (const GLubyte *) srcRowA;
-         const GLubyte *rowB = (const GLubyte *) srcRowB;
-         GLubyte *dst = (GLubyte *) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) >> 2;
-         }
+      GLuint i, j, k;
+      const GLubyte *rowA = (const GLubyte *) srcRowA;
+      const GLubyte *rowB = (const GLubyte *) srcRowB;
+      GLubyte *dst = (GLubyte *) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) >> 2;
+      }
    }
    else if (datatype == GL_FLOAT && comps == 4) {
-         GLuint i, j, k;
-         const GLfloat (*rowA)[4] = (const GLfloat (*)[4]) srcRowA;
-         const GLfloat (*rowB)[4] = (const GLfloat (*)[4]) srcRowB;
-         GLfloat (*dst)[4] = (GLfloat (*)[4]) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            dst[i][0] = (rowA[j][0] + rowA[k][0] +
-                         rowB[j][0] + rowB[k][0]) * 0.25F;
-            dst[i][1] = (rowA[j][1] + rowA[k][1] +
-                         rowB[j][1] + rowB[k][1]) * 0.25F;
-            dst[i][2] = (rowA[j][2] + rowA[k][2] +
-                         rowB[j][2] + rowB[k][2]) * 0.25F;
-            dst[i][3] = (rowA[j][3] + rowA[k][3] +
-                         rowB[j][3] + rowB[k][3]) * 0.25F;
-         }
+      GLuint i, j, k;
+      const GLfloat(*rowA)[4] = (const GLfloat(*)[4]) srcRowA;
+      const GLfloat(*rowB)[4] = (const GLfloat(*)[4]) srcRowB;
+      GLfloat(*dst)[4] = (GLfloat(*)[4]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i][0] = (rowA[j][0] + rowA[k][0] +
+                      rowB[j][0] + rowB[k][0]) * 0.25F;
+         dst[i][1] = (rowA[j][1] + rowA[k][1] +
+                      rowB[j][1] + rowB[k][1]) * 0.25F;
+         dst[i][2] = (rowA[j][2] + rowA[k][2] +
+                      rowB[j][2] + rowB[k][2]) * 0.25F;
+         dst[i][3] = (rowA[j][3] + rowA[k][3] +
+                      rowB[j][3] + rowB[k][3]) * 0.25F;
+      }
    }
    else if (datatype == GL_HALF_FLOAT_ARB && comps == 4) {
-         GLuint i, j, k, comp;
-         const GLhalfARB (*rowA)[4] = (const GLhalfARB (*)[4]) srcRowA;
-         const GLhalfARB (*rowB)[4] = (const GLhalfARB (*)[4]) srcRowB;
-         GLhalfARB (*dst)[4] = (GLhalfARB (*)[4]) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            for (comp = 0; comp < 4; comp++) {
-               GLfloat aj, ak, bj, bk;
-               aj = _mesa_half_to_float(rowA[j][comp]);
-               ak = _mesa_half_to_float(rowA[k][comp]);
-               bj = _mesa_half_to_float(rowB[j][comp]);
-               bk = _mesa_half_to_float(rowB[k][comp]);
-               dst[i][comp] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
-            }
+      GLuint i, j, k, comp;
+      const GLhalfARB(*rowA)[4] = (const GLhalfARB(*)[4]) srcRowA;
+      const GLhalfARB(*rowB)[4] = (const GLhalfARB(*)[4]) srcRowB;
+      GLhalfARB(*dst)[4] = (GLhalfARB(*)[4]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         for (comp = 0; comp < 4; comp++) {
+            GLfloat aj, ak, bj, bk;
+            aj = _mesa_half_to_float(rowA[j][comp]);
+            ak = _mesa_half_to_float(rowA[k][comp]);
+            bj = _mesa_half_to_float(rowB[j][comp]);
+            bk = _mesa_half_to_float(rowB[k][comp]);
+            dst[i][comp] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
          }
+      }
    }
    else if (datatype == GL_FLOAT && comps == 3) {
-         GLuint i, j, k;
-         const GLfloat (*rowA)[3] = (const GLfloat (*)[3]) srcRowA;
-         const GLfloat (*rowB)[3] = (const GLfloat (*)[3]) srcRowB;
-         GLfloat (*dst)[3] = (GLfloat (*)[3]) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            dst[i][0] = (rowA[j][0] + rowA[k][0] +
-                         rowB[j][0] + rowB[k][0]) * 0.25F;
-            dst[i][1] = (rowA[j][1] + rowA[k][1] +
-                         rowB[j][1] + rowB[k][1]) * 0.25F;
-            dst[i][2] = (rowA[j][2] + rowA[k][2] +
-                         rowB[j][2] + rowB[k][2]) * 0.25F;
-         }
+      GLuint i, j, k;
+      const GLfloat(*rowA)[3] = (const GLfloat(*)[3]) srcRowA;
+      const GLfloat(*rowB)[3] = (const GLfloat(*)[3]) srcRowB;
+      GLfloat(*dst)[3] = (GLfloat(*)[3]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i][0] = (rowA[j][0] + rowA[k][0] +
+                      rowB[j][0] + rowB[k][0]) * 0.25F;
+         dst[i][1] = (rowA[j][1] + rowA[k][1] +
+                      rowB[j][1] + rowB[k][1]) * 0.25F;
+         dst[i][2] = (rowA[j][2] + rowA[k][2] +
+                      rowB[j][2] + rowB[k][2]) * 0.25F;
+      }
    }
    else if (datatype == GL_HALF_FLOAT_ARB && comps == 3) {
-         GLuint i, j, k, comp;
-         const GLhalfARB (*rowA)[3] = (const GLhalfARB (*)[3]) srcRowA;
-         const GLhalfARB (*rowB)[3] = (const GLhalfARB (*)[3]) srcRowB;
-         GLhalfARB (*dst)[3] = (GLhalfARB (*)[3]) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            for (comp = 0; comp < 3; comp++) {
-               GLfloat aj, ak, bj, bk;
-               aj = _mesa_half_to_float(rowA[j][comp]);
-               ak = _mesa_half_to_float(rowA[k][comp]);
-               bj = _mesa_half_to_float(rowB[j][comp]);
-               bk = _mesa_half_to_float(rowB[k][comp]);
-               dst[i][comp] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
-            }
+      GLuint i, j, k, comp;
+      const GLhalfARB(*rowA)[3] = (const GLhalfARB(*)[3]) srcRowA;
+      const GLhalfARB(*rowB)[3] = (const GLhalfARB(*)[3]) srcRowB;
+      GLhalfARB(*dst)[3] = (GLhalfARB(*)[3]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         for (comp = 0; comp < 3; comp++) {
+            GLfloat aj, ak, bj, bk;
+            aj = _mesa_half_to_float(rowA[j][comp]);
+            ak = _mesa_half_to_float(rowA[k][comp]);
+            bj = _mesa_half_to_float(rowB[j][comp]);
+            bk = _mesa_half_to_float(rowB[k][comp]);
+            dst[i][comp] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
          }
+      }
    }
    else if (datatype == GL_FLOAT && comps == 2) {
-         GLuint i, j, k;
-         const GLfloat (*rowA)[2] = (const GLfloat (*)[2]) srcRowA;
-         const GLfloat (*rowB)[2] = (const GLfloat (*)[2]) srcRowB;
-         GLfloat (*dst)[2] = (GLfloat (*)[2]) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            dst[i][0] = (rowA[j][0] + rowA[k][0] +
-                         rowB[j][0] + rowB[k][0]) * 0.25F;
-            dst[i][1] = (rowA[j][1] + rowA[k][1] +
-                         rowB[j][1] + rowB[k][1]) * 0.25F;
-         }
+      GLuint i, j, k;
+      const GLfloat(*rowA)[2] = (const GLfloat(*)[2]) srcRowA;
+      const GLfloat(*rowB)[2] = (const GLfloat(*)[2]) srcRowB;
+      GLfloat(*dst)[2] = (GLfloat(*)[2]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i][0] = (rowA[j][0] + rowA[k][0] +
+                      rowB[j][0] + rowB[k][0]) * 0.25F;
+         dst[i][1] = (rowA[j][1] + rowA[k][1] +
+                      rowB[j][1] + rowB[k][1]) * 0.25F;
+      }
    }
    else if (datatype == GL_HALF_FLOAT_ARB && comps == 2) {
-         GLuint i, j, k, comp;
-         const GLhalfARB (*rowA)[2] = (const GLhalfARB (*)[2]) srcRowA;
-         const GLhalfARB (*rowB)[2] = (const GLhalfARB (*)[2]) srcRowB;
-         GLhalfARB (*dst)[2] = (GLhalfARB (*)[2]) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            for (comp = 0; comp < 2; comp++) {
-               GLfloat aj, ak, bj, bk;
-               aj = _mesa_half_to_float(rowA[j][comp]);
-               ak = _mesa_half_to_float(rowA[k][comp]);
-               bj = _mesa_half_to_float(rowB[j][comp]);
-               bk = _mesa_half_to_float(rowB[k][comp]);
-               dst[i][comp] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
-            }
+      GLuint i, j, k, comp;
+      const GLhalfARB(*rowA)[2] = (const GLhalfARB(*)[2]) srcRowA;
+      const GLhalfARB(*rowB)[2] = (const GLhalfARB(*)[2]) srcRowB;
+      GLhalfARB(*dst)[2] = (GLhalfARB(*)[2]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         for (comp = 0; comp < 2; comp++) {
+            GLfloat aj, ak, bj, bk;
+            aj = _mesa_half_to_float(rowA[j][comp]);
+            ak = _mesa_half_to_float(rowA[k][comp]);
+            bj = _mesa_half_to_float(rowB[j][comp]);
+            bk = _mesa_half_to_float(rowB[k][comp]);
+            dst[i][comp] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
          }
+      }
    }
    else if (datatype == GL_FLOAT && comps == 1) {
-         GLuint i, j, k;
-         const GLfloat *rowA = (const GLfloat *) srcRowA;
-         const GLfloat *rowB = (const GLfloat *) srcRowB;
-         GLfloat *dst = (GLfloat *) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) * 0.25F;
-         }
+      GLuint i, j, k;
+      const GLfloat *rowA = (const GLfloat *) srcRowA;
+      const GLfloat *rowB = (const GLfloat *) srcRowB;
+      GLfloat *dst = (GLfloat *) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) * 0.25F;
+      }
    }
    else if (datatype == GL_HALF_FLOAT_ARB && comps == 1) {
-         GLuint i, j, k;
-         const GLhalfARB *rowA = (const GLhalfARB *) srcRowA;
-         const GLhalfARB *rowB = (const GLhalfARB *) srcRowB;
-         GLhalfARB *dst = (GLhalfARB *) dstRow;
-         for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-              i++, j += colStride, k += colStride) {
-            GLfloat aj, ak, bj, bk;
-            aj = _mesa_half_to_float(rowA[j]);
-            ak = _mesa_half_to_float(rowA[k]);
-            bj = _mesa_half_to_float(rowB[j]);
-            bk = _mesa_half_to_float(rowB[k]);
-            dst[i] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
-         }
+      GLuint i, j, k;
+      const GLhalfARB *rowA = (const GLhalfARB *) srcRowA;
+      const GLhalfARB *rowB = (const GLhalfARB *) srcRowB;
+      GLhalfARB *dst = (GLhalfARB *) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         GLfloat aj, ak, bj, bk;
+         aj = _mesa_half_to_float(rowA[j]);
+         ak = _mesa_half_to_float(rowA[k]);
+         bj = _mesa_half_to_float(rowB[j]);
+         bk = _mesa_half_to_float(rowB[k]);
+         dst[i] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
+      }
    }
    else {
       _mesa_problem(NULL, "bad format in do_row()");
-- 
cgit v1.2.3


From 708a022959104303be554b7c0144dd75fe8d7b81 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 16:42:50 -0700
Subject: reorder cases in do_row()

---
 src/mesa/main/mipmap.c | 341 +++++++++++++++++++++++++------------------------
 1 file changed, 173 insertions(+), 168 deletions(-)

diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 22c7530e83..db8ab65401 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -234,7 +234,54 @@ do_row(GLenum datatype, GLuint comps, GLint srcWidth,
    assert(srcWidth == dstWidth || srcWidth == 2 * dstWidth);
    */
 
-   if (datatype == GL_UNSIGNED_SHORT && comps == 4) {
+   if (datatype == GL_UNSIGNED_BYTE && comps == 4) {
+      GLuint i, j, k;
+      const GLubyte(*rowA)[4] = (const GLubyte(*)[4]) srcRowA;
+      const GLubyte(*rowB)[4] = (const GLubyte(*)[4]) srcRowB;
+      GLubyte(*dst)[4] = (GLubyte(*)[4]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) / 4;
+         dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) / 4;
+         dst[i][2] = (rowA[j][2] + rowA[k][2] + rowB[j][2] + rowB[k][2]) / 4;
+         dst[i][3] = (rowA[j][3] + rowA[k][3] + rowB[j][3] + rowB[k][3]) / 4;
+      }
+   }
+   else if (datatype == GL_UNSIGNED_BYTE && comps == 3) {
+      GLuint i, j, k;
+      const GLubyte(*rowA)[3] = (const GLubyte(*)[3]) srcRowA;
+      const GLubyte(*rowB)[3] = (const GLubyte(*)[3]) srcRowB;
+      GLubyte(*dst)[3] = (GLubyte(*)[3]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) / 4;
+         dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) / 4;
+         dst[i][2] = (rowA[j][2] + rowA[k][2] + rowB[j][2] + rowB[k][2]) / 4;
+      }
+   }
+   else if (datatype == GL_UNSIGNED_BYTE && comps == 2) {
+      GLuint i, j, k;
+      const GLubyte(*rowA)[2] = (const GLubyte(*)[2]) srcRowA;
+      const GLubyte(*rowB)[2] = (const GLubyte(*)[2]) srcRowB;
+      GLubyte(*dst)[2] = (GLubyte(*)[2]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) >> 2;
+         dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) >> 2;
+      }
+   }
+   else if (datatype == GL_UNSIGNED_BYTE && comps == 1) {
+      GLuint i, j, k;
+      const GLubyte *rowA = (const GLubyte *) srcRowA;
+      const GLubyte *rowB = (const GLubyte *) srcRowB;
+      GLubyte *dst = (GLubyte *) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) >> 2;
+      }
+   }
+
+   else if (datatype == GL_UNSIGNED_SHORT && comps == 4) {
       GLuint i, j, k;
       const GLushort(*rowA)[4] = (const GLushort(*)[4]) srcRowA;
       const GLushort(*rowB)[4] = (const GLushort(*)[4]) srcRowB;
@@ -259,6 +306,17 @@ do_row(GLenum datatype, GLuint comps, GLint srcWidth,
          dst[i][2] = (rowA[j][2] + rowA[k][2] + rowB[j][2] + rowB[k][2]) / 4;
       }
    }
+   else if (datatype == GL_UNSIGNED_SHORT && comps == 2) {
+      GLuint i, j, k;
+      const GLushort(*rowA)[2] = (const GLushort(*)[2]) srcRowA;
+      const GLushort(*rowB)[2] = (const GLushort(*)[2]) srcRowB;
+      GLushort(*dst)[2] = (GLushort(*)[2]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) / 4;
+         dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) / 4;
+      }
+   }
    else if (datatype == GL_UNSIGNED_SHORT && comps == 1) {
       GLuint i, j, k;
       const GLushort *rowA = (const GLushort *) srcRowA;
@@ -269,52 +327,141 @@ do_row(GLenum datatype, GLuint comps, GLint srcWidth,
          dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) / 4;
       }
    }
-   else if (datatype == GL_UNSIGNED_SHORT && comps == 2) {
+
+   else if (datatype == GL_FLOAT && comps == 4) {
       GLuint i, j, k;
-      const GLushort(*rowA)[2] = (const GLushort(*)[2]) srcRowA;
-      const GLushort(*rowB)[2] = (const GLushort(*)[2]) srcRowB;
-      GLushort(*dst)[2] = (GLushort(*)[2]) dstRow;
+      const GLfloat(*rowA)[4] = (const GLfloat(*)[4]) srcRowA;
+      const GLfloat(*rowB)[4] = (const GLfloat(*)[4]) srcRowB;
+      GLfloat(*dst)[4] = (GLfloat(*)[4]) dstRow;
       for (i = j = 0, k = k0; i < (GLuint) dstWidth;
            i++, j += colStride, k += colStride) {
-         dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) / 4;
-         dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) / 4;
+         dst[i][0] = (rowA[j][0] + rowA[k][0] +
+                      rowB[j][0] + rowB[k][0]) * 0.25F;
+         dst[i][1] = (rowA[j][1] + rowA[k][1] +
+                      rowB[j][1] + rowB[k][1]) * 0.25F;
+         dst[i][2] = (rowA[j][2] + rowA[k][2] +
+                      rowB[j][2] + rowB[k][2]) * 0.25F;
+         dst[i][3] = (rowA[j][3] + rowA[k][3] +
+                      rowB[j][3] + rowB[k][3]) * 0.25F;
       }
    }
-   else if (datatype == GL_UNSIGNED_INT && comps == 1) {
+   else if (datatype == GL_FLOAT && comps == 3) {
       GLuint i, j, k;
-      const GLuint *rowA = (const GLuint *) srcRowA;
-      const GLuint *rowB = (const GLuint *) srcRowB;
+      const GLfloat(*rowA)[3] = (const GLfloat(*)[3]) srcRowA;
+      const GLfloat(*rowB)[3] = (const GLfloat(*)[3]) srcRowB;
+      GLfloat(*dst)[3] = (GLfloat(*)[3]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i][0] = (rowA[j][0] + rowA[k][0] +
+                      rowB[j][0] + rowB[k][0]) * 0.25F;
+         dst[i][1] = (rowA[j][1] + rowA[k][1] +
+                      rowB[j][1] + rowB[k][1]) * 0.25F;
+         dst[i][2] = (rowA[j][2] + rowA[k][2] +
+                      rowB[j][2] + rowB[k][2]) * 0.25F;
+      }
+   }
+   else if (datatype == GL_FLOAT && comps == 2) {
+      GLuint i, j, k;
+      const GLfloat(*rowA)[2] = (const GLfloat(*)[2]) srcRowA;
+      const GLfloat(*rowB)[2] = (const GLfloat(*)[2]) srcRowB;
+      GLfloat(*dst)[2] = (GLfloat(*)[2]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         dst[i][0] = (rowA[j][0] + rowA[k][0] +
+                      rowB[j][0] + rowB[k][0]) * 0.25F;
+         dst[i][1] = (rowA[j][1] + rowA[k][1] +
+                      rowB[j][1] + rowB[k][1]) * 0.25F;
+      }
+   }
+   else if (datatype == GL_FLOAT && comps == 1) {
+      GLuint i, j, k;
+      const GLfloat *rowA = (const GLfloat *) srcRowA;
+      const GLfloat *rowB = (const GLfloat *) srcRowB;
       GLfloat *dst = (GLfloat *) dstRow;
       for (i = j = 0, k = k0; i < (GLuint) dstWidth;
            i++, j += colStride, k += colStride) {
-         dst[i] = rowA[j] / 4 + rowA[k] / 4 + rowB[j] / 4 + rowB[k] / 4;
+         dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) * 0.25F;
+      }
+   }
+
+   else if (datatype == GL_HALF_FLOAT_ARB && comps == 4) {
+      GLuint i, j, k, comp;
+      const GLhalfARB(*rowA)[4] = (const GLhalfARB(*)[4]) srcRowA;
+      const GLhalfARB(*rowB)[4] = (const GLhalfARB(*)[4]) srcRowB;
+      GLhalfARB(*dst)[4] = (GLhalfARB(*)[4]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         for (comp = 0; comp < 4; comp++) {
+            GLfloat aj, ak, bj, bk;
+            aj = _mesa_half_to_float(rowA[j][comp]);
+            ak = _mesa_half_to_float(rowA[k][comp]);
+            bj = _mesa_half_to_float(rowB[j][comp]);
+            bk = _mesa_half_to_float(rowB[k][comp]);
+            dst[i][comp] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
+         }
+      }
+   }
+   else if (datatype == GL_HALF_FLOAT_ARB && comps == 3) {
+      GLuint i, j, k, comp;
+      const GLhalfARB(*rowA)[3] = (const GLhalfARB(*)[3]) srcRowA;
+      const GLhalfARB(*rowB)[3] = (const GLhalfARB(*)[3]) srcRowB;
+      GLhalfARB(*dst)[3] = (GLhalfARB(*)[3]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         for (comp = 0; comp < 3; comp++) {
+            GLfloat aj, ak, bj, bk;
+            aj = _mesa_half_to_float(rowA[j][comp]);
+            ak = _mesa_half_to_float(rowA[k][comp]);
+            bj = _mesa_half_to_float(rowB[j][comp]);
+            bk = _mesa_half_to_float(rowB[k][comp]);
+            dst[i][comp] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
+         }
+      }
+   }
+   else if (datatype == GL_HALF_FLOAT_ARB && comps == 2) {
+      GLuint i, j, k, comp;
+      const GLhalfARB(*rowA)[2] = (const GLhalfARB(*)[2]) srcRowA;
+      const GLhalfARB(*rowB)[2] = (const GLhalfARB(*)[2]) srcRowB;
+      GLhalfARB(*dst)[2] = (GLhalfARB(*)[2]) dstRow;
+      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
+           i++, j += colStride, k += colStride) {
+         for (comp = 0; comp < 2; comp++) {
+            GLfloat aj, ak, bj, bk;
+            aj = _mesa_half_to_float(rowA[j][comp]);
+            ak = _mesa_half_to_float(rowA[k][comp]);
+            bj = _mesa_half_to_float(rowB[j][comp]);
+            bk = _mesa_half_to_float(rowB[k][comp]);
+            dst[i][comp] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
+         }
       }
    }
-   else if (datatype == GL_UNSIGNED_BYTE && comps == 4) {
+   else if (datatype == GL_HALF_FLOAT_ARB && comps == 1) {
       GLuint i, j, k;
-      const GLubyte(*rowA)[4] = (const GLubyte(*)[4]) srcRowA;
-      const GLubyte(*rowB)[4] = (const GLubyte(*)[4]) srcRowB;
-      GLubyte(*dst)[4] = (GLubyte(*)[4]) dstRow;
+      const GLhalfARB *rowA = (const GLhalfARB *) srcRowA;
+      const GLhalfARB *rowB = (const GLhalfARB *) srcRowB;
+      GLhalfARB *dst = (GLhalfARB *) dstRow;
       for (i = j = 0, k = k0; i < (GLuint) dstWidth;
            i++, j += colStride, k += colStride) {
-         dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) / 4;
-         dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) / 4;
-         dst[i][2] = (rowA[j][2] + rowA[k][2] + rowB[j][2] + rowB[k][2]) / 4;
-         dst[i][3] = (rowA[j][3] + rowA[k][3] + rowB[j][3] + rowB[k][3]) / 4;
+         GLfloat aj, ak, bj, bk;
+         aj = _mesa_half_to_float(rowA[j]);
+         ak = _mesa_half_to_float(rowA[k]);
+         bj = _mesa_half_to_float(rowB[j]);
+         bk = _mesa_half_to_float(rowB[k]);
+         dst[i] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
       }
    }
-   else if (datatype == GL_UNSIGNED_BYTE && comps == 3) {
+
+   else if (datatype == GL_UNSIGNED_INT && comps == 1) {
       GLuint i, j, k;
-      const GLubyte(*rowA)[3] = (const GLubyte(*)[3]) srcRowA;
-      const GLubyte(*rowB)[3] = (const GLubyte(*)[3]) srcRowB;
-      GLubyte(*dst)[3] = (GLubyte(*)[3]) dstRow;
+      const GLuint *rowA = (const GLuint *) srcRowA;
+      const GLuint *rowB = (const GLuint *) srcRowB;
+      GLfloat *dst = (GLfloat *) dstRow;
       for (i = j = 0, k = k0; i < (GLuint) dstWidth;
            i++, j += colStride, k += colStride) {
-         dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) / 4;
-         dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) / 4;
-         dst[i][2] = (rowA[j][2] + rowA[k][2] + rowB[j][2] + rowB[k][2]) / 4;
+         dst[i] = rowA[j] / 4 + rowA[k] / 4 + rowB[j] / 4 + rowB[k] / 4;
       }
    }
+
    else if (datatype == GL_UNSIGNED_SHORT_5_6_5 && comps == 3) {
       GLuint i, j, k;
       const GLushort *rowA = (const GLushort *) srcRowA;
@@ -400,17 +547,6 @@ do_row(GLenum datatype, GLuint comps, GLint srcWidth,
          dst[i] = (alpha << 15) | (blue << 10) | (green << 5) | red;
       }
    }
-   else if (datatype == GL_UNSIGNED_BYTE && comps == 2) {
-      GLuint i, j, k;
-      const GLubyte(*rowA)[2] = (const GLubyte(*)[2]) srcRowA;
-      const GLubyte(*rowB)[2] = (const GLubyte(*)[2]) srcRowB;
-      GLubyte(*dst)[2] = (GLubyte(*)[2]) dstRow;
-      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-           i++, j += colStride, k += colStride) {
-         dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) >> 2;
-         dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) >> 2;
-      }
-   }
    else if (datatype == GL_UNSIGNED_BYTE_3_3_2 && comps == 3) {
       GLuint i, j, k;
       const GLubyte *rowA = (const GLubyte *) srcRowA;
@@ -436,137 +572,6 @@ do_row(GLenum datatype, GLuint comps, GLint srcWidth,
          dst[i] = (blue << 5) | (green << 2) | red;
       }
    }
-   else if (datatype == GL_UNSIGNED_BYTE && comps == 1) {
-      GLuint i, j, k;
-      const GLubyte *rowA = (const GLubyte *) srcRowA;
-      const GLubyte *rowB = (const GLubyte *) srcRowB;
-      GLubyte *dst = (GLubyte *) dstRow;
-      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-           i++, j += colStride, k += colStride) {
-         dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) >> 2;
-      }
-   }
-   else if (datatype == GL_FLOAT && comps == 4) {
-      GLuint i, j, k;
-      const GLfloat(*rowA)[4] = (const GLfloat(*)[4]) srcRowA;
-      const GLfloat(*rowB)[4] = (const GLfloat(*)[4]) srcRowB;
-      GLfloat(*dst)[4] = (GLfloat(*)[4]) dstRow;
-      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-           i++, j += colStride, k += colStride) {
-         dst[i][0] = (rowA[j][0] + rowA[k][0] +
-                      rowB[j][0] + rowB[k][0]) * 0.25F;
-         dst[i][1] = (rowA[j][1] + rowA[k][1] +
-                      rowB[j][1] + rowB[k][1]) * 0.25F;
-         dst[i][2] = (rowA[j][2] + rowA[k][2] +
-                      rowB[j][2] + rowB[k][2]) * 0.25F;
-         dst[i][3] = (rowA[j][3] + rowA[k][3] +
-                      rowB[j][3] + rowB[k][3]) * 0.25F;
-      }
-   }
-   else if (datatype == GL_HALF_FLOAT_ARB && comps == 4) {
-      GLuint i, j, k, comp;
-      const GLhalfARB(*rowA)[4] = (const GLhalfARB(*)[4]) srcRowA;
-      const GLhalfARB(*rowB)[4] = (const GLhalfARB(*)[4]) srcRowB;
-      GLhalfARB(*dst)[4] = (GLhalfARB(*)[4]) dstRow;
-      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-           i++, j += colStride, k += colStride) {
-         for (comp = 0; comp < 4; comp++) {
-            GLfloat aj, ak, bj, bk;
-            aj = _mesa_half_to_float(rowA[j][comp]);
-            ak = _mesa_half_to_float(rowA[k][comp]);
-            bj = _mesa_half_to_float(rowB[j][comp]);
-            bk = _mesa_half_to_float(rowB[k][comp]);
-            dst[i][comp] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
-         }
-      }
-   }
-   else if (datatype == GL_FLOAT && comps == 3) {
-      GLuint i, j, k;
-      const GLfloat(*rowA)[3] = (const GLfloat(*)[3]) srcRowA;
-      const GLfloat(*rowB)[3] = (const GLfloat(*)[3]) srcRowB;
-      GLfloat(*dst)[3] = (GLfloat(*)[3]) dstRow;
-      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-           i++, j += colStride, k += colStride) {
-         dst[i][0] = (rowA[j][0] + rowA[k][0] +
-                      rowB[j][0] + rowB[k][0]) * 0.25F;
-         dst[i][1] = (rowA[j][1] + rowA[k][1] +
-                      rowB[j][1] + rowB[k][1]) * 0.25F;
-         dst[i][2] = (rowA[j][2] + rowA[k][2] +
-                      rowB[j][2] + rowB[k][2]) * 0.25F;
-      }
-   }
-   else if (datatype == GL_HALF_FLOAT_ARB && comps == 3) {
-      GLuint i, j, k, comp;
-      const GLhalfARB(*rowA)[3] = (const GLhalfARB(*)[3]) srcRowA;
-      const GLhalfARB(*rowB)[3] = (const GLhalfARB(*)[3]) srcRowB;
-      GLhalfARB(*dst)[3] = (GLhalfARB(*)[3]) dstRow;
-      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-           i++, j += colStride, k += colStride) {
-         for (comp = 0; comp < 3; comp++) {
-            GLfloat aj, ak, bj, bk;
-            aj = _mesa_half_to_float(rowA[j][comp]);
-            ak = _mesa_half_to_float(rowA[k][comp]);
-            bj = _mesa_half_to_float(rowB[j][comp]);
-            bk = _mesa_half_to_float(rowB[k][comp]);
-            dst[i][comp] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
-         }
-      }
-   }
-   else if (datatype == GL_FLOAT && comps == 2) {
-      GLuint i, j, k;
-      const GLfloat(*rowA)[2] = (const GLfloat(*)[2]) srcRowA;
-      const GLfloat(*rowB)[2] = (const GLfloat(*)[2]) srcRowB;
-      GLfloat(*dst)[2] = (GLfloat(*)[2]) dstRow;
-      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-           i++, j += colStride, k += colStride) {
-         dst[i][0] = (rowA[j][0] + rowA[k][0] +
-                      rowB[j][0] + rowB[k][0]) * 0.25F;
-         dst[i][1] = (rowA[j][1] + rowA[k][1] +
-                      rowB[j][1] + rowB[k][1]) * 0.25F;
-      }
-   }
-   else if (datatype == GL_HALF_FLOAT_ARB && comps == 2) {
-      GLuint i, j, k, comp;
-      const GLhalfARB(*rowA)[2] = (const GLhalfARB(*)[2]) srcRowA;
-      const GLhalfARB(*rowB)[2] = (const GLhalfARB(*)[2]) srcRowB;
-      GLhalfARB(*dst)[2] = (GLhalfARB(*)[2]) dstRow;
-      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-           i++, j += colStride, k += colStride) {
-         for (comp = 0; comp < 2; comp++) {
-            GLfloat aj, ak, bj, bk;
-            aj = _mesa_half_to_float(rowA[j][comp]);
-            ak = _mesa_half_to_float(rowA[k][comp]);
-            bj = _mesa_half_to_float(rowB[j][comp]);
-            bk = _mesa_half_to_float(rowB[k][comp]);
-            dst[i][comp] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
-         }
-      }
-   }
-   else if (datatype == GL_FLOAT && comps == 1) {
-      GLuint i, j, k;
-      const GLfloat *rowA = (const GLfloat *) srcRowA;
-      const GLfloat *rowB = (const GLfloat *) srcRowB;
-      GLfloat *dst = (GLfloat *) dstRow;
-      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-           i++, j += colStride, k += colStride) {
-         dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) * 0.25F;
-      }
-   }
-   else if (datatype == GL_HALF_FLOAT_ARB && comps == 1) {
-      GLuint i, j, k;
-      const GLhalfARB *rowA = (const GLhalfARB *) srcRowA;
-      const GLhalfARB *rowB = (const GLhalfARB *) srcRowB;
-      GLhalfARB *dst = (GLhalfARB *) dstRow;
-      for (i = j = 0, k = k0; i < (GLuint) dstWidth;
-           i++, j += colStride, k += colStride) {
-         GLfloat aj, ak, bj, bk;
-         aj = _mesa_half_to_float(rowA[j]);
-         ak = _mesa_half_to_float(rowA[k]);
-         bj = _mesa_half_to_float(rowB[j]);
-         bk = _mesa_half_to_float(rowB[k]);
-         dst[i] = _mesa_float_to_half((aj + ak + bj + bk) * 0.25F);
-      }
-   }
    else {
       _mesa_problem(NULL, "bad format in do_row()");
    }
-- 
cgit v1.2.3


From a217d0c7d0d52871dbf4196f83fecefa87ef8a24 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 16:46:12 -0700
Subject: move _mesa_format_to_type_and_comps() to texformat.c

---
 src/mesa/main/mipmap.c    | 168 +--------------------------------------------
 src/mesa/main/texformat.c | 171 ++++++++++++++++++++++++++++++++++++++++++++++
 src/mesa/main/texformat.h |   6 ++
 3 files changed, 178 insertions(+), 167 deletions(-)

diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index db8ab65401..981da5dd89 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -45,172 +45,6 @@ bytes_per_pixel(GLenum datatype, GLuint comps)
 }
 
 
-static void
-mesa_format_to_type_and_comps(const struct gl_texture_format *format,
-                              GLenum *datatype, GLuint *comps)
-{
-   switch (format->MesaFormat) {
-   case MESA_FORMAT_RGBA8888:
-   case MESA_FORMAT_RGBA8888_REV:
-   case MESA_FORMAT_ARGB8888:
-   case MESA_FORMAT_ARGB8888_REV:
-      *datatype = CHAN_TYPE;
-      *comps = 4;
-      return;
-   case MESA_FORMAT_RGB888:
-   case MESA_FORMAT_BGR888:
-      *datatype = GL_UNSIGNED_BYTE;
-      *comps = 3;
-      return;
-   case MESA_FORMAT_RGB565:
-   case MESA_FORMAT_RGB565_REV:
-      *datatype = GL_UNSIGNED_SHORT_5_6_5;
-      *comps = 3;
-      return;
-
-   case MESA_FORMAT_ARGB4444:
-   case MESA_FORMAT_ARGB4444_REV:
-      *datatype = GL_UNSIGNED_SHORT_4_4_4_4;
-      *comps = 4;
-      return;
-
-   case MESA_FORMAT_ARGB1555:
-   case MESA_FORMAT_ARGB1555_REV:
-      *datatype = GL_UNSIGNED_SHORT_1_5_5_5_REV;
-      *comps = 3;
-      return;
-
-   case MESA_FORMAT_AL88:
-   case MESA_FORMAT_AL88_REV:
-      *datatype = GL_UNSIGNED_BYTE;
-      *comps = 2;
-      return;
-   case MESA_FORMAT_RGB332:
-      *datatype = GL_UNSIGNED_BYTE_3_3_2;
-      *comps = 3;
-      return;
-
-   case MESA_FORMAT_A8:
-   case MESA_FORMAT_L8:
-   case MESA_FORMAT_I8:
-   case MESA_FORMAT_CI8:
-      *datatype = GL_UNSIGNED_BYTE;
-      *comps = 1;
-      return;
-
-   case MESA_FORMAT_YCBCR:
-   case MESA_FORMAT_YCBCR_REV:
-      *datatype = GL_UNSIGNED_SHORT;
-      *comps = 2;
-      return;
-
-   case MESA_FORMAT_Z24_S8:
-      *datatype = GL_UNSIGNED_INT;
-      *comps = 1; /* XXX OK? */
-      return;
-
-   case MESA_FORMAT_Z16:
-      *datatype = GL_UNSIGNED_SHORT;
-      *comps = 1;
-      return;
-
-   case MESA_FORMAT_Z32:
-      *datatype = GL_UNSIGNED_INT;
-      *comps = 1;
-      return;
-
-   case MESA_FORMAT_SRGB8:
-      *datatype = GL_UNSIGNED_BYTE;
-      *comps = 3;
-      return;
-   case MESA_FORMAT_SRGBA8:
-      *datatype = GL_UNSIGNED_BYTE;
-      *comps = 4;
-      return;
-   case MESA_FORMAT_SL8:
-      *datatype = GL_UNSIGNED_BYTE;
-      *comps = 1;
-      return;
-   case MESA_FORMAT_SLA8:
-      *datatype = GL_UNSIGNED_BYTE;
-      *comps = 2;
-      return;
-
-   case MESA_FORMAT_RGB_FXT1:
-   case MESA_FORMAT_RGBA_FXT1:
-   case MESA_FORMAT_RGB_DXT1:
-   case MESA_FORMAT_RGBA_DXT1:
-   case MESA_FORMAT_RGBA_DXT3:
-   case MESA_FORMAT_RGBA_DXT5:
-      /* XXX generate error instead? */
-      *datatype = GL_UNSIGNED_BYTE;
-      *comps = 0;
-      return;
-
-   case MESA_FORMAT_RGBA:
-      *datatype = CHAN_TYPE;
-      *comps = 4;
-      return;
-   case MESA_FORMAT_RGB:
-      *datatype = CHAN_TYPE;
-      *comps = 3;
-      return;
-   case MESA_FORMAT_LUMINANCE_ALPHA:
-      *datatype = CHAN_TYPE;
-      *comps = 2;
-      return;
-   case MESA_FORMAT_ALPHA:
-   case MESA_FORMAT_LUMINANCE:
-   case MESA_FORMAT_INTENSITY:
-      *datatype = CHAN_TYPE;
-      *comps = 1;
-      return;
-
-   case MESA_FORMAT_RGBA_FLOAT32:
-      *datatype = GL_FLOAT;
-      *comps = 4;
-      return;
-   case MESA_FORMAT_RGBA_FLOAT16:
-      *datatype = GL_HALF_FLOAT_ARB;
-      *comps = 4;
-      return;
-   case MESA_FORMAT_RGB_FLOAT32:
-      *datatype = GL_FLOAT;
-      *comps = 3;
-      return;
-   case MESA_FORMAT_RGB_FLOAT16:
-      *datatype = GL_HALF_FLOAT_ARB;
-      *comps = 3;
-      return;
-   case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT32:
-      *datatype = GL_FLOAT;
-      *comps = 2;
-      return;
-   case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT16:
-      *datatype = GL_HALF_FLOAT_ARB;
-      *comps = 2;
-      return;
-   case MESA_FORMAT_ALPHA_FLOAT32:
-   case MESA_FORMAT_LUMINANCE_FLOAT32:
-   case MESA_FORMAT_INTENSITY_FLOAT32:
-      *datatype = GL_FLOAT;
-      *comps = 1;
-      return;
-   case MESA_FORMAT_ALPHA_FLOAT16:
-   case MESA_FORMAT_LUMINANCE_FLOAT16:
-   case MESA_FORMAT_INTENSITY_FLOAT16:
-      *datatype = GL_HALF_FLOAT_ARB;
-      *comps = 1;
-      return;
-
-   default:
-      _mesa_problem(NULL, "bad texture format in mesa_format_to_type_and_comps");
-      *datatype = 0;
-      *comps = 1;
-   }
-}
-
-
 /**
  * Average together two rows of a source image to produce a single new
  * row in the dest image.  It's legal for the two source rows to point
@@ -1085,7 +919,7 @@ _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
       convertFormat = srcImage->TexFormat;
    }
 
-   mesa_format_to_type_and_comps(convertFormat, &datatype, &comps);
+   _mesa_format_to_type_and_comps(convertFormat, &datatype, &comps);
 
    for (level = texObj->BaseLevel; level < texObj->MaxLevel
            && level < maxLevels - 1; level++) {
diff --git a/src/mesa/main/texformat.c b/src/mesa/main/texformat.c
index acc268e622..88fbd8f07c 100644
--- a/src/mesa/main/texformat.c
+++ b/src/mesa/main/texformat.c
@@ -1569,3 +1569,174 @@ _mesa_choose_tex_format( GLcontext *ctx, GLint internalFormat,
    _mesa_problem(ctx, "unexpected format in _mesa_choose_tex_format()");
    return NULL;
 }
+
+
+
+/**
+ * Return datatype and number of components per texel for the
+ * given gl_texture_format.
+ */
+void
+_mesa_format_to_type_and_comps(const struct gl_texture_format *format,
+                               GLenum *datatype, GLuint *comps)
+{
+   switch (format->MesaFormat) {
+   case MESA_FORMAT_RGBA8888:
+   case MESA_FORMAT_RGBA8888_REV:
+   case MESA_FORMAT_ARGB8888:
+   case MESA_FORMAT_ARGB8888_REV:
+      *datatype = CHAN_TYPE;
+      *comps = 4;
+      return;
+   case MESA_FORMAT_RGB888:
+   case MESA_FORMAT_BGR888:
+      *datatype = GL_UNSIGNED_BYTE;
+      *comps = 3;
+      return;
+   case MESA_FORMAT_RGB565:
+   case MESA_FORMAT_RGB565_REV:
+      *datatype = GL_UNSIGNED_SHORT_5_6_5;
+      *comps = 3;
+      return;
+
+   case MESA_FORMAT_ARGB4444:
+   case MESA_FORMAT_ARGB4444_REV:
+      *datatype = GL_UNSIGNED_SHORT_4_4_4_4;
+      *comps = 4;
+      return;
+
+   case MESA_FORMAT_ARGB1555:
+   case MESA_FORMAT_ARGB1555_REV:
+      *datatype = GL_UNSIGNED_SHORT_1_5_5_5_REV;
+      *comps = 3;
+      return;
+
+   case MESA_FORMAT_AL88:
+   case MESA_FORMAT_AL88_REV:
+      *datatype = GL_UNSIGNED_BYTE;
+      *comps = 2;
+      return;
+   case MESA_FORMAT_RGB332:
+      *datatype = GL_UNSIGNED_BYTE_3_3_2;
+      *comps = 3;
+      return;
+
+   case MESA_FORMAT_A8:
+   case MESA_FORMAT_L8:
+   case MESA_FORMAT_I8:
+   case MESA_FORMAT_CI8:
+      *datatype = GL_UNSIGNED_BYTE;
+      *comps = 1;
+      return;
+
+   case MESA_FORMAT_YCBCR:
+   case MESA_FORMAT_YCBCR_REV:
+      *datatype = GL_UNSIGNED_SHORT;
+      *comps = 2;
+      return;
+
+   case MESA_FORMAT_Z24_S8:
+      *datatype = GL_UNSIGNED_INT;
+      *comps = 1; /* XXX OK? */
+      return;
+
+   case MESA_FORMAT_Z16:
+      *datatype = GL_UNSIGNED_SHORT;
+      *comps = 1;
+      return;
+
+   case MESA_FORMAT_Z32:
+      *datatype = GL_UNSIGNED_INT;
+      *comps = 1;
+      return;
+
+   case MESA_FORMAT_SRGB8:
+      *datatype = GL_UNSIGNED_BYTE;
+      *comps = 3;
+      return;
+   case MESA_FORMAT_SRGBA8:
+      *datatype = GL_UNSIGNED_BYTE;
+      *comps = 4;
+      return;
+   case MESA_FORMAT_SL8:
+      *datatype = GL_UNSIGNED_BYTE;
+      *comps = 1;
+      return;
+   case MESA_FORMAT_SLA8:
+      *datatype = GL_UNSIGNED_BYTE;
+      *comps = 2;
+      return;
+
+   case MESA_FORMAT_RGB_FXT1:
+   case MESA_FORMAT_RGBA_FXT1:
+   case MESA_FORMAT_RGB_DXT1:
+   case MESA_FORMAT_RGBA_DXT1:
+   case MESA_FORMAT_RGBA_DXT3:
+   case MESA_FORMAT_RGBA_DXT5:
+      /* XXX generate error instead? */
+      *datatype = GL_UNSIGNED_BYTE;
+      *comps = 0;
+      return;
+
+   case MESA_FORMAT_RGBA:
+      *datatype = CHAN_TYPE;
+      *comps = 4;
+      return;
+   case MESA_FORMAT_RGB:
+      *datatype = CHAN_TYPE;
+      *comps = 3;
+      return;
+   case MESA_FORMAT_LUMINANCE_ALPHA:
+      *datatype = CHAN_TYPE;
+      *comps = 2;
+      return;
+   case MESA_FORMAT_ALPHA:
+   case MESA_FORMAT_LUMINANCE:
+   case MESA_FORMAT_INTENSITY:
+      *datatype = CHAN_TYPE;
+      *comps = 1;
+      return;
+
+   case MESA_FORMAT_RGBA_FLOAT32:
+      *datatype = GL_FLOAT;
+      *comps = 4;
+      return;
+   case MESA_FORMAT_RGBA_FLOAT16:
+      *datatype = GL_HALF_FLOAT_ARB;
+      *comps = 4;
+      return;
+   case MESA_FORMAT_RGB_FLOAT32:
+      *datatype = GL_FLOAT;
+      *comps = 3;
+      return;
+   case MESA_FORMAT_RGB_FLOAT16:
+      *datatype = GL_HALF_FLOAT_ARB;
+      *comps = 3;
+      return;
+   case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT32:
+      *datatype = GL_FLOAT;
+      *comps = 2;
+      return;
+   case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT16:
+      *datatype = GL_HALF_FLOAT_ARB;
+      *comps = 2;
+      return;
+   case MESA_FORMAT_ALPHA_FLOAT32:
+   case MESA_FORMAT_LUMINANCE_FLOAT32:
+   case MESA_FORMAT_INTENSITY_FLOAT32:
+      *datatype = GL_FLOAT;
+      *comps = 1;
+      return;
+   case MESA_FORMAT_ALPHA_FLOAT16:
+   case MESA_FORMAT_LUMINANCE_FLOAT16:
+   case MESA_FORMAT_INTENSITY_FLOAT16:
+      *datatype = GL_HALF_FLOAT_ARB;
+      *comps = 1;
+      return;
+
+   default:
+      _mesa_problem(NULL, "bad format in _mesa_format_to_type_and_comps");
+      *datatype = 0;
+      *comps = 1;
+   }
+}
diff --git a/src/mesa/main/texformat.h b/src/mesa/main/texformat.h
index 55851db701..48f0fe99f2 100644
--- a/src/mesa/main/texformat.h
+++ b/src/mesa/main/texformat.h
@@ -239,4 +239,10 @@ extern const struct gl_texture_format *
 _mesa_choose_tex_format( GLcontext *ctx, GLint internalFormat,
                          GLenum format, GLenum type );
 
+
+extern void
+_mesa_format_to_type_and_comps(const struct gl_texture_format *format,
+                               GLenum *datatype, GLuint *comps);
+
+
 #endif
-- 
cgit v1.2.3


From b81ef1c429bbd34536f30c1522f1915996476078 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 16:56:21 -0700
Subject: refactor code, export _mesa_generate_mipmap_level()

---
 src/mesa/main/mipmap.c | 98 ++++++++++++++++++++++++++++----------------------
 src/mesa/main/mipmap.h | 11 ++++++
 2 files changed, 67 insertions(+), 42 deletions(-)

diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 981da5dd89..ed7795aef9 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -841,6 +841,59 @@ make_2d_stack_mipmap(GLenum datatype, GLuint comps, GLint border,
 }
 
 
+/**
+ * Down-sample a texture image to produce the next lower mipmap level.
+ */
+void
+_mesa_generate_mipmap_level(GLenum target,
+                            GLenum datatype, GLuint comps,
+                            GLint border,
+                            GLint srcWidth, GLint srcHeight, GLint srcDepth,
+                            const GLubyte *srcData,
+                            GLint dstWidth, GLint dstHeight, GLint dstDepth,
+                            GLubyte *dstData)
+{
+   switch (target) {
+   case GL_TEXTURE_1D:
+      make_1d_mipmap(datatype, comps, border,
+                     srcWidth, srcData,
+                     dstWidth, dstData);
+      break;
+   case GL_TEXTURE_2D:
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_X_ARB:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X_ARB:
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y_ARB:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y_ARB:
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z_ARB:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB:
+      make_2d_mipmap(datatype, comps, border,
+                     srcWidth, srcHeight, srcData,
+                     dstWidth, dstHeight, dstData);
+      break;
+   case GL_TEXTURE_3D:
+      make_3d_mipmap(datatype, comps, border,
+                     srcWidth, srcHeight, srcDepth, srcData,
+                     dstWidth, dstHeight, dstDepth, dstData);
+      break;
+   case GL_TEXTURE_1D_ARRAY_EXT:
+      make_1d_stack_mipmap(datatype, comps, border,
+                           srcWidth, srcData,
+                           dstWidth, dstHeight, dstData);
+      break;
+   case GL_TEXTURE_2D_ARRAY_EXT:
+      make_2d_stack_mipmap(datatype, comps, border,
+                           srcWidth, srcHeight, srcData,
+                           dstWidth, dstHeight, dstDepth, dstData);
+      break;
+   case GL_TEXTURE_RECTANGLE_NV:
+      /* no mipmaps, do nothing */
+      break;
+   default:
+      _mesa_problem(NULL, "bad target in _mesa_generate_mipmap_level");
+   }
+}
+
+
 /**
  * For GL_SGIX_generate_mipmap:
  * Generate a complete set of mipmaps from texObj's base-level image.
@@ -1032,48 +1085,9 @@ _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
          dstData = (GLubyte *) dstImage->Data;
       }
 
-      /*
-       * We use simple 2x2 averaging to compute the next mipmap level.
-       */
-      switch (target) {
-         case GL_TEXTURE_1D:
-            make_1d_mipmap(datatype, comps, border,
-                           srcWidth, srcData,
-                           dstWidth, dstData);
-            break;
-         case GL_TEXTURE_2D:
-         case GL_TEXTURE_CUBE_MAP_POSITIVE_X_ARB:
-         case GL_TEXTURE_CUBE_MAP_NEGATIVE_X_ARB:
-         case GL_TEXTURE_CUBE_MAP_POSITIVE_Y_ARB:
-         case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y_ARB:
-         case GL_TEXTURE_CUBE_MAP_POSITIVE_Z_ARB:
-         case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB:
-            make_2d_mipmap(datatype, comps, border,
-                           srcWidth, srcHeight, srcData,
-                           dstWidth, dstHeight, dstData);
-            break;
-         case GL_TEXTURE_3D:
-            make_3d_mipmap(datatype, comps, border,
-                           srcWidth, srcHeight, srcDepth, srcData,
-                           dstWidth, dstHeight, dstDepth, dstData);
-            break;
-         case GL_TEXTURE_1D_ARRAY_EXT:
-            make_1d_stack_mipmap(datatype, comps, border,
-                                 srcWidth, srcData,
-                                 dstWidth, dstHeight, dstData);
-            break;
-         case GL_TEXTURE_2D_ARRAY_EXT:
-            make_2d_stack_mipmap(datatype, comps, border,
-                                 srcWidth, srcHeight, srcData,
-                                 dstWidth, dstHeight, dstDepth, dstData);
-            break;
-         case GL_TEXTURE_RECTANGLE_NV:
-            /* no mipmaps, do nothing */
-            break;
-         default:
-            _mesa_problem(ctx, "bad dimensions in _mesa_generate_mipmaps");
-            return;
-      }
+      _mesa_generate_mipmap_level(target, datatype, comps, border,
+                                  srcWidth, srcHeight, srcDepth, srcData,
+                                  dstWidth, dstHeight, dstDepth, dstData);
 
       if (dstImage->IsCompressed) {
          GLubyte *temp;
diff --git a/src/mesa/main/mipmap.h b/src/mesa/main/mipmap.h
index 46e16902c8..b6491f5507 100644
--- a/src/mesa/main/mipmap.h
+++ b/src/mesa/main/mipmap.h
@@ -28,6 +28,17 @@
 
 #include "mtypes.h"
 
+
+extern void
+_mesa_generate_mipmap_level(GLenum target,
+                            GLenum datatype, GLuint comps,
+                            GLint border,
+                            GLint srcWidth, GLint srcHeight, GLint srcDepth,
+                            const GLubyte *srcData,
+                            GLint dstWidth, GLint dstHeight, GLint dstDepth,
+                            GLubyte *dstData);
+
+
 extern void
 _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
                       struct gl_texture_object *texObj);
-- 
cgit v1.2.3


From 1c7d7da30c9c99f663b9c29636e2854e0bf6af4e Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 11 Feb 2008 08:18:43 -0700
Subject: gallium: fix fragment/vertex typo

---
 src/mesa/pipe/draw/draw_vertex_shader.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index 574ac67057..9c31df1e3e 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -304,7 +304,7 @@ draw_bind_vertex_shader(struct draw_context *draw,
    draw->vertex_shader = dvs;
    draw->num_vs_outputs = dvs->state->num_outputs;
 
-   /* specify the fragment program to interpret/execute */
+   /* specify the vertex program to interpret/execute */
    tgsi_exec_machine_init(&draw->machine,
                           draw->vertex_shader->state->tokens,
                           PIPE_MAX_SAMPLERS,
-- 
cgit v1.2.3


From 23e6d1aebc4c667a24243c89ffa836bc1fc74252 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 11 Feb 2008 08:56:18 -0700
Subject: gallium: update comment about buffer map flags

---
 src/mesa/pipe/p_winsys.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/pipe/p_winsys.h b/src/mesa/pipe/p_winsys.h
index 95e3684008..1e81eebd78 100644
--- a/src/mesa/pipe/p_winsys.h
+++ b/src/mesa/pipe/p_winsys.h
@@ -112,7 +112,7 @@ struct pipe_winsys
 
    /** 
     * Map the entire data store of a buffer object into the client's address.
-    * flags is bitmask of PIPE_BUFFER_FLAG_READ/WRITE. 
+    * flags is bitmask of PIPE_BUFFER_USAGE_CPU_READ/WRITE flags. 
     */
    void *(*buffer_map)( struct pipe_winsys *sws, 
 			struct pipe_buffer *buf,
-- 
cgit v1.2.3


From 939aa5d3bdc4728e6b848c0bbf150f8e644d5e1b Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 11 Feb 2008 09:33:28 -0700
Subject: added _mesa_tex_target_to_face()

---
 src/mesa/main/teximage.c | 20 +++++++++++++-------
 src/mesa/main/teximage.h |  5 +++++
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 09ec0d4553..5c96be9216 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -595,8 +595,12 @@ is_compressed_format(GLcontext *ctx, GLenum internalFormat)
 }
 
 
-static GLuint
-texture_face(GLenum target)
+/**
+ * For cube map faces, return a face index in [0,5].
+ * For other targets return 0;
+ */
+GLuint
+_mesa_tex_target_to_face(GLenum target)
 {
    if (target >= GL_TEXTURE_CUBE_MAP_POSITIVE_X_ARB &&
        target <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB)
@@ -625,6 +629,7 @@ _mesa_set_tex_image(struct gl_texture_object *tObj,
 {
    ASSERT(tObj);
    ASSERT(texImage);
+   /* XXX simplify this with _mesa_tex_target_to_face() */
    switch (target) {
       case GL_TEXTURE_1D:
       case GL_TEXTURE_2D:
@@ -828,6 +833,7 @@ _mesa_select_tex_image(GLcontext *ctx, const struct gl_texture_object *texObj,
    if (level < 0 || level >= MAX_TEXTURE_LEVELS) 
       return NULL;
 
+   /* XXX simplify this with _mesa_tex_target_to_face() */
    switch (target) {
       case GL_TEXTURE_1D:
       case GL_PROXY_TEXTURE_1D:
@@ -2424,7 +2430,7 @@ _mesa_TexImage1D( GLenum target, GLint level, GLint internalFormat,
       struct gl_texture_unit *texUnit;
       struct gl_texture_object *texObj;
       struct gl_texture_image *texImage;
-      const GLuint face = texture_face(target);
+      const GLuint face = _mesa_tex_target_to_face(target);
 
       if (texture_error_check(ctx, target, level, internalFormat,
                               format, type, 1, postConvWidth, 1, 1, border)) {
@@ -2527,7 +2533,7 @@ _mesa_TexImage2D( GLenum target, GLint level, GLint internalFormat,
       struct gl_texture_unit *texUnit;
       struct gl_texture_object *texObj;
       struct gl_texture_image *texImage;
-      const GLuint face = texture_face(target);
+      const GLuint face = _mesa_tex_target_to_face(target);
 
       if (texture_error_check(ctx, target, level, internalFormat,
                               format, type, 2, postConvWidth, postConvHeight,
@@ -2629,7 +2635,7 @@ _mesa_TexImage3D( GLenum target, GLint level, GLint internalFormat,
       struct gl_texture_unit *texUnit;
       struct gl_texture_object *texObj;
       struct gl_texture_image *texImage;
-      const GLuint face = texture_face(target);
+      const GLuint face = _mesa_tex_target_to_face(target);
 
       if (texture_error_check(ctx, target, level, (GLint) internalFormat,
                               format, type, 3, width, height, depth, border)) {
@@ -2897,7 +2903,7 @@ _mesa_CopyTexImage1D( GLenum target, GLint level,
    struct gl_texture_object *texObj;
    struct gl_texture_image *texImage;
    GLsizei postConvWidth = width;
-   const GLuint face = texture_face(target);
+   const GLuint face = _mesa_tex_target_to_face(target);
    GET_CURRENT_CONTEXT(ctx);
    ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
 
@@ -2960,7 +2966,7 @@ _mesa_CopyTexImage2D( GLenum target, GLint level, GLenum internalFormat,
    struct gl_texture_object *texObj;
    struct gl_texture_image *texImage;
    GLsizei postConvWidth = width, postConvHeight = height;
-   const GLuint face = texture_face(target);
+   const GLuint face = _mesa_tex_target_to_face(target);
    GET_CURRENT_CONTEXT(ctx);
    ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
 
diff --git a/src/mesa/main/teximage.h b/src/mesa/main/teximage.h
index f2cad7eb2d..b718c0046d 100644
--- a/src/mesa/main/teximage.h
+++ b/src/mesa/main/teximage.h
@@ -107,6 +107,11 @@ _mesa_test_proxy_teximage(GLcontext *ctx, GLenum target, GLint level,
                          GLint width, GLint height, GLint depth, GLint border);
 
 
+extern GLuint
+_mesa_tex_target_to_face(GLenum target);
+
+
+
 /**
  * Lock a texture for updating.  See also _mesa_lock_context_textures().
  */
-- 
cgit v1.2.3


From ba499584d624687e91c6436f8ea539ae77173cd4 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 11 Feb 2008 09:35:28 -0700
Subject: gallium: use _mesa_tex_target_to_face()

---
 src/mesa/state_tracker/st_cb_texture.c | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 3350254654..992723afba 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -379,26 +379,6 @@ guess_and_alloc_texture(struct st_context *st,
 }
 
 
-
-
-static GLuint
-target_to_face(GLenum target)
-{
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X_ARB:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X_ARB:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y_ARB:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y_ARB:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z_ARB:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB:
-      return ((GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X);
-   default:
-      return 0;
-   }
-}
-
-
-
 /* There are actually quite a few combinations this will work for,
  * more than what I've listed here.
  */
@@ -512,7 +492,7 @@ st_TexImage(GLcontext * ctx,
    DBG("%s target %s level %d %dx%dx%d border %d\n", __FUNCTION__,
        _mesa_lookup_enum_by_nr(target), level, width, height, depth, border);
 
-   stImage->face = target_to_face(target);
+   stImage->face = _mesa_tex_target_to_face(target);
    stImage->level = level;
 
    if (ctx->_ImageTransferState & IMAGE_CONVOLUTION_BIT) {
-- 
cgit v1.2.3


From 59f0ce86b8696c6dadfaab3099ed2193b411e8d0 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 11 Feb 2008 09:38:36 -0700
Subject: gallium: strip borders from textures passed to st_TexImage.

Manipulate the unpack params to skip the border.  Gallium drivers won't support
texture borders.
---
 src/mesa/state_tracker/st_cb_texture.c | 53 ++++++++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 3 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 992723afba..7099ec33b9 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -467,6 +467,43 @@ try_pbo_upload(GLcontext *ctx,
 }
 
 
+/**
+ * Adjust pixel unpack params and image dimensions to strip off the
+ * texture border.
+ * Gallium doesn't support texture borders.  They've seldom been used
+ * and seldom been implemented correctly anyway.
+ * \param unpackNew  returns the new pixel unpack parameters
+ */
+static void
+strip_texture_border(GLint border,
+                     GLint *width, GLint *height, GLint *depth,
+                     const struct gl_pixelstore_attrib *unpack,
+                     struct gl_pixelstore_attrib *unpackNew)
+{
+   assert(border > 0);  /* sanity check */
+
+   *unpackNew = *unpack;
+
+   if (unpackNew->RowLength == 0)
+      unpackNew->RowLength = *width;
+
+   if (depth && unpackNew->ImageHeight == 0)
+      unpackNew->ImageHeight = *height;
+
+   unpackNew->SkipPixels += border;
+   if (height)
+      unpackNew->SkipRows += border;
+   if (depth)
+      unpackNew->SkipImages += border;
+
+   assert(*width >= 3);
+   *width = *width - 2 * border;
+   if (height && *height >= 3)
+      *height = *height - 2 * border;
+   if (depth && *depth >= 3)
+      *depth = *depth - 2 * border;
+}
+
 
 static void
 st_TexImage(GLcontext * ctx,
@@ -483,15 +520,25 @@ st_TexImage(GLcontext * ctx,
 {
    struct st_texture_object *stObj = st_texture_object(texObj);
    struct st_texture_image *stImage = st_texture_image(texImage);
-   GLint postConvWidth = width;
-   GLint postConvHeight = height;
+   GLint postConvWidth, postConvHeight;
    GLint texelBytes, sizeInBytes;
    GLuint dstRowStride;
-
+   struct gl_pixelstore_attrib unpackNB;
 
    DBG("%s target %s level %d %dx%dx%d border %d\n", __FUNCTION__,
        _mesa_lookup_enum_by_nr(target), level, width, height, depth, border);
 
+   /* gallium does not support texture borders, strip it off */
+   if (border) {
+      strip_texture_border(border, &width, &height, &depth,
+                           unpack, &unpackNB);
+      unpack = &unpackNB;
+      border = 0;
+   }
+
+   postConvWidth = width;
+   postConvHeight = height;
+
    stImage->face = _mesa_tex_target_to_face(target);
    stImage->level = level;
 
-- 
cgit v1.2.3


From 995924d566e3a5c06ee4728b846c18de39574966 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 11 Feb 2008 09:42:02 -0700
Subject: gallium: implement software fallback for mipmap generation

This is used when we can't render to the surface type of the texture (such
as luminance/alpha).
---
 src/mesa/state_tracker/st_gen_mipmap.c | 68 ++++++++++++++++++++++++++++++----
 1 file changed, 60 insertions(+), 8 deletions(-)

diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
index a6ac9a55fb..5c00392af7 100644
--- a/src/mesa/state_tracker/st_gen_mipmap.c
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -29,11 +29,14 @@
 #include "main/imports.h"
 #include "main/mipmap.h"
 #include "main/teximage.h"
+#include "main/texformat.h"
 
 #include "shader/prog_instruction.h"
 
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+#include "pipe/p_winsys.h"
 #include "pipe/cso_cache/cso_cache.h"
 
 #include "st_context.h"
@@ -239,15 +242,18 @@ draw_quad(GLcontext *ctx)
  */
 static boolean
 st_render_mipmap(struct st_context *st,
+                 GLenum target,
                  struct pipe_texture *pt,
                  uint baseLevel, uint lastLevel)
 {
    struct pipe_context *pipe = st->pipe;
    struct pipe_framebuffer_state fb;
-   const uint face = 0, zslice = 0;
+   const uint face = _mesa_tex_target_to_face(target), zslice = 0;
    const uint first_level_save = pt->first_level;
    uint dstLevel;
 
+   assert(target != GL_TEXTURE_3D); /* not done yet */
+
    /* check if we can render in the texture's format */
    if (!pipe->is_format_supported(pipe, pt->format, PIPE_SURFACE)) {
       return FALSE;
@@ -307,6 +313,56 @@ st_render_mipmap(struct st_context *st,
 }
 
 
+static void
+fallback_generate_mipmap(GLcontext *ctx, GLenum target,
+                         struct gl_texture_object *texObj)
+{
+   struct pipe_context *pipe = ctx->st->pipe;
+   struct pipe_winsys *ws = pipe->winsys;
+   struct pipe_texture *pt = st_get_texobj_texture(texObj);
+   const uint baseLevel = texObj->BaseLevel;
+   const uint lastLevel = pt->last_level;
+   const uint face = _mesa_tex_target_to_face(target), zslice = 0;
+   uint dstLevel;
+   GLenum datatype;
+   GLuint comps;
+
+   assert(target != GL_TEXTURE_3D); /* not done yet */
+
+   _mesa_format_to_type_and_comps(texObj->Image[face][baseLevel]->TexFormat,
+                                  &datatype, &comps);
+
+   for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) {
+      const uint srcLevel = dstLevel - 1;
+      struct pipe_surface *srcSurf, *dstSurf;
+      const ubyte *srcData;
+      ubyte *dstData;
+
+      srcSurf = pipe->get_tex_surface(pipe, pt, face, srcLevel, zslice);
+      dstSurf = pipe->get_tex_surface(pipe, pt, face, dstLevel, zslice);
+
+      srcData = (ubyte *) ws->buffer_map(ws, srcSurf->buffer,
+                                         PIPE_BUFFER_USAGE_CPU_READ)
+              + srcSurf->offset;
+      dstData = (ubyte *) ws->buffer_map(ws, dstSurf->buffer,
+                                         PIPE_BUFFER_USAGE_CPU_WRITE)
+              + dstSurf->offset;
+
+      _mesa_generate_mipmap_level(target, datatype, comps,
+                   0 /*border*/,
+                   pt->width[srcLevel], pt->height[srcLevel], pt->depth[srcLevel],
+                   srcData,
+                   pt->width[dstLevel], pt->height[dstLevel], pt->depth[dstLevel],
+                   dstData);
+
+      ws->buffer_unmap(ws, srcSurf->buffer);
+      ws->buffer_unmap(ws, dstSurf->buffer);
+
+      pipe_surface_reference(&srcSurf, NULL);
+      pipe_surface_reference(&dstSurf, NULL);
+   }
+}
+
 
 void
 st_generate_mipmap(GLcontext *ctx, GLenum target,
@@ -318,13 +374,11 @@ st_generate_mipmap(GLcontext *ctx, GLenum target,
    const uint lastLevel = pt->last_level;
    uint dstLevel;
 
-   if (!st_render_mipmap(st, pt, baseLevel, lastLevel)) {
-      abort();
-      /* XXX the following won't really work at this time */
-      _mesa_generate_mipmap(ctx, target, texObj);
-      return;
+   if (!st_render_mipmap(st, target, pt, baseLevel, lastLevel)) {
+      fallback_generate_mipmap(ctx, target, texObj);
    }
 
+   /* Fill in the Mesa gl_texture_image fields */
    for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) {
       const uint srcLevel = dstLevel - 1;
       const struct gl_texture_image *srcImage
@@ -336,7 +390,6 @@ st_generate_mipmap(GLcontext *ctx, GLenum target,
       uint dstDepth = pt->depth[dstLevel];
       uint border = srcImage->Border;
 
-
       dstImage = _mesa_get_tex_image(ctx, texObj, target, dstLevel);
       if (!dstImage) {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "generating mipmaps");
@@ -359,5 +412,4 @@ st_generate_mipmap(GLcontext *ctx, GLenum target,
       stImage = (struct st_texture_image *) dstImage;
       stImage->pt = pt;
    }
-
 }
-- 
cgit v1.2.3


From a705e157f2d14a7fcc81b292fcca9dab4f38c9b2 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 11 Feb 2008 09:43:30 -0700
Subject: code refactoring, new next_mipmap_level_size() function

---
 src/mesa/main/mipmap.c | 75 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 50 insertions(+), 25 deletions(-)

diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index ed7795aef9..721a227bb5 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -894,6 +894,51 @@ _mesa_generate_mipmap_level(GLenum target,
 }
 
 
+/**
+ * compute next (level+1) image size
+ * \return GL_FALSE if no smaller size can be generated (eg. src is 1x1x1 size)
+ */
+static GLboolean
+next_mipmap_level_size(GLenum target, GLint border,
+                       GLint srcWidth, GLint srcHeight, GLint srcDepth,
+                       GLint *dstWidth, GLint *dstHeight, GLint *dstDepth)
+{
+   if (srcWidth - 2 * border > 1) {
+      *dstWidth = (srcWidth - 2 * border) / 2 + 2 * border;
+   }
+   else {
+      *dstWidth = srcWidth; /* can't go smaller */
+   }
+
+   if ((srcHeight - 2 * border > 1) && 
+       (target != GL_TEXTURE_1D_ARRAY_EXT)) {
+      *dstHeight = (srcHeight - 2 * border) / 2 + 2 * border;
+   }
+   else {
+      *dstHeight = srcHeight; /* can't go smaller */
+   }
+
+   if ((srcDepth - 2 * border > 1) &&
+       (target != GL_TEXTURE_2D_ARRAY_EXT)) {
+      *dstDepth = (srcDepth - 2 * border) / 2 + 2 * border;
+   }
+   else {
+      *dstDepth = srcDepth; /* can't go smaller */
+   }
+
+   if (*dstWidth == srcWidth &&
+       *dstHeight == srcHeight &&
+       *dstDepth == srcDepth) {
+      return GL_FALSE;
+   }
+   else {
+      return GL_TRUE;
+   }
+}
+
+
+
+
 /**
  * For GL_SGIX_generate_mipmap:
  * Generate a complete set of mipmaps from texObj's base-level image.
@@ -982,6 +1027,7 @@ _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
       GLint srcWidth, srcHeight, srcDepth;
       GLint dstWidth, dstHeight, dstDepth;
       GLint border, bytesPerTexel;
+      GLboolean nextLevel;
 
       /* get src image parameters */
       srcImage = _mesa_select_tex_image(ctx, texObj, target, level);
@@ -991,31 +1037,10 @@ _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
       srcDepth = srcImage->Depth;
       border = srcImage->Border;
 
-      /* compute next (level+1) image size */
-      if (srcWidth - 2 * border > 1) {
-         dstWidth = (srcWidth - 2 * border) / 2 + 2 * border;
-      }
-      else {
-         dstWidth = srcWidth; /* can't go smaller */
-      }
-      if ((srcHeight - 2 * border > 1) && 
-          (texObj->Target != GL_TEXTURE_1D_ARRAY_EXT)) {
-         dstHeight = (srcHeight - 2 * border) / 2 + 2 * border;
-      }
-      else {
-         dstHeight = srcHeight; /* can't go smaller */
-      }
-      if ((srcDepth - 2 * border > 1) &&
-               (texObj->Target != GL_TEXTURE_2D_ARRAY_EXT)) {
-         dstDepth = (srcDepth - 2 * border) / 2 + 2 * border;
-      }
-      else {
-         dstDepth = srcDepth; /* can't go smaller */
-      }
-
-      if (dstWidth == srcWidth &&
-          dstHeight == srcHeight &&
-          dstDepth == srcDepth) {
+      nextLevel = next_mipmap_level_size(target, border,
+                                         srcWidth, srcHeight, srcDepth,
+                                         &dstWidth, &dstHeight, &dstDepth);
+      if (!nextLevel) {
          /* all done */
          if (srcImage->IsCompressed) {
             _mesa_free((void *) srcData);
-- 
cgit v1.2.3


From 48c4a1ed12d30932c5a9d09424213a830efe2ef9 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 11 Feb 2008 09:50:21 -0700
Subject: gallium: comments about mipmap gen

---
 src/mesa/state_tracker/st_gen_mipmap.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
index 5c00392af7..f6af37cfac 100644
--- a/src/mesa/state_tracker/st_gen_mipmap.c
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -273,6 +273,10 @@ st_render_mipmap(struct st_context *st,
    pipe->bind_fs_state(pipe, stfp->fs->data);
    pipe->bind_vs_state(pipe, stvp->cso->data);
 
+   /*
+    * XXX for small mipmap levels, it may be faster to use the software
+    * fallback path...
+    */
    for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) {
       const uint srcLevel = dstLevel - 1;
 
@@ -348,6 +352,7 @@ fallback_generate_mipmap(GLcontext *ctx, GLenum target,
                                          PIPE_BUFFER_USAGE_CPU_WRITE)
               + dstSurf->offset;
 
+      /* XXX need to take stride/pitch info into account... */
       _mesa_generate_mipmap_level(target, datatype, comps,
                    0 /*border*/,
                    pt->width[srcLevel], pt->height[srcLevel], pt->depth[srcLevel],
-- 
cgit v1.2.3


From 3812bba8391fbf6c6c32a778ce0e1081825d5c52 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 11 Feb 2008 10:59:40 -0700
Subject: gallium: take pitch/stride into account in mipmap generation

---
 src/mesa/main/mipmap.c                 | 85 +++++++++++++++++++++-------------
 src/mesa/main/mipmap.h                 |  2 +
 src/mesa/state_tracker/st_gen_mipmap.c |  3 +-
 3 files changed, 57 insertions(+), 33 deletions(-)

diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 721a227bb5..d3d1958951 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -447,23 +447,29 @@ make_1d_mipmap(GLenum datatype, GLuint comps, GLint border,
 
 
 /**
- * XXX need to use the tex image's row stride!
+ * Strides are in bytes.  If zero, it'll be computed as width * bpp.
  */
 static void
 make_2d_mipmap(GLenum datatype, GLuint comps, GLint border,
-               GLint srcWidth, GLint srcHeight, const GLubyte *srcPtr,
-               GLint dstWidth, GLint dstHeight, GLubyte *dstPtr)
+               GLint srcWidth, GLint srcHeight,
+               GLint srcRowStride, const GLubyte *srcPtr,
+               GLint dstWidth, GLint dstHeight,
+               GLint dstRowStride, GLubyte *dstPtr)
 {
    const GLint bpt = bytes_per_pixel(datatype, comps);
    const GLint srcWidthNB = srcWidth - 2 * border;  /* sizes w/out border */
    const GLint dstWidthNB = dstWidth - 2 * border;
    const GLint dstHeightNB = dstHeight - 2 * border;
-   const GLint srcRowStride = bpt * srcWidth;
-   const GLint dstRowStride = bpt * dstWidth;
    const GLubyte *srcA, *srcB;
    GLubyte *dst;
    GLint row;
 
+   if (!srcRowStride)
+      srcRowStride = bpt * srcWidth;
+
+   if (!dstRowStride)
+      dstRowStride = bpt * dstWidth;
+
    /* Compute src and dst pointers, skipping any border */
    srcA = srcPtr + border * ((srcWidth + 1) * bpt);
    if (srcHeight > 1) 
@@ -535,8 +541,10 @@ make_2d_mipmap(GLenum datatype, GLuint comps, GLint border,
 static void
 make_3d_mipmap(GLenum datatype, GLuint comps, GLint border,
                GLint srcWidth, GLint srcHeight, GLint srcDepth,
+               GLint srcRowStride,
                const GLubyte *srcPtr,
                GLint dstWidth, GLint dstHeight, GLint dstDepth,
+               GLint dstRowStride,
                GLubyte *dstPtr)
 {
    const GLint bpt = bytes_per_pixel(datatype, comps);
@@ -548,7 +556,6 @@ make_3d_mipmap(GLenum datatype, GLuint comps, GLint border,
    GLvoid *tmpRowA, *tmpRowB;
    GLint img, row;
    GLint bytesPerSrcImage, bytesPerDstImage;
-   GLint bytesPerSrcRow, bytesPerDstRow;
    GLint srcImageOffset, srcRowOffset;
 
    (void) srcDepthNB; /* silence warnings */
@@ -566,8 +573,10 @@ make_3d_mipmap(GLenum datatype, GLuint comps, GLint border,
    bytesPerSrcImage = srcWidth * srcHeight * bpt;
    bytesPerDstImage = dstWidth * dstHeight * bpt;
 
-   bytesPerSrcRow = srcWidth * bpt;
-   bytesPerDstRow = dstWidth * bpt;
+   if (!srcRowStride)
+      srcRowStride = srcWidth * bpt;
+   if (!dstRowStride)
+      dstRowStride = dstWidth * bpt;
 
    /* Offset between adjacent src images to be averaged together */
    srcImageOffset = (srcDepth == dstDepth) ? 0 : bytesPerSrcImage;
@@ -591,13 +600,13 @@ make_3d_mipmap(GLenum datatype, GLuint comps, GLint border,
    for (img = 0; img < dstDepthNB; img++) {
       /* first source image pointer, skipping border */
       const GLubyte *imgSrcA = srcPtr
-         + (bytesPerSrcImage + bytesPerSrcRow + border) * bpt * border
+         + (bytesPerSrcImage + srcRowStride + border) * bpt * border
          + img * (bytesPerSrcImage + srcImageOffset);
       /* second source image pointer, skipping border */
       const GLubyte *imgSrcB = imgSrcA + srcImageOffset;
       /* address of the dest image, skipping border */
       GLubyte *imgDst = dstPtr
-         + (bytesPerDstImage + bytesPerDstRow + border) * bpt * border
+         + (bytesPerDstImage + dstRowStride + border) * bpt * border
          + img * bytesPerDstImage;
 
       /* setup the four source row pointers and the dest row pointer */
@@ -618,11 +627,11 @@ make_3d_mipmap(GLenum datatype, GLuint comps, GLint border,
          do_row(datatype, comps, srcWidthNB, tmpRowA, tmpRowB,
                 dstWidthNB, dstImgRow);
          /* advance to next rows */
-         srcImgARowA += bytesPerSrcRow + srcRowOffset;
-         srcImgARowB += bytesPerSrcRow + srcRowOffset;
-         srcImgBRowA += bytesPerSrcRow + srcRowOffset;
-         srcImgBRowB += bytesPerSrcRow + srcRowOffset;
-         dstImgRow += bytesPerDstRow;
+         srcImgARowA += srcRowStride + srcRowOffset;
+         srcImgARowB += srcRowStride + srcRowOffset;
+         srcImgBRowA += srcRowStride + srcRowOffset;
+         srcImgBRowB += srcRowStride + srcRowOffset;
+         dstImgRow += dstRowStride;
       }
    }
 
@@ -632,12 +641,14 @@ make_3d_mipmap(GLenum datatype, GLuint comps, GLint border,
    /* Luckily we can leverage the make_2d_mipmap() function here! */
    if (border > 0) {
       /* do front border image */
-      make_2d_mipmap(datatype, comps, 1, srcWidth, srcHeight, srcPtr,
-                     dstWidth, dstHeight, dstPtr);
+      make_2d_mipmap(datatype, comps, 1, srcWidth, srcHeight, 0, srcPtr,
+                     dstWidth, dstHeight, 0, dstPtr);
       /* do back border image */
       make_2d_mipmap(datatype, comps, 1, srcWidth, srcHeight,
+                     0, 
                      srcPtr + bytesPerSrcImage * (srcDepth - 1),
                      dstWidth, dstHeight,
+                     0,
                      dstPtr + bytesPerDstImage * (dstDepth - 1));
       /* do four remaining border edges that span the image slices */
       if (srcDepth == dstDepth) {
@@ -653,9 +664,9 @@ make_3d_mipmap(GLenum datatype, GLuint comps, GLint border,
 
             /* do border along [img][row=dstHeight-1][col=0] */
             src = srcPtr + (img * 2 + 1) * bytesPerSrcImage
-                         + (srcHeight - 1) * bytesPerSrcRow;
+                         + (srcHeight - 1) * srcRowStride;
             dst = dstPtr + (img + 1) * bytesPerDstImage
-                         + (dstHeight - 1) * bytesPerDstRow;
+                         + (dstHeight - 1) * dstRowStride;
             MEMCPY(dst, src, bpt);
 
             /* do border along [img][row=0][col=dstWidth-1] */
@@ -687,9 +698,9 @@ make_3d_mipmap(GLenum datatype, GLuint comps, GLint border,
 
             /* do border along [img][row=dstHeight-1][col=0] */
             src = srcPtr + (img * 2 + 1) * bytesPerSrcImage
-                         + (srcHeight - 1) * bytesPerSrcRow;
+                         + (srcHeight - 1) * srcRowStride;
             dst = dstPtr + (img + 1) * bytesPerDstImage
-                         + (dstHeight - 1) * bytesPerDstRow;
+                         + (dstHeight - 1) * dstRowStride;
             do_row(datatype, comps, 1, src, src + srcImageOffset, 1, dst);
 
             /* do border along [img][row=0][col=dstWidth-1] */
@@ -755,8 +766,11 @@ make_1d_stack_mipmap(GLenum datatype, GLuint comps, GLint border,
  */
 static void
 make_2d_stack_mipmap(GLenum datatype, GLuint comps, GLint border,
-                     GLint srcWidth, GLint srcHeight, const GLubyte *srcPtr,
+                     GLint srcWidth, GLint srcHeight,
+                     GLint srcRowStride,
+                     const GLubyte *srcPtr,
                      GLint dstWidth, GLint dstHeight, GLint dstDepth,
+                     GLint dstRowStride,
                      GLubyte *dstPtr)
 {
    const GLint bpt = bytes_per_pixel(datatype, comps);
@@ -764,13 +778,17 @@ make_2d_stack_mipmap(GLenum datatype, GLuint comps, GLint border,
    const GLint dstWidthNB = dstWidth - 2 * border;
    const GLint dstHeightNB = dstHeight - 2 * border;
    const GLint dstDepthNB = dstDepth - 2 * border;
-   const GLint srcRowStride = bpt * srcWidth;
-   const GLint dstRowStride = bpt * dstWidth;
    const GLubyte *srcA, *srcB;
    GLubyte *dst;
    GLint layer;
    GLint row;
 
+   if (!srcRowStride)
+      srcRowStride = bpt * srcWidth;
+
+   if (!dstRowStride)
+      dstRowStride = bpt * dstWidth;
+
    /* Compute src and dst pointers, skipping any border */
    srcA = srcPtr + border * ((srcWidth + 1) * bpt);
    if (srcHeight > 1) 
@@ -849,8 +867,10 @@ _mesa_generate_mipmap_level(GLenum target,
                             GLenum datatype, GLuint comps,
                             GLint border,
                             GLint srcWidth, GLint srcHeight, GLint srcDepth,
+                            GLint srcRowStride,
                             const GLubyte *srcData,
                             GLint dstWidth, GLint dstHeight, GLint dstDepth,
+                            GLint dstRowStride,
                             GLubyte *dstData)
 {
    switch (target) {
@@ -867,13 +887,13 @@ _mesa_generate_mipmap_level(GLenum target,
    case GL_TEXTURE_CUBE_MAP_POSITIVE_Z_ARB:
    case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB:
       make_2d_mipmap(datatype, comps, border,
-                     srcWidth, srcHeight, srcData,
-                     dstWidth, dstHeight, dstData);
+                     srcWidth, srcHeight, srcRowStride, srcData,
+                     dstWidth, dstHeight, dstRowStride, dstData);
       break;
    case GL_TEXTURE_3D:
       make_3d_mipmap(datatype, comps, border,
-                     srcWidth, srcHeight, srcDepth, srcData,
-                     dstWidth, dstHeight, dstDepth, dstData);
+                     srcWidth, srcHeight, srcDepth, srcRowStride, srcData,
+                     dstWidth, dstHeight, dstDepth, dstRowStride, dstData);
       break;
    case GL_TEXTURE_1D_ARRAY_EXT:
       make_1d_stack_mipmap(datatype, comps, border,
@@ -882,8 +902,8 @@ _mesa_generate_mipmap_level(GLenum target,
       break;
    case GL_TEXTURE_2D_ARRAY_EXT:
       make_2d_stack_mipmap(datatype, comps, border,
-                           srcWidth, srcHeight, srcData,
-                           dstWidth, dstHeight, dstDepth, dstData);
+                           srcWidth, srcHeight, srcRowStride, srcData,
+                           dstWidth, dstHeight, dstDepth, dstRowStride, dstData);
       break;
    case GL_TEXTURE_RECTANGLE_NV:
       /* no mipmaps, do nothing */
@@ -1110,9 +1130,10 @@ _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
          dstData = (GLubyte *) dstImage->Data;
       }
 
+      /* Note, 0 indicates default row strides */
       _mesa_generate_mipmap_level(target, datatype, comps, border,
-                                  srcWidth, srcHeight, srcDepth, srcData,
-                                  dstWidth, dstHeight, dstDepth, dstData);
+                                  srcWidth, srcHeight, srcDepth, 0, srcData,
+                                  dstWidth, dstHeight, dstDepth, 0, dstData);
 
       if (dstImage->IsCompressed) {
          GLubyte *temp;
diff --git a/src/mesa/main/mipmap.h b/src/mesa/main/mipmap.h
index b6491f5507..44ecdddb27 100644
--- a/src/mesa/main/mipmap.h
+++ b/src/mesa/main/mipmap.h
@@ -34,8 +34,10 @@ _mesa_generate_mipmap_level(GLenum target,
                             GLenum datatype, GLuint comps,
                             GLint border,
                             GLint srcWidth, GLint srcHeight, GLint srcDepth,
+                            GLint srcRowStride,
                             const GLubyte *srcData,
                             GLint dstWidth, GLint dstHeight, GLint dstDepth,
+                            GLint dstRowStride,
                             GLubyte *dstData);
 
 
diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
index f6af37cfac..c152c59905 100644
--- a/src/mesa/state_tracker/st_gen_mipmap.c
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -352,12 +352,13 @@ fallback_generate_mipmap(GLcontext *ctx, GLenum target,
                                          PIPE_BUFFER_USAGE_CPU_WRITE)
               + dstSurf->offset;
 
-      /* XXX need to take stride/pitch info into account... */
       _mesa_generate_mipmap_level(target, datatype, comps,
                    0 /*border*/,
                    pt->width[srcLevel], pt->height[srcLevel], pt->depth[srcLevel],
+                   srcSurf->pitch * srcSurf->cpp, /* stride in bytes */
                    srcData,
                    pt->width[dstLevel], pt->height[dstLevel], pt->depth[dstLevel],
+                   dstSurf->pitch * dstSurf->cpp, /* stride in bytes */
                    dstData);
 
       ws->buffer_unmap(ws, srcSurf->buffer);
-- 
cgit v1.2.3


From 5d1af60edb1dbdf69fbf08b93fe0781f33f075dd Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Tue, 12 Feb 2008 15:13:37 +1100
Subject: nv40: fix inline u08/u16 indices

---
 src/mesa/pipe/nv40/nv40_vbo.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/pipe/nv40/nv40_vbo.c b/src/mesa/pipe/nv40/nv40_vbo.c
index 552058b3ae..4e9cdb4585 100644
--- a/src/mesa/pipe/nv40/nv40_vbo.c
+++ b/src/mesa/pipe/nv40/nv40_vbo.c
@@ -235,9 +235,9 @@ nv40_draw_elements_u08(struct nv40_context *nv40, void *ib,
 	}
 
 	while (count) {
-		push = MIN2(count, 2046);
+		push = MIN2(count, 2047 * 2);
 
-		BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push);
+		BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
 		for (i = 0; i < push; i+=2)
 			OUT_RING((elts[i+1] << 16) | elts[i]);
 
@@ -260,9 +260,9 @@ nv40_draw_elements_u16(struct nv40_context *nv40, void *ib,
 	}
 
 	while (count) {
-		push = MIN2(count, 2046);
+		push = MIN2(count, 2047 * 2);
 
-		BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push);
+		BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
 		for (i = 0; i < push; i+=2)
 			OUT_RING((elts[i+1] << 16) | elts[i]);
 
-- 
cgit v1.2.3


From 09e23e077b2bc3dc9ec0ecd97e1043ee7f32f2bb Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 12 Feb 2008 14:53:25 -0700
Subject: gallium: clean-up, simplification of mipmapped textures

Remove pipe_texture->first_level (always implicitly zero).  This means there are
never any unused mipmap levels at the top.
In the state tracker, we no longer re-layout mipmapped textures if the
MinLod/MaxLod texture parameters change.  It's up to the driver to obey the
pipe_sampler->min/max_lod clamps.
---
 src/mesa/pipe/i915simple/i915_state.c           |  7 ++++
 src/mesa/pipe/i915simple/i915_state_sampler.c   |  2 +-
 src/mesa/pipe/i915simple/i915_texture.c         | 30 +++++++--------
 src/mesa/pipe/i965simple/brw_tex_layout.c       |  8 ++--
 src/mesa/pipe/i965simple/brw_wm_surface_state.c |  2 +-
 src/mesa/pipe/p_state.h                         | 14 ++-----
 src/mesa/pipe/softpipe/sp_state_sampler.c       |  4 ++
 src/mesa/pipe/softpipe/sp_tex_sample.c          | 18 +++------
 src/mesa/pipe/softpipe/sp_texture.c             |  4 +-
 src/mesa/state_tracker/st_cb_texture.c          | 51 +++++++++----------------
 src/mesa/state_tracker/st_gen_mipmap.c          | 43 ++++++++++++---------
 src/mesa/state_tracker/st_texture.c             | 35 ++++++++++++-----
 src/mesa/state_tracker/st_texture.h             |  6 +--
 13 files changed, 115 insertions(+), 109 deletions(-)

diff --git a/src/mesa/pipe/i915simple/i915_state.c b/src/mesa/pipe/i915simple/i915_state.c
index 950ea52d60..abd5571b88 100644
--- a/src/mesa/pipe/i915simple/i915_state.c
+++ b/src/mesa/pipe/i915simple/i915_state.c
@@ -250,6 +250,13 @@ i915_create_sampler_state(struct pipe_context *pipe,
    if (sampler->normalized_coords)
       cso->state[1] |= SS3_NORMALIZED_COORDS;
 
+   if (0) /* XXX not tested yet */
+   {
+      int minlod = (int) (16.0 * sampler->min_lod);
+      minlod = CLAMP(minlod, 0, 16 * 11);
+      cso->state[1] |= (minlod << SS3_MIN_LOD_SHIFT);
+   }
+
    {
       ubyte r = float_to_ubyte(sampler->border_color[0]);
       ubyte g = float_to_ubyte(sampler->border_color[1]);
diff --git a/src/mesa/pipe/i915simple/i915_state_sampler.c b/src/mesa/pipe/i915simple/i915_state_sampler.c
index 0dbbc5241d..9c1a5bbbd6 100644
--- a/src/mesa/pipe/i915simple/i915_state_sampler.c
+++ b/src/mesa/pipe/i915simple/i915_state_sampler.c
@@ -185,7 +185,7 @@ i915_update_texture(struct i915_context *i915, uint unit,
    const struct pipe_texture *pt = &tex->base;
    uint format, pitch;
    const uint width = pt->width[0], height = pt->height[0], depth = pt->depth[0];
-   const uint num_levels = pt->last_level - pt->first_level;
+   const uint num_levels = pt->last_level;
 
    assert(tex);
    assert(width);
diff --git a/src/mesa/pipe/i915simple/i915_texture.c b/src/mesa/pipe/i915simple/i915_texture.c
index 6faeab134a..6d37ae3d74 100644
--- a/src/mesa/pipe/i915simple/i915_texture.c
+++ b/src/mesa/pipe/i915simple/i915_texture.c
@@ -118,11 +118,11 @@ i945_miptree_layout_2d( struct i915_texture *tex )
    tex->pitch = pt->width[0];
 
    /* May need to adjust pitch to accomodate the placement of
-    * the 2nd mipmap.  This occurs when the alignment
+    * the 2nd mipmap level.  This occurs when the alignment
     * constraints of mipmap placement push the right edge of the
-    * 2nd mipmap out past the width of its parent.
+    * 2nd mipmap level out past the width of its parent.
     */
-   if (pt->first_level != pt->last_level) {
+   if (pt->last_level > 0) {
       unsigned mip1_width = align_int(minify(pt->width[0]), align_w)
 			+ minify(minify(pt->width[0]));
 
@@ -136,7 +136,7 @@ i945_miptree_layout_2d( struct i915_texture *tex )
    tex->pitch = align_int(tex->pitch * pt->cpp, 4) / pt->cpp;
    tex->total_height = 0;
 
-   for ( level = pt->first_level ; level <= pt->last_level ; level++ ) {
+   for (level = 0; level <= pt->last_level; level++) {
       unsigned img_height;
 
       i915_miptree_set_level_info(tex, level, 1, x, y, width, height, 1);
@@ -152,9 +152,9 @@ i945_miptree_layout_2d( struct i915_texture *tex )
        */
       tex->total_height = MAX2(tex->total_height, y + img_height);
 
-      /* Layout_below: step right after second mipmap.
+      /* Layout_below: step right after second mipmap level.
        */
-      if (level == pt->first_level + 1) {
+      if (level == 1) {
 	 x += align_int(width, align_w);
       }
       else {
@@ -204,7 +204,7 @@ i915_miptree_layout(struct pipe_context *pipe, struct i915_texture * tex)
          tex->pitch = ((dim * pt->cpp * 2 + 3) & ~3) / pt->cpp;
          tex->total_height = dim * 4;
 
-         for (level = pt->first_level; level <= pt->last_level; level++) {
+         for (level = 0; level <= pt->last_level; level++) {
             i915_miptree_set_level_info(tex, level, 6,
                                          0, 0,
                                          /*OLD: tex->pitch, tex->total_height,*/
@@ -219,7 +219,7 @@ i915_miptree_layout(struct pipe_context *pipe, struct i915_texture * tex)
             unsigned y = initial_offsets[face][1] * dim;
             unsigned d = dim;
 
-            for (level = pt->first_level; level <= pt->last_level; level++) {
+            for (level = 0; level <= pt->last_level; level++) {
                i915_miptree_set_image_offset(tex, level, face, x, y);
                d >>= 1;
                x += step_offsets[face][0] * d;
@@ -240,7 +240,7 @@ i915_miptree_layout(struct pipe_context *pipe, struct i915_texture * tex)
 
          /* XXX: hardware expects/requires 9 levels at minimum.
           */
-         for (level = pt->first_level; level <= MAX2(8, pt->last_level);
+         for (level = 0; level <= MAX2(8, pt->last_level);
               level++) {
             i915_miptree_set_level_info(tex, level, depth, 0, tex->total_height,
                                          width, height, depth);
@@ -256,7 +256,7 @@ i915_miptree_layout(struct pipe_context *pipe, struct i915_texture * tex)
          /* Fixup depth image_offsets: 
           */
          depth = pt->depth[0];
-         for (level = pt->first_level; level <= pt->last_level; level++) {
+         for (level = 0; level <= pt->last_level; level++) {
             unsigned i;
             for (i = 0; i < depth; i++) 
                i915_miptree_set_image_offset(tex, level, i,
@@ -282,7 +282,7 @@ i915_miptree_layout(struct pipe_context *pipe, struct i915_texture * tex)
          tex->pitch = ((pt->width[0] * pt->cpp + 3) & ~3) / pt->cpp;
          tex->total_height = 0;
 
-         for (level = pt->first_level; level <= pt->last_level; level++) {
+         for (level = 0; level <= pt->last_level; level++) {
             i915_miptree_set_level_info(tex, level, 1,
                                          0, tex->total_height,
                                          width, height, 1);
@@ -337,7 +337,7 @@ i945_miptree_layout(struct pipe_context *pipe, struct i915_texture * tex)
 
          /* Set all the levels to effectively occupy the whole rectangular region. 
           */
-         for (level = pt->first_level; level <= pt->last_level; level++) {
+         for (level = 0; level <= pt->last_level; level++) {
             i915_miptree_set_level_info(tex, level, 6,
                                          0, 0,
                                          lvlWidth, lvlHeight, 1);
@@ -355,12 +355,12 @@ i945_miptree_layout(struct pipe_context *pipe, struct i915_texture * tex)
                y = tex->total_height - 4;
                x = (face - 4) * 8;
             }
-            else if (dim < 4 && (face > 0 || pt->first_level > 0)) {
+            else if (dim < 4 && (face > 0)) {
                y = tex->total_height - 4;
                x = face * 8;
             }
 
-            for (level = pt->first_level; level <= pt->last_level; level++) {
+            for (level = 0; level <= pt->last_level; level++) {
                i915_miptree_set_image_offset(tex, level, face, x, y);
 
                d >>= 1;
@@ -418,7 +418,7 @@ i945_miptree_layout(struct pipe_context *pipe, struct i915_texture * tex)
          pack_x_pitch = tex->pitch;
          pack_x_nr = 1;
 
-         for (level = pt->first_level; level <= pt->last_level; level++) {
+         for (level = 0; level <= pt->last_level; level++) {
             unsigned nr_images = pt->target == PIPE_TEXTURE_3D ? depth : 6;
             int x = 0;
             int y = 0;
diff --git a/src/mesa/pipe/i965simple/brw_tex_layout.c b/src/mesa/pipe/i965simple/brw_tex_layout.c
index 405fd1f794..90561f1307 100644
--- a/src/mesa/pipe/i965simple/brw_tex_layout.c
+++ b/src/mesa/pipe/i965simple/brw_tex_layout.c
@@ -146,7 +146,7 @@ static void i945_miptree_layout_2d(struct brw_texture *tex)
     * constraints of mipmap placement push the right edge of the
     * 2nd mipmap out past the width of its parent.
     */
-   if (pt->first_level != pt->last_level) {
+   if (pt->last_level > 0) {
       unsigned mip1_width;
 
       if (pt->compressed) {
@@ -168,7 +168,7 @@ static void i945_miptree_layout_2d(struct brw_texture *tex)
    tex->pitch = align(tex->pitch * pt->cpp, 4) / pt->cpp;
    tex->total_height = 0;
 
-   for ( level = pt->first_level ; level <= pt->last_level ; level++ ) {
+   for (level = 0; level <= pt->last_level; level++) {
       unsigned img_height;
 
       intel_miptree_set_level_info(tex, level, 1, x, y, width,
@@ -187,7 +187,7 @@ static void i945_miptree_layout_2d(struct brw_texture *tex)
 
       /* Layout_below: step right after second mipmap.
        */
-      if (level == pt->first_level + 1) {
+      if (level == 1) {
 	 x += align(width, align_w);
       }
       else {
@@ -234,7 +234,7 @@ static boolean brw_miptree_layout(struct pipe_context *pipe, struct brw_texture
       pack_x_pitch = tex->pitch;
       pack_x_nr = 1;
 
-      for ( level = pt->first_level ; level <= pt->last_level ; level++ ) {
+      for (level = 0; level <= pt->last_level; level++) {
 	 unsigned nr_images = pt->target == PIPE_TEXTURE_3D ? depth : 6;
 	 int x = 0;
 	 int y = 0;
diff --git a/src/mesa/pipe/i965simple/brw_wm_surface_state.c b/src/mesa/pipe/i965simple/brw_wm_surface_state.c
index cbb4f2efd3..d16d919bce 100644
--- a/src/mesa/pipe/i965simple/brw_wm_surface_state.c
+++ b/src/mesa/pipe/i965simple/brw_wm_surface_state.c
@@ -154,7 +154,7 @@ void brw_update_texture_surface( struct brw_context *brw,
    /* Updated in emit_reloc */
    surf.ss1.base_addr = brw_buffer_offset( brw, tObj->buffer );
 
-   surf.ss2.mip_count = tObj->base.last_level - tObj->base.first_level;
+   surf.ss2.mip_count = tObj->base.last_level;
    surf.ss2.width = tObj->base.width[0] - 1;
    surf.ss2.height = tObj->base.height[0] - 1;
 
diff --git a/src/mesa/pipe/p_state.h b/src/mesa/pipe/p_state.h
index 83ca43f678..4d3a6b2f41 100644
--- a/src/mesa/pipe/p_state.h
+++ b/src/mesa/pipe/p_state.h
@@ -234,14 +234,9 @@ struct pipe_sampler_state
    unsigned compare_mode:1;  /**< PIPE_TEX_COMPARE_x */
    unsigned compare_func:3;  /**< PIPE_FUNC_x */
    unsigned normalized_coords:1;  /**< Are coords normalized to [0,1]? */
-   float shadow_ambient; /**< shadow test fail color/intensity */
-   float min_lod;
-   float max_lod;
-   float lod_bias;
-#if 0 /* need these? */
-   int BaseLevel;     /**< min mipmap level, OpenGL 1.2 */
-   int MaxLevel;      /**< max mipmap level, OpenGL 1.2 */
-#endif
+   float shadow_ambient;          /**< shadow test fail color/intensity */
+   float lod_bias;                /**< LOD/lambda bias */
+   float min_lod, max_lod;        /**< LOD clamp range, after bias */
    float border_color[4];
    float max_anisotropy;
 };
@@ -277,8 +272,7 @@ struct pipe_texture
    enum pipe_texture_target target; /**< PIPE_TEXTURE_x */
    enum pipe_format format;         /**< PIPE_FORMAT_x */
 
-   unsigned first_level;
-   unsigned last_level;
+   unsigned last_level;    /**< Index of last mipmap level present/defined */
 
    unsigned width[PIPE_MAX_TEXTURE_LEVELS];
    unsigned height[PIPE_MAX_TEXTURE_LEVELS];
diff --git a/src/mesa/pipe/softpipe/sp_state_sampler.c b/src/mesa/pipe/softpipe/sp_state_sampler.c
index 291bbc40ad..6a5a643c89 100644
--- a/src/mesa/pipe/softpipe/sp_state_sampler.c
+++ b/src/mesa/pipe/softpipe/sp_state_sampler.c
@@ -34,6 +34,8 @@
 #include "sp_state.h"
 #include "sp_texture.h"
 #include "sp_tile_cache.h"
+#include "pipe/draw/draw_context.h"
+
 
 
 void *
@@ -73,6 +75,8 @@ softpipe_set_sampler_texture(struct pipe_context *pipe,
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
+   draw_flush(softpipe->draw);
+
    assert(unit < PIPE_MAX_SAMPLERS);
    softpipe->texture[unit] = softpipe_texture(texture);  /* ptr, not struct */
 
diff --git a/src/mesa/pipe/softpipe/sp_tex_sample.c b/src/mesa/pipe/softpipe/sp_tex_sample.c
index 5e215c433a..325bdb86da 100644
--- a/src/mesa/pipe/softpipe/sp_tex_sample.c
+++ b/src/mesa/pipe/softpipe/sp_tex_sample.c
@@ -449,7 +449,6 @@ compute_lambda(struct tgsi_sampler *sampler,
    }
 
    lambda = LOG2(rho);
-
    lambda += lodbias + sampler->state->lod_bias;
    lambda = CLAMP(lambda, sampler->state->min_lod, sampler->state->max_lod);
 
@@ -457,7 +456,6 @@ compute_lambda(struct tgsi_sampler *sampler,
 }
 
 
-
 /**
  * Do several things here:
  * 1. Compute lambda from the texcoords, if needed
@@ -477,7 +475,7 @@ choose_mipmap_levels(struct tgsi_sampler *sampler,
    if (sampler->state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
       /* no mipmap selection needed */
       *imgFilter = sampler->state->mag_img_filter;
-      *level0 = *level1 = sampler->texture->first_level;
+      *level0 = *level1 = (int) sampler->state->min_lod;
    }
    else {
       float lambda;
@@ -492,7 +490,7 @@ choose_mipmap_levels(struct tgsi_sampler *sampler,
       if (lambda < 0.0) { /* XXX threshold depends on the filter */
          /* magnifying */
          *imgFilter = sampler->state->mag_img_filter;
-         *level0 = *level1 = sampler->texture->first_level;
+         *level0 = *level1 = 0;
       }
       else {
          /* minifying */
@@ -503,19 +501,13 @@ choose_mipmap_levels(struct tgsi_sampler *sampler,
             /* Nearest mipmap level */
             const int lvl = (int) (lambda + 0.5);
             *level0 =
-            *level1 = CLAMP(lvl,
-                            (int) sampler->texture->first_level,
-                            (int) sampler->texture->last_level);
+            *level1 = CLAMP(lvl, 0, (int) sampler->texture->last_level);
          }
          else {
             /* Linear interpolation between mipmap levels */
             const int lvl = (int) lambda;
-            *level0 = CLAMP(lvl,
-                            (int) sampler->texture->first_level,
-                            (int) sampler->texture->last_level);
-            *level1 = CLAMP(lvl + 1,
-                            (int) sampler->texture->first_level,
-                            (int) sampler->texture->last_level);
+            *level0 = CLAMP(lvl,     0, (int) sampler->texture->last_level);
+            *level1 = CLAMP(lvl + 1, 0, (int) sampler->texture->last_level);
             *levelBlend = FRAC(lambda);  /* blending weight between levels */
          }
       }
diff --git a/src/mesa/pipe/softpipe/sp_texture.c b/src/mesa/pipe/softpipe/sp_texture.c
index fd2cc3dbbb..6de7a9b543 100644
--- a/src/mesa/pipe/softpipe/sp_texture.c
+++ b/src/mesa/pipe/softpipe/sp_texture.c
@@ -61,7 +61,7 @@ softpipe_texture_layout(struct softpipe_texture * spt)
 
    spt->buffer_size = 0;
 
-   for ( level = pt->first_level ; level <= pt->last_level ; level++ ) {
+   for (level = 0; level <= pt->last_level; level++) {
       pt->width[level] = width;
       pt->height[level] = height;
       pt->depth[level] = depth;
@@ -139,6 +139,8 @@ softpipe_get_tex_surface(struct pipe_context *pipe,
    struct softpipe_texture *spt = softpipe_texture(pt);
    struct pipe_surface *ps;
 
+   assert(level <= pt->last_level);
+
    ps = pipe->winsys->surface_alloc(pipe->winsys);
    if (ps) {
       assert(ps->refcount);
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 7099ec33b9..f012b2f1a0 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -57,15 +57,10 @@ struct st_texture_object
 {
    struct gl_texture_object base;       /* The "parent" object */
 
-   /* The texture must include at least these levels once validated:
+   /* The texture must include at levels [0..lastLevel] once validated:
     */
-   GLuint firstLevel;
    GLuint lastLevel;
 
-   /* Offset for firstLevel image:
-    */
-   GLuint textureOffset;
-
    /* On validation any active images held in main memory or in other
     * textures will be copied to this texture and the old storage freed.
     */
@@ -585,12 +580,12 @@ st_TexImage(GLcontext * ctx,
       _mesa_align_free(texImage->Data);
    }
 
-   /* If this is the only texture image in the texture, could call
+   /* If this is the only mipmap level in the texture, could call
     * bmBufferData with NULL data to free the old block and avoid
     * waiting on any outstanding fences.
     */
    if (stObj->pt &&
-       stObj->pt->first_level == level &&
+       /*stObj->pt->first_level == level &&*/
        stObj->pt->last_level == level &&
        stObj->pt->target != PIPE_TEXTURE_CUBE &&
        !st_texture_match_image(stObj->pt, &stImage->base,
@@ -1363,13 +1358,8 @@ calculate_first_last_level(struct st_texture_object *stObj)
          firstLevel = lastLevel = tObj->BaseLevel;
       }
       else {
-         firstLevel = tObj->BaseLevel + (GLint) (tObj->MinLod + 0.5);
-         firstLevel = MAX2(firstLevel, tObj->BaseLevel);
-         lastLevel = tObj->BaseLevel + (GLint) (tObj->MaxLod + 0.5);
-         lastLevel = MAX2(lastLevel, tObj->BaseLevel);
-         lastLevel = MIN2(lastLevel, tObj->BaseLevel + baseImage->MaxLog2);
-         lastLevel = MIN2(lastLevel, tObj->MaxLevel);
-         lastLevel = MAX2(firstLevel, lastLevel);       /* need at least one level */
+         firstLevel = 0;
+         lastLevel = MIN2(tObj->MaxLevel - tObj->BaseLevel, baseImage->MaxLog2);
       }
       break;
    case GL_TEXTURE_RECTANGLE_NV:
@@ -1380,8 +1370,6 @@ calculate_first_last_level(struct st_texture_object *stObj)
       return;
    }
 
-   /* save these values */
-   stObj->firstLevel = firstLevel;
    stObj->lastLevel = lastLevel;
 }
 
@@ -1389,15 +1377,16 @@ calculate_first_last_level(struct st_texture_object *stObj)
 static void
 copy_image_data_to_texture(struct st_context *st,
 			   struct st_texture_object *stObj,
+                           GLuint dstLevel,
 			   struct st_texture_image *stImage)
 {
    if (stImage->pt) {
       /* Copy potentially with the blitter:
        */
       st_texture_image_copy(st->pipe,
-                            stObj->pt,  /* dest texture */
-                            stImage->face, stImage->level,
-                            stImage->pt /* src texture */
+                            stObj->pt, dstLevel,  /* dest texture, level */
+                            stImage->pt, /* src texture */
+                            stImage->face
                             );
 
       st->pipe->texture_release(st->pipe, &stImage->pt);
@@ -1438,7 +1427,7 @@ st_finalize_texture(GLcontext *ctx,
    const GLuint nr_faces = (stObj->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
    int comp_byte = 0;
    int cpp;
-   GLuint face, i;
+   GLuint face;
    struct st_texture_image *firstImage;
 
    *needFlush = GL_FALSE;
@@ -1450,7 +1439,7 @@ st_finalize_texture(GLcontext *ctx,
    /* What levels must the texture include at a minimum?
     */
    calculate_first_last_level(stObj);
-   firstImage = st_texture_image(stObj->base.Image[0][stObj->firstLevel]);
+   firstImage = st_texture_image(stObj->base.Image[0][stObj->base.BaseLevel]);
 
    /* Fallback case:
     */
@@ -1469,7 +1458,6 @@ st_finalize_texture(GLcontext *ctx,
     */
    if (firstImage->pt &&
        firstImage->pt != stObj->pt &&
-       firstImage->pt->first_level <= stObj->firstLevel &&
        firstImage->pt->last_level >= stObj->lastLevel) {
 
       if (stObj->pt)
@@ -1488,18 +1476,11 @@ st_finalize_texture(GLcontext *ctx,
 
    /* Check texture can hold all active levels.  Check texture matches
     * target, imageFormat, etc.
-    * 
-    * XXX: For some layouts (eg i945?), the test might have to be
-    * first_level == firstLevel, as the texture isn't valid except at the
-    * original start level.  Hope to get around this by
-    * programming minLod, maxLod, baseLevel into the hardware and
-    * leaving the texture alone.
     */
    if (stObj->pt &&
        (stObj->pt->target != gl_target_to_pipe(stObj->base.Target) ||
 	stObj->pt->format !=
 	st_mesa_format_to_pipe_format(firstImage->base.TexFormat->MesaFormat) ||
-	stObj->pt->first_level != stObj->firstLevel ||
 	stObj->pt->last_level != stObj->lastLevel ||
 	stObj->pt->width[0] != firstImage->base.Width ||
 	stObj->pt->height[0] != firstImage->base.Height ||
@@ -1516,7 +1497,7 @@ st_finalize_texture(GLcontext *ctx,
       stObj->pt = st_texture_create(ctx->st,
                                     gl_target_to_pipe(stObj->base.Target),
                                     st_mesa_format_to_pipe_format(firstImage->base.TexFormat->MesaFormat),
-                                    stObj->firstLevel,
+                                    0, /* first level */
                                     stObj->lastLevel,
                                     firstImage->base.Width,
                                     firstImage->base.Height,
@@ -1527,14 +1508,16 @@ st_finalize_texture(GLcontext *ctx,
    /* Pull in any images not in the object's texture:
     */
    for (face = 0; face < nr_faces; face++) {
-      for (i = stObj->firstLevel; i <= stObj->lastLevel; i++) {
+      GLuint level;
+      for (level = 0; level <= stObj->lastLevel; level++) {
          struct st_texture_image *stImage =
-            st_texture_image(stObj->base.Image[face][i]);
+            //st_texture_image(stObj->base.Image[face][level]);
+            st_texture_image(stObj->base.Image[face][stObj->base.BaseLevel + level]);
 
          /* Need to import images in main memory or held in other textures.
           */
          if (stObj->pt != stImage->pt) {
-            copy_image_data_to_texture(ctx->st, stObj, stImage);
+            copy_image_data_to_texture(ctx->st, stObj, level, stImage);
 	    *needFlush = GL_TRUE;
          }
       }
diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
index c152c59905..fd7d8cefea 100644
--- a/src/mesa/state_tracker/st_gen_mipmap.c
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -51,7 +51,6 @@
 static void *blend_cso = NULL;
 static void *depthstencil_cso = NULL;
 static void *rasterizer_cso = NULL;
-static void *sampler_cso = NULL;
 
 static struct st_fragment_program *stfp = NULL;
 static struct st_vertex_program *stvp = NULL;
@@ -118,7 +117,6 @@ st_init_generate_mipmap(struct st_context *st)
    struct pipe_context *pipe = st->pipe;
    struct pipe_blend_state blend;
    struct pipe_rasterizer_state rasterizer;
-   struct pipe_sampler_state sampler;
    struct pipe_depth_stencil_alpha_state depthstencil;
 
    assert(!blend_cso);
@@ -133,16 +131,6 @@ st_init_generate_mipmap(struct st_context *st)
    memset(&rasterizer, 0, sizeof(rasterizer));
    rasterizer_cso = pipe->create_rasterizer_state(pipe, &rasterizer);
 
-   memset(&sampler, 0, sizeof(sampler));
-   sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-   sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-   sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-   sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
-   sampler.min_img_filter = PIPE_TEX_FILTER_LINEAR;
-   sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
-   sampler.normalized_coords = 1;
-   sampler_cso = pipe->create_sampler_state(pipe, &sampler);
-
    stfp = make_tex_fragment_program(st->ctx);
    stvp = st_make_passthrough_vertex_shader(st, GL_FALSE);
 }
@@ -156,14 +144,12 @@ st_destroy_generate_mipmpap(struct st_context *st)
    pipe->delete_blend_state(pipe, blend_cso);
    pipe->delete_depth_stencil_alpha_state(pipe, depthstencil_cso);
    pipe->delete_rasterizer_state(pipe, rasterizer_cso);
-   pipe->delete_sampler_state(pipe, sampler_cso);
 
    /* XXX free stfp, stvp */
 
    blend_cso = NULL;
    depthstencil_cso = NULL;
    rasterizer_cso = NULL;
-   sampler_cso = NULL;
 }
 
 
@@ -248,8 +234,10 @@ st_render_mipmap(struct st_context *st,
 {
    struct pipe_context *pipe = st->pipe;
    struct pipe_framebuffer_state fb;
+   struct pipe_sampler_state sampler;
+   void *sampler_cso;
    const uint face = _mesa_tex_target_to_face(target), zslice = 0;
-   const uint first_level_save = pt->first_level;
+   /*const uint first_level_save = pt->first_level;*/
    uint dstLevel;
 
    assert(target != GL_TEXTURE_3D); /* not done yet */
@@ -263,11 +251,21 @@ st_render_mipmap(struct st_context *st,
    memset(&fb, 0, sizeof(fb));
    fb.num_cbufs = 1;
 
+   /* sampler state */
+   memset(&sampler, 0, sizeof(sampler));
+   sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+   sampler.min_img_filter = PIPE_TEX_FILTER_LINEAR;
+   sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
+   sampler.normalized_coords = 1;
+
+
    /* bind CSOs */
    pipe->bind_blend_state(pipe, blend_cso);
    pipe->bind_depth_stencil_alpha_state(pipe, depthstencil_cso);
    pipe->bind_rasterizer_state(pipe, rasterizer_cso);
-   pipe->bind_sampler_state(pipe, 0, sampler_cso);
 
    /* bind shaders */
    pipe->bind_fs_state(pipe, stfp->fs->data);
@@ -286,20 +284,29 @@ st_render_mipmap(struct st_context *st,
       fb.cbufs[0] = pipe->get_tex_surface(pipe, pt, face, dstLevel, zslice);
       pipe->set_framebuffer_state(pipe, &fb);
 
+      /*
+       * Setup sampler state
+       */
+      sampler.min_lod = sampler.max_lod = srcLevel;
+      sampler_cso = pipe->create_sampler_state(pipe, &sampler);
+      pipe->bind_sampler_state(pipe, 0, sampler_cso);
+
       simple_viewport(pipe, pt->width[dstLevel], pt->height[dstLevel]);
 
       /*
        * Setup src texture, override pt->first_level so we sample from
        * the right mipmap level.
        */
-      pt->first_level = srcLevel;
+      /*pt->first_level = srcLevel;*/
       pipe->set_sampler_texture(pipe, 0, pt);
 
       draw_quad(st->ctx);
+
+      pipe->delete_sampler_state(pipe, sampler_cso);
    }
 
    /* restore first_level */
-   pt->first_level = first_level_save;
+   /*pt->first_level = first_level_save;*/
 
    /* restore pipe state */
    if (st->state.rasterizer)
diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
index 844a9f80d8..ff33da6f9e 100644
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -62,6 +62,10 @@ target_to_target(GLenum target)
 
 /**
  * Allocate a new pipe_texture object
+ * width0, height0, depth0 are the dimensions of the level 0 image
+ * (the highest resolution).  last_level indicates how many mipmap levels
+ * to allocate storage for.  For non-mipmapped textures, this will be zero.
+ * XXX first_level obsolete
  */
 struct pipe_texture *
 st_texture_create(struct st_context *st,
@@ -84,9 +88,9 @@ st_texture_create(struct st_context *st,
 
    assert(format);
 
+   memset(&pt, 0, sizeof(pt));
    pt.target = target;
    pt.format = format;
-   pt.first_level = first_level;
    pt.last_level = last_level;
    pt.width[0] = width0;
    pt.height[0] = height0;
@@ -266,23 +270,36 @@ st_texture_image_data(struct pipe_context *pipe,
  */
 void
 st_texture_image_copy(struct pipe_context *pipe,
-                      struct pipe_texture *dst,
-                      GLuint face, GLuint level,
-                      struct pipe_texture *src)
+                      struct pipe_texture *dst, GLuint dstLevel,
+                      struct pipe_texture *src,
+                      GLuint face)
 {
-   GLuint width = src->width[level];
-   GLuint height = src->height[level];
-   GLuint depth = src->depth[level];
+   GLuint width = dst->width[dstLevel];
+   GLuint height = dst->height[dstLevel];
+   GLuint depth = dst->depth[dstLevel];
    struct pipe_surface *src_surface;
    struct pipe_surface *dst_surface;
    GLuint i;
 
+   /* XXX this is a hack */
    if (dst->compressed)
       height /= 4;
 
    for (i = 0; i < depth; i++) {
-      dst_surface = pipe->get_tex_surface(pipe, dst, face, level, i);
-      src_surface = pipe->get_tex_surface(pipe, src, face, level, i);
+      GLuint srcLevel;
+
+      /* find src texture level of needed size */
+      for (srcLevel = 0; srcLevel <= src->last_level; srcLevel++) {
+         if (src->width[srcLevel] == width &&
+             src->height[srcLevel] == height) {
+            break;
+         }
+      }
+      assert(src->width[srcLevel] == width);
+      assert(src->height[srcLevel] == height);
+
+      dst_surface = pipe->get_tex_surface(pipe, dst, face, dstLevel, i);
+      src_surface = pipe->get_tex_surface(pipe, src, face, srcLevel, i);
 
       pipe->surface_copy(pipe,
 			 dst_surface,
diff --git a/src/mesa/state_tracker/st_texture.h b/src/mesa/state_tracker/st_texture.h
index 0b87a494c3..6c5f0930fa 100644
--- a/src/mesa/state_tracker/st_texture.h
+++ b/src/mesa/state_tracker/st_texture.h
@@ -98,9 +98,9 @@ st_texture_image_data(struct pipe_context *pipe,
  */
 extern void
 st_texture_image_copy(struct pipe_context *pipe,
-                      struct pipe_texture *dst,
-                      GLuint face, GLuint level,
-                      struct pipe_texture *src);
+                      struct pipe_texture *dst, GLuint dstLevel,
+                      struct pipe_texture *src,
+                      GLuint face);
 
 
 #endif
-- 
cgit v1.2.3


From 7d99bac7d6e905b8851216f7d74a583e0f087e1b Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 12 Feb 2008 14:55:51 -0700
Subject: gallium: remove unused first_level param from st_texture_create()

---
 src/mesa/state_tracker/st_cb_drawpixels.c | 6 +++---
 src/mesa/state_tracker/st_cb_texture.c    | 2 --
 src/mesa/state_tracker/st_texture.c       | 6 ++----
 src/mesa/state_tracker/st_texture.h       | 1 -
 4 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 07886e7982..475e23653e 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -471,7 +471,7 @@ make_texture(struct st_context *st,
    assert(pipeFormat);
    cpp = st_sizeof_format(pipeFormat);
 
-   pt = st_texture_create(st, PIPE_TEXTURE_2D, pipeFormat, 0, 0, width, height,
+   pt = st_texture_create(st, PIPE_TEXTURE_2D, pipeFormat, 0, width, height,
 			  1, 0);
    if (!pt)
       return NULL;
@@ -1017,7 +1017,7 @@ make_bitmap_texture(GLcontext *ctx, GLsizei width, GLsizei height,
    /**
     * Create a texture.
     */
-   pt = st_texture_create(ctx->st, PIPE_TEXTURE_2D, format, 0, 0, width, height,
+   pt = st_texture_create(ctx->st, PIPE_TEXTURE_2D, format, 0, width, height,
 			  1, 0);
    if (!pt)
       return NULL;
@@ -1241,7 +1241,7 @@ st_CopyPixels(GLcontext *ctx, GLint srcx, GLint srcy,
    psRead = rbRead->surface;
    format = psRead->format;
 
-   pt = st_texture_create(ctx->st, PIPE_TEXTURE_2D, format, 0, 0, width, height,
+   pt = st_texture_create(ctx->st, PIPE_TEXTURE_2D, format, 0, width, height,
 			  1, 0);
    if (!pt)
       return;
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index f012b2f1a0..fab9eafc7f 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -363,7 +363,6 @@ guess_and_alloc_texture(struct st_context *st,
    stObj->pt = st_texture_create(st,
                                  gl_target_to_pipe(stObj->base.Target),
                                  st_mesa_format_to_pipe_format(stImage->base.TexFormat->MesaFormat),
-                                 firstLevel,
                                  lastLevel,
                                  width,
                                  height,
@@ -1497,7 +1496,6 @@ st_finalize_texture(GLcontext *ctx,
       stObj->pt = st_texture_create(ctx->st,
                                     gl_target_to_pipe(stObj->base.Target),
                                     st_mesa_format_to_pipe_format(firstImage->base.TexFormat->MesaFormat),
-                                    0, /* first level */
                                     stObj->lastLevel,
                                     firstImage->base.Width,
                                     firstImage->base.Height,
diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
index ff33da6f9e..2622d00953 100644
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -65,13 +65,11 @@ target_to_target(GLenum target)
  * width0, height0, depth0 are the dimensions of the level 0 image
  * (the highest resolution).  last_level indicates how many mipmap levels
  * to allocate storage for.  For non-mipmapped textures, this will be zero.
- * XXX first_level obsolete
  */
 struct pipe_texture *
 st_texture_create(struct st_context *st,
                   enum pipe_texture_target target,
 		  enum pipe_format format,
-		  GLuint first_level,
 		  GLuint last_level,
 		  GLuint width0,
 		  GLuint height0,
@@ -82,9 +80,9 @@ st_texture_create(struct st_context *st,
 
    assert(target <= PIPE_TEXTURE_CUBE);
 
-   DBG("%s target %s format %s level %d..%d\n", __FUNCTION__,
+   DBG("%s target %s format %s last_level %d\n", __FUNCTION__,
        _mesa_lookup_enum_by_nr(target),
-       _mesa_lookup_enum_by_nr(format), first_level, last_level);
+       _mesa_lookup_enum_by_nr(format), last_level);
 
    assert(format);
 
diff --git a/src/mesa/state_tracker/st_texture.h b/src/mesa/state_tracker/st_texture.h
index 6c5f0930fa..72324cd9ab 100644
--- a/src/mesa/state_tracker/st_texture.h
+++ b/src/mesa/state_tracker/st_texture.h
@@ -39,7 +39,6 @@ extern struct pipe_texture *
 st_texture_create(struct st_context *st,
                   enum pipe_texture_target target,
 		  enum pipe_format format,
-                  GLuint first_level,
                   GLuint last_level,
                   GLuint width0,
                   GLuint height0,
-- 
cgit v1.2.3


From 7eb2cd3427940c914d2bbc0c1e901b5c81ff50d5 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 12 Feb 2008 14:59:07 -0700
Subject: tweak increment/decrement amounts for keyboard options

---
 progs/tests/mipmap_limits.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/progs/tests/mipmap_limits.c b/progs/tests/mipmap_limits.c
index dc066cab1f..b0a3be99ff 100644
--- a/progs/tests/mipmap_limits.c
+++ b/progs/tests/mipmap_limits.c
@@ -194,22 +194,22 @@ key(unsigned char k, int x, int y)
         MaxLevel = 10;
      break;
   case 'l':
-     LodBias -= 0.02;
+     LodBias -= 0.25;
      break;
   case 'L':
-     LodBias += 0.02;
+     LodBias += 0.25;
      break;
   case 'n':
-     MinLod -= 0.02;
+     MinLod -= 0.25;
      break;
   case 'N':
-     MinLod += 0.02;
+     MinLod += 0.25;
      break;
   case 'x':
-     MaxLod -= 0.02;
+     MaxLod -= 0.25;
      break;
   case 'X':
-     MaxLod += 0.02;
+     MaxLod += 0.25;
      break;
   case 'f':
      NearestFilter = !NearestFilter;
-- 
cgit v1.2.3


From 5b80529b6081bcff42ec20a096506c441729d39c Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 11 Feb 2008 09:46:10 -0700
Subject: press 0,1,2,etc keys for specific bias values

---
 progs/demos/lodbias.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/progs/demos/lodbias.c b/progs/demos/lodbias.c
index a4db22e26e..c5a2a1b457 100644
--- a/progs/demos/lodbias.c
+++ b/progs/demos/lodbias.c
@@ -159,6 +159,18 @@ static void Key( unsigned char key, int x, int y )
       case 'B':
          Bias += 10;
          break;
+      case '0':
+      case '1':
+      case '2':
+      case '3':
+      case '4':
+      case '5':
+      case '6':
+      case '7':
+      case '8':
+      case '9':
+         Bias = 100.0 * (key - '0');
+         break;
       case 27:
          exit(0);
          break;
-- 
cgit v1.2.3


From 7057a031f196f677366d0d397951aa87932ac887 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 12 Feb 2008 15:53:37 -0700
Subject: Set Min/MaxLod with glTexParameterf, not glTexParameteri

---
 progs/tests/mipmap_limits.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/progs/tests/mipmap_limits.c b/progs/tests/mipmap_limits.c
index b0a3be99ff..d6d6e467b0 100644
--- a/progs/tests/mipmap_limits.c
+++ b/progs/tests/mipmap_limits.c
@@ -131,8 +131,8 @@ static void display(void)
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, BaseLevel);
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, MaxLevel);
 
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_LOD, MinLod);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LOD, MaxLod);
+    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_LOD, MinLod);
+    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_LOD, MaxLod);
 
     if (NearestFilter) {
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-- 
cgit v1.2.3


From aa31fe3b54592b1c017e0389de990040f2ad18f4 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 12 Feb 2008 16:06:51 -0700
Subject: gallium: move gen-mipmap global/static vars into st_context

This fixes potential problems with multi-context programs.
---
 src/mesa/state_tracker/st_context.h    |  9 ++++++++
 src/mesa/state_tracker/st_gen_mipmap.c | 41 +++++++++++-----------------------
 2 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index a756055898..59d1590f05 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -166,6 +166,15 @@ struct st_context
       struct st_fragment_program *combined_prog;
    } bitmap;
 
+   /** For gen/render mipmap feature */
+   struct {
+      void *blend_cso;
+      void *depthstencil_cso;
+      void *rasterizer_cso;
+      struct st_fragment_program *stfp;
+      struct st_vertex_program *stvp;
+   } gen_mipmap;
+
    struct cso_cache *cache;
 };
 
diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
index fd7d8cefea..b4a21fd7e2 100644
--- a/src/mesa/state_tracker/st_gen_mipmap.c
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -48,15 +48,6 @@
 
 
-static void *blend_cso = NULL;
-static void *depthstencil_cso = NULL;
-static void *rasterizer_cso = NULL;
-
-static struct st_fragment_program *stfp = NULL;
-static struct st_vertex_program *stvp = NULL;
-
-
-
 static struct st_fragment_program *
 make_tex_fragment_program(GLcontext *ctx)
 {
@@ -119,20 +110,18 @@ st_init_generate_mipmap(struct st_context *st)
    struct pipe_rasterizer_state rasterizer;
    struct pipe_depth_stencil_alpha_state depthstencil;
 
-   assert(!blend_cso);
-
    memset(&blend, 0, sizeof(blend));
    blend.colormask = PIPE_MASK_RGBA;
-   blend_cso = pipe->create_blend_state(pipe, &blend);
+   st->gen_mipmap.blend_cso = pipe->create_blend_state(pipe, &blend);
 
    memset(&depthstencil, 0, sizeof(depthstencil));
-   depthstencil_cso = pipe->create_depth_stencil_alpha_state(pipe, &depthstencil);
+   st->gen_mipmap.depthstencil_cso = pipe->create_depth_stencil_alpha_state(pipe, &depthstencil);
 
    memset(&rasterizer, 0, sizeof(rasterizer));
-   rasterizer_cso = pipe->create_rasterizer_state(pipe, &rasterizer);
+   st->gen_mipmap.rasterizer_cso = pipe->create_rasterizer_state(pipe, &rasterizer);
 
-   stfp = make_tex_fragment_program(st->ctx);
-   stvp = st_make_passthrough_vertex_shader(st, GL_FALSE);
+   st->gen_mipmap.stfp = make_tex_fragment_program(st->ctx);
+   st->gen_mipmap.stvp = st_make_passthrough_vertex_shader(st, GL_FALSE);
 }
 
 
@@ -141,15 +130,11 @@ st_destroy_generate_mipmpap(struct st_context *st)
 {
    struct pipe_context *pipe = st->pipe;
 
-   pipe->delete_blend_state(pipe, blend_cso);
-   pipe->delete_depth_stencil_alpha_state(pipe, depthstencil_cso);
-   pipe->delete_rasterizer_state(pipe, rasterizer_cso);
+   pipe->delete_blend_state(pipe, st->gen_mipmap.blend_cso);
+   pipe->delete_depth_stencil_alpha_state(pipe, st->gen_mipmap.depthstencil_cso);
+   pipe->delete_rasterizer_state(pipe, st->gen_mipmap.rasterizer_cso);
 
    /* XXX free stfp, stvp */
-
-   blend_cso = NULL;
-   depthstencil_cso = NULL;
-   rasterizer_cso = NULL;
 }
 
 
@@ -263,13 +248,13 @@ st_render_mipmap(struct st_context *st,
 
 
    /* bind CSOs */
-   pipe->bind_blend_state(pipe, blend_cso);
-   pipe->bind_depth_stencil_alpha_state(pipe, depthstencil_cso);
-   pipe->bind_rasterizer_state(pipe, rasterizer_cso);
+   pipe->bind_blend_state(pipe, st->gen_mipmap.blend_cso);
+   pipe->bind_depth_stencil_alpha_state(pipe, st->gen_mipmap.depthstencil_cso);
+   pipe->bind_rasterizer_state(pipe, st->gen_mipmap.rasterizer_cso);
 
    /* bind shaders */
-   pipe->bind_fs_state(pipe, stfp->fs->data);
-   pipe->bind_vs_state(pipe, stvp->cso->data);
+   pipe->bind_fs_state(pipe, st->gen_mipmap.stfp->fs->data);
+   pipe->bind_vs_state(pipe, st->gen_mipmap.stvp->cso->data);
 
    /*
     * XXX for small mipmap levels, it may be faster to use the software
-- 
cgit v1.2.3


From bbdbdaddb0b476ec347c100e20469b0c52c5d525 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 12 Feb 2008 16:10:11 -0700
Subject: gallium: rename st_fragment_program's fs field to cso to match
 st_vertex_program

---
 src/mesa/state_tracker/st_atom_shader.c   | 8 ++++----
 src/mesa/state_tracker/st_cb_clear.c      | 2 +-
 src/mesa/state_tracker/st_cb_drawpixels.c | 2 +-
 src/mesa/state_tracker/st_cb_program.c    | 4 ++--
 src/mesa/state_tracker/st_gen_mipmap.c    | 2 +-
 src/mesa/state_tracker/st_program.c       | 2 +-
 src/mesa/state_tracker/st_program.h       | 2 +-
 7 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 9196918509..2c6ec8421b 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -158,7 +158,7 @@ find_translated_vp(struct st_context *st,
    /*
     * Translate fragment program if needed.
     */
-   if (!stfp->fs) {
+   if (!stfp->cso) {
       GLuint inAttr, numIn = 0;
 
       for (inAttr = 0; inAttr < FRAG_ATTRIB_MAX; inAttr++) {
@@ -179,7 +179,7 @@ find_translated_vp(struct st_context *st,
                                            stfp->input_to_slot,
                                            stfp->tokens,
                                            ST_MAX_SHADER_TOKENS);
-      assert(stfp->fs);
+      assert(stfp->cso);
    }
 
 
@@ -227,7 +227,7 @@ find_translated_vp(struct st_context *st,
             if (fpInAttrib >= 0) {
                GLuint fpInSlot = stfp->input_to_slot[fpInAttrib];
                if (fpInSlot != ~0) {
-                  GLuint vpOutSlot = stfp->fs->state.input_map[fpInSlot];
+                  GLuint vpOutSlot = stfp->cso->state.input_map[fpInSlot];
                   xvp->output_to_slot[outAttr] = vpOutSlot;
                   numVpOuts++;
                }
@@ -300,7 +300,7 @@ update_linkage( struct st_context *st )
    st->pipe->bind_vs_state(st->pipe, st->state.vs->cso->data);
 
    st->fp = stfp;
-   st->state.fs = stfp->fs;
+   st->state.fs = stfp->cso;
    st->pipe->bind_fs_state(st->pipe, st->state.fs->data);
 
    st->vertex_result_to_slot = xvp->output_to_slot;
diff --git a/src/mesa/state_tracker/st_cb_clear.c b/src/mesa/state_tracker/st_cb_clear.c
index ab98b54bab..410062e1e8 100644
--- a/src/mesa/state_tracker/st_cb_clear.c
+++ b/src/mesa/state_tracker/st_cb_clear.c
@@ -341,7 +341,7 @@ clear_with_quad(GLcontext *ctx,
       if (!stfp) {
          stfp = make_frag_shader(st);
       }
-      pipe->bind_fs_state(pipe, stfp->fs->data);
+      pipe->bind_fs_state(pipe, stfp->cso->data);
    }
 
    /* vertex shader state: color/position pass-through */
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 475e23653e..3245a7488b 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -665,7 +665,7 @@ draw_textured_quad(GLcontext *ctx, GLint x, GLint y, GLfloat z,
    }
 
    /* fragment shader state: TEX lookup program */
-   pipe->bind_fs_state(pipe, stfp->fs->data);
+   pipe->bind_fs_state(pipe, stfp->cso->data);
 
    /* vertex shader state: position + texcoord pass-through */
    pipe->bind_vs_state(pipe, stvp->cso->data);
diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c
index f1f33fb0dd..af3ee65504 100644
--- a/src/mesa/state_tracker/st_cb_program.c
+++ b/src/mesa/state_tracker/st_cb_program.c
@@ -168,10 +168,10 @@ static void st_program_string_notify( GLcontext *ctx,
 
       stfp->serialNo++;
 
-      if (stfp->fs) {
+      if (stfp->cso) {
          /* free the TGSI code */
          // cso_delete(stfp->vs);
-         stfp->fs = NULL;
+         stfp->cso = NULL;
       }
 
       stfp->param_state = stfp->Base.Base.Parameters->StateFlags;
diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
index b4a21fd7e2..459941cca8 100644
--- a/src/mesa/state_tracker/st_gen_mipmap.c
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -253,7 +253,7 @@ st_render_mipmap(struct st_context *st,
    pipe->bind_rasterizer_state(pipe, st->gen_mipmap.rasterizer_cso);
 
    /* bind shaders */
-   pipe->bind_fs_state(pipe, st->gen_mipmap.stfp->fs->data);
+   pipe->bind_fs_state(pipe, st->gen_mipmap.stfp->cso->data);
    pipe->bind_vs_state(pipe, st->gen_mipmap.stvp->cso->data);
 
    /*
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 84a9094001..c8297baded 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -420,7 +420,7 @@ st_translate_fragment_program(struct st_context *st,
    fs.tokens = tokensOut;
 
    cso = st_cached_fs_state(st, &fs);
-   stfp->fs = cso;
+   stfp->cso = cso;
 
    if (0)
       _mesa_print_program(&stfp->Base.Base);
diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h
index de02c3185f..ea1dde4a7a 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -62,7 +62,7 @@ struct st_fragment_program
    struct tgsi_token tokens[ST_MAX_SHADER_TOKENS];
 
    /** Pointer to the corresponding cached shader */
-   const struct cso_fragment_shader *fs;
+   const struct cso_fragment_shader *cso;
 
    GLuint param_state;
 
-- 
cgit v1.2.3


From c0a22da1570b104fb6d2ee5e620906b01d194165 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 12 Feb 2008 16:37:49 -0700
Subject: gallium: added null ptr check

---
 src/mesa/state_tracker/st_cb_texture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index fab9eafc7f..c6fe928d47 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -1514,7 +1514,7 @@ st_finalize_texture(GLcontext *ctx,
 
          /* Need to import images in main memory or held in other textures.
           */
-         if (stObj->pt != stImage->pt) {
+         if (stImage && stObj->pt != stImage->pt) {
             copy_image_data_to_texture(ctx->st, stObj, level, stImage);
 	    *needFlush = GL_TRUE;
          }
-- 
cgit v1.2.3


From dee8e268f77d31e78fd76005d529ea3b61e41209 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 12 Feb 2008 16:38:56 -0700
Subject: gallium: remove dead code

---
 src/mesa/state_tracker/st_cb_texture.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index c6fe928d47..0ea367549b 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -1509,7 +1509,6 @@ st_finalize_texture(GLcontext *ctx,
       GLuint level;
       for (level = 0; level <= stObj->lastLevel; level++) {
          struct st_texture_image *stImage =
-            //st_texture_image(stObj->base.Image[face][level]);
             st_texture_image(stObj->base.Image[face][stObj->base.BaseLevel + level]);
 
          /* Need to import images in main memory or held in other textures.
-- 
cgit v1.2.3


From 2ef9df660c0ee06aa0ea13663d0706cc03fecbb7 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 12 Feb 2008 16:56:18 -0700
Subject: Fix broken test.

As-is, if the texture was too large for the target, an assertion would fail.
Now check proxy texture first and if it works, then test non-proxy target.
---
 progs/tests/arbnpot.c | 57 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 40 insertions(+), 17 deletions(-)

diff --git a/progs/tests/arbnpot.c b/progs/tests/arbnpot.c
index 8107717e27..05ba85dad9 100644
--- a/progs/tests/arbnpot.c
+++ b/progs/tests/arbnpot.c
@@ -113,44 +113,67 @@ static void Init( void )
    minDim = imgWidth < imgHeight ? imgWidth : imgHeight;
 
    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
-   glTexImage1D(GL_TEXTURE_1D, 0, GL_RGB, imgWidth, 0,
-                imgFormat, GL_UNSIGNED_BYTE, image);
-   assert(glGetError() == GL_NO_ERROR);
 
+   /*
+    * 1D Texture.  Test proxy first, if that works, test non-proxy target.
+    */
    glTexImage1D(GL_PROXY_TEXTURE_1D, 0, GL_RGB, imgWidth, 0,
                 imgFormat, GL_UNSIGNED_BYTE, image);
    glGetTexLevelParameteriv(GL_PROXY_TEXTURE_1D, 0, GL_TEXTURE_WIDTH, &w);
-   assert(w == imgWidth);
+   assert(w == imgWidth || w == 0);
 
-   glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, imgWidth, imgHeight, 0,
-                imgFormat, GL_UNSIGNED_BYTE, image);
-   assert(glGetError() == GL_NO_ERROR);
+   if (w) {
+      glTexImage1D(GL_TEXTURE_1D, 0, GL_RGB, imgWidth, 0,
+                   imgFormat, GL_UNSIGNED_BYTE, image);
+      assert(glGetError() == GL_NO_ERROR);
+   }
 
+
+   /*
+    * 2D Texture
+    */
    glTexImage2D(GL_PROXY_TEXTURE_2D, 0, GL_RGB, imgWidth, imgHeight, 0,
                 imgFormat, GL_UNSIGNED_BYTE, image);
    glGetTexLevelParameteriv(GL_PROXY_TEXTURE_2D, 0, GL_TEXTURE_WIDTH, &w);
-   assert(w == imgWidth);
+   assert(w == imgWidth || w == 0);
 
-   glTexImage3D(GL_TEXTURE_3D, 0, GL_RGB, imgWidth, imgHeight, 1, 0,
-                imgFormat, GL_UNSIGNED_BYTE, image);
-   assert(glGetError() == GL_NO_ERROR);
+   if (w) {
+      glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, imgWidth, imgHeight, 0,
+                   imgFormat, GL_UNSIGNED_BYTE, image);
+      assert(glGetError() == GL_NO_ERROR);
+   }
 
+
+   /*
+    * 3D Texture
+    */
    glTexImage3D(GL_PROXY_TEXTURE_3D, 0, GL_RGB, imgWidth, imgHeight, 1, 0,
                 imgFormat, GL_UNSIGNED_BYTE, image);
    glGetTexLevelParameteriv(GL_PROXY_TEXTURE_3D, 0, GL_TEXTURE_WIDTH, &w);
-   assert(w == imgWidth);
+   assert(w == imgWidth || w == 0);
+
+   if (w) {
+      glTexImage3D(GL_TEXTURE_3D, 0, GL_RGB, imgWidth, imgHeight, 1, 0,
+                   imgFormat, GL_UNSIGNED_BYTE, image);
+      assert(glGetError() == GL_NO_ERROR);
+   }
 
-   glTexImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X, 0, GL_RGB,
-                minDim, minDim, 0,
-                imgFormat, GL_UNSIGNED_BYTE, image);
-   assert(glGetError() == GL_NO_ERROR);
 
+   /*
+    * Cube Texture
+    */
    glTexImage2D(GL_PROXY_TEXTURE_CUBE_MAP, 0, GL_RGB,
                 minDim, minDim, 0,
                 imgFormat, GL_UNSIGNED_BYTE, image);
    glGetTexLevelParameteriv(GL_PROXY_TEXTURE_CUBE_MAP, 0, GL_TEXTURE_WIDTH, &w);
-   assert(w == minDim);
+   assert(w == minDim || w == 0);
 
+   if (w) {
+      glTexImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X, 0, GL_RGB,
+                   minDim, minDim, 0,
+                   imgFormat, GL_UNSIGNED_BYTE, image);
+      assert(glGetError() == GL_NO_ERROR);
+   }
 
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-- 
cgit v1.2.3


From b487e4b4fb3d9304feedf910a2f137703a0e260b Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 12 Feb 2008 17:07:27 -0700
Subject: gallium: clamp min_lod so it's never negative

---
 src/mesa/state_tracker/st_atom_sampler.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_atom_sampler.c b/src/mesa/state_tracker/st_atom_sampler.c
index 6241e70b55..92263cb688 100644
--- a/src/mesa/state_tracker/st_atom_sampler.c
+++ b/src/mesa/state_tracker/st_atom_sampler.c
@@ -147,7 +147,7 @@ update_samplers(struct st_context *st)
 
          sampler.lod_bias = st->ctx->Texture.Unit[su].LodBias;
 #if 1
-         sampler.min_lod = texobj->MinLod;
+         sampler.min_lod = (texobj->MinLod) < 0.0 ? 0.0 : texobj->MinLod;
          sampler.max_lod = texobj->MaxLod;
 #else
          /* min/max lod should really be as follows (untested).
-- 
cgit v1.2.3


From 4ac85794b181fdc44f1d9727926c89c084ebb769 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 12 Feb 2008 17:44:57 -0700
Subject: gallium: fix/finish glReadPixels(GL_DEPTH_COMPONENT).

This fixes demos/reflect ('d' key) and tests/zreaddraw.c
---
 src/mesa/state_tracker/st_cb_readpixels.c | 99 ++++++++++++++++++++++++-------
 1 file changed, 78 insertions(+), 21 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c
index a1bbb3a831..868c5f3c5f 100644
--- a/src/mesa/state_tracker/st_cb_readpixels.c
+++ b/src/mesa/state_tracker/st_cb_readpixels.c
@@ -180,20 +180,13 @@ st_readpixels(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei height,
    if (!strb)
       return;
 
+
    if (format == GL_RGBA && type == GL_FLOAT) {
       /* write tile(row) directly into user's buffer */
       df = (GLfloat *) _mesa_image_address2d(&clippedPacking, dest, width,
                                              height, format, type, 0, 0);
       dfStride = width * 4;
    }
-#if 0
-   else if (format == GL_DEPTH_COMPONENT && type == GL_FLOAT) {
-      /* write tile(row) directly into user's buffer */
-      df = (GLfloat *) _mesa_image_address2d(&clippedPacking, dest, width,
-                                             height, format, type, 0, 0);
-      dfStride = width;
-   }
-#endif
    else {
       /* write tile(row) into temp row buffer */
       df = (GLfloat *) temp;
@@ -209,22 +202,86 @@ st_readpixels(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei height,
       yStep = 1;
    }
 
-   /* Do a row at a time to flip image data vertically */
-   for (i = 0; i < height; i++) {
-      pipe_get_tile_rgba(pipe, strb->surface, x, y, width, 1, df);
-      y += yStep;
-      df += dfStride;
-      if (!dfStride) {
-         /* convert GLfloat to user's format/type */
-         GLvoid *dst = _mesa_image_address2d(&clippedPacking, dest, width,
-                                             height, format, type, i, 0);
+   /*
+    * Copy pixels from pipe_surface to user memory
+    */
+   {
+      /* dest of first pixel in client memory */
+      GLubyte *dst = _mesa_image_address2d(&clippedPacking, dest, width,
+                                           height, format, type, 0, 0);
+      /* dest row stride */
+      const GLint dstStride = _mesa_image_row_stride(&clippedPacking, width,
+                                                     format, type);
+
+      if (strb->surface->format == PIPE_FORMAT_S8Z24_UNORM) {
          if (format == GL_DEPTH_COMPONENT) {
-            _mesa_pack_depth_span(ctx, width, dst, type,
-                                  (GLfloat *) temp, &clippedPacking);
+            for (i = 0; i < height; i++) {
+               GLuint ztemp[MAX_WIDTH], j;
+               GLfloat zfloat[MAX_WIDTH];
+               const double scale = 1.0 / ((1 << 24) - 1);
+               pipe_get_tile_raw(pipe, strb->surface, x, y,
+                                 width, 1, ztemp, 0);
+               y += yStep;
+               for (j = 0; j < width; j++) {
+                  zfloat[j] = (float) (scale * (ztemp[j] & 0xffffff));
+               }
+               _mesa_pack_depth_span(ctx, width, dst, type,
+                                     zfloat, &clippedPacking);
+               dst += dstStride;
+            }
          }
          else {
-            _mesa_pack_rgba_span_float(ctx, width, temp, format, type, dst,
-                                       &clippedPacking, transferOps);
+            /* untested, but simple: */
+            assert(format == GL_DEPTH_STENCIL_EXT);
+            for (i = 0; i < height; i++) {
+               pipe_get_tile_raw(pipe, strb->surface, x, y, width, 1, dst, 0);
+               y += yStep;
+               dst += dstStride;
+            }
+         }
+      }
+      else if (strb->surface->format == PIPE_FORMAT_Z16_UNORM) {
+         for (i = 0; i < height; i++) {
+            GLshort ztemp[MAX_WIDTH], j;
+            GLfloat zfloat[MAX_WIDTH];
+            const double scale = 1.0 / 0xffff;
+            pipe_get_tile_raw(pipe, strb->surface, x, y, width, 1, ztemp, 0);
+            y += yStep;
+            for (j = 0; j < width; j++) {
+               zfloat[j] = (float) (scale * ztemp[j]);
+            }
+            _mesa_pack_depth_span(ctx, width, dst, type,
+                                  zfloat, &clippedPacking);
+            dst += dstStride;
+         }
+      }
+      else if (strb->surface->format == PIPE_FORMAT_Z32_UNORM) {
+         for (i = 0; i < height; i++) {
+            GLuint ztemp[MAX_WIDTH], j;
+            GLfloat zfloat[MAX_WIDTH];
+            const double scale = 1.0 / 0xffffffff;
+            pipe_get_tile_raw(pipe, strb->surface, x, y, width, 1, ztemp, 0);
+            y += yStep;
+            for (j = 0; j < width; j++) {
+               zfloat[j] = (float) (scale * ztemp[j]);
+            }
+            _mesa_pack_depth_span(ctx, width, dst, type,
+                                  zfloat, &clippedPacking);
+            dst += dstStride;
+         }
+      }
+      else {
+         /* RGBA format */
+         /* Do a row at a time to flip image data vertically */
+         for (i = 0; i < height; i++) {
+            pipe_get_tile_rgba(pipe, strb->surface, x, y, width, 1, df);
+            y += yStep;
+            df += dfStride;
+            if (!dfStride) {
+               _mesa_pack_rgba_span_float(ctx, width, temp, format, type, dst,
+                                          &clippedPacking, transferOps);
+               dst += dstStride;
+            }
          }
       }
    }
-- 
cgit v1.2.3


From 67a483909f8999de1e0c40229b94f7dabab7403b Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 12 Feb 2008 19:14:58 -0700
Subject: gallium: initial code for wide/AA line drawing

---
 src/mesa/pipe/draw/draw_wide_prims.c | 75 +++++++++++++++++++++++++++++++++---
 1 file changed, 69 insertions(+), 6 deletions(-)

diff --git a/src/mesa/pipe/draw/draw_wide_prims.c b/src/mesa/pipe/draw/draw_wide_prims.c
index 9759e7e2e8..4c7e279b20 100644
--- a/src/mesa/pipe/draw/draw_wide_prims.c
+++ b/src/mesa/pipe/draw/draw_wide_prims.c
@@ -76,7 +76,6 @@ static void passthrough_tri( struct draw_stage *stage,
 
 /**
  * Draw a wide line by drawing a quad (two triangles).
- * XXX still need line stipple.
  * XXX need to disable polygon stipple.
  */
 static void wide_line( struct draw_stage *stage,
@@ -103,12 +102,9 @@ static void wide_line( struct draw_stage *stage,
    /*
     * Draw wide line as a quad (two tris) by "stretching" the line along
     * X or Y.
-    * XXX For AA lines, the quad corners have to be computed in a
-    * more sophisticated way.
+    * We need to tweak coords in several ways to be conformant here.
     */
 
-   /* need to tweak coords in several ways to be conformant here */
-
    if (dx > dy) {
       /* x-major line */
       pos0[1] = pos0[1] - half_width - 0.25f;
@@ -165,6 +161,70 @@ static void wide_line( struct draw_stage *stage,
 }
 
 
+/**
+ * Draw a wide line as a quad (two triangles) whose corners are offset
+ * perpendicular to the line, to fulfill GL's antialiased line requirements.
+ */
+static void wide_line_aa(struct draw_stage *stage,
+                         struct prim_header *header)
+{
+   const struct wide_stage *wide = wide_stage(stage);
+   const float half_width = wide->half_line_width;
+   struct prim_header tri;
+   struct vertex_header *v[4];
+   float *pos;
+   float dx = header->v[1]->data[0][0] - header->v[0]->data[0][0];
+   float dy = header->v[1]->data[0][1] - header->v[0]->data[0][1];
+   const float len = sqrt(dx * dx + dy * dy);
+   uint i;
+   /* rescale (dx, dy) to length half_width; NOTE(review): len == 0 is not guarded */
+   dx = dx * half_width / len;
+   dy = dy * half_width / len;
+
+   /* dup each endpoint twice: v[0], v[1] copy header->v[0]; v[2], v[3] copy header->v[1] */
+   for (i = 0; i < 4; i++) {
+      v[i] = dup_vert(stage, header->v[i/2], i);
+   }
+
+   /*
+    * Quad for line from v0 to v1:
+    *
+    *  1                         3
+    *  +-------------------------+
+    *  |                         |
+    *  *v0                     v1*
+    *  |                         |
+    *  +-------------------------+
+    *  0                         2
+    */
+
+   pos = v[0]->data[0];
+   pos[0] += dy;
+   pos[1] -= dx;
+
+   pos = v[1]->data[0];
+   pos[0] -= dy;
+   pos[1] += dx;
+
+   pos = v[2]->data[0];
+   pos[0] += dy;
+   pos[1] -= dx;
+
+   pos = v[3]->data[0];
+   pos[0] -= dy;
+   pos[1] += dx;
+
+   tri.det = header->det;  /* only the sign matters */
+
+   tri.v[0] = v[2];  tri.v[1] = v[1];  tri.v[2] = v[0];
+   stage->next->tri( stage->next, &tri );
+
+   tri.v[0] = v[3];  tri.v[1] = v[1];  tri.v[2] = v[2];
+   stage->next->tri( stage->next, &tri );
+
+}
+
+
 /**
  * Set the vertex texcoords for sprite mode.
  * Coords may be left untouched or set to a right-side-up or upside-down
@@ -319,7 +379,10 @@ static void wide_first_line( struct draw_stage *stage,
    wide->half_line_width = 0.5f * draw->rasterizer->line_width;
 
    if (draw->rasterizer->line_width != 1.0) {
-      wide->stage.line = wide_line;
+      if (draw->rasterizer->line_smooth)
+         wide->stage.line = wide_line_aa;
+      else
+         wide->stage.line = wide_line;
    }
    else {
       wide->stage.line = passthrough_line;
-- 
cgit v1.2.3


From 7bd15fd271ecaad0446632dd4e2190930abac487 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Wed, 13 Feb 2008 14:27:32 +1100
Subject: nv40: remove use of pt->first_level

---
 src/mesa/pipe/nv40/nv40_fragtex.c | 3 +--
 src/mesa/pipe/nv40/nv40_miptree.c | 6 +++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/mesa/pipe/nv40/nv40_fragtex.c b/src/mesa/pipe/nv40/nv40_fragtex.c
index d278ce1897..69c0fcb5e0 100644
--- a/src/mesa/pipe/nv40/nv40_fragtex.c
+++ b/src/mesa/pipe/nv40/nv40_fragtex.c
@@ -70,8 +70,7 @@ nv40_fragtex_build(struct nv40_context *nv40, int unit)
 
 	txf  = ps->fmt;
 	txf |= tf->format | 0x8000;
-	txf |= ((pt->last_level - pt->first_level + 1) <<
-		NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT);
+	txf |= ((pt->last_level + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT);
 
 	if (1) /* XXX */
 		txf |= NV40TCL_TEX_FORMAT_NO_BORDER;
diff --git a/src/mesa/pipe/nv40/nv40_miptree.c b/src/mesa/pipe/nv40/nv40_miptree.c
index df70feaa05..1b3c27dc45 100644
--- a/src/mesa/pipe/nv40/nv40_miptree.c
+++ b/src/mesa/pipe/nv40/nv40_miptree.c
@@ -23,7 +23,7 @@ nv40_miptree_layout(struct nv40_miptree *nv40mt)
 		nr_faces = 1;
 	}
 	
-	for (l = pt->first_level; l <= pt->last_level; l++) {
+	for (l = 0; l <= pt->last_level; l++) {
 		pt->width[l] = width;
 		pt->height[l] = height;
 		pt->depth[l] = depth;
@@ -44,7 +44,7 @@ nv40_miptree_layout(struct nv40_miptree *nv40mt)
 	}
 
 	for (f = 0; f < nr_faces; f++) {
-		for (l = pt->first_level; l <= pt->last_level; l++) {
+		for (l = 0; l <= pt->last_level; l++) {
 			nv40mt->level[l].image_offset[f] = offset;
 			offset += nv40mt->level[l].pitch * pt->height[l];
 		}
@@ -87,7 +87,7 @@ nv40_miptree_release(struct pipe_context *pipe, struct pipe_texture **pt)
 		int l;
 
 		pipe_buffer_reference(ws, &nv40mt->buffer, NULL);
-		for (l = mt->first_level; l <= mt->last_level; l++) {
+		for (l = 0; l <= mt->last_level; l++) {
 			if (nv40mt->level[l].image_offset)
 				free(nv40mt->level[l].image_offset);
 		}
-- 
cgit v1.2.3


From f33fa253c66241724fe4ae6943b091e0bc0409d6 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Wed, 13 Feb 2008 19:33:41 +1100
Subject: nv40: attempt at obeying sampler min_lod/max_lod/lod_bias

---
 src/mesa/pipe/nv40/nv40_context.c |  2 +-
 src/mesa/pipe/nv40/nv40_fragtex.c |  5 ++---
 src/mesa/pipe/nv40/nv40_state.c   | 14 ++++++++++++++
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/mesa/pipe/nv40/nv40_context.c b/src/mesa/pipe/nv40/nv40_context.c
index 302ad04c15..a8a2eaf215 100644
--- a/src/mesa/pipe/nv40/nv40_context.c
+++ b/src/mesa/pipe/nv40/nv40_context.c
@@ -74,7 +74,7 @@ nv40_get_paramf(struct pipe_context *pipe, int param)
 	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
 		return 16.0;
 	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
-		return 4.0;
+		return 16.0;
 	case PIPE_CAP_BITMAP_TEXCOORD_BIAS:
 		return 0.0;
 	default:
diff --git a/src/mesa/pipe/nv40/nv40_fragtex.c b/src/mesa/pipe/nv40/nv40_fragtex.c
index 69c0fcb5e0..5af5fbe746 100644
--- a/src/mesa/pipe/nv40/nv40_fragtex.c
+++ b/src/mesa/pipe/nv40/nv40_fragtex.c
@@ -108,10 +108,9 @@ nv40_fragtex_build(struct nv40_context *nv40, int unit)
 	so_reloc (so, nv40mt->buffer, txf, tex_flags | NOUVEAU_BO_OR,
 		  NV40TCL_TEX_FORMAT_DMA0, NV40TCL_TEX_FORMAT_DMA1);
 	so_data  (so, ps->wrap);
-	so_data  (so, NV40TCL_TEX_ENABLE_ENABLE | ps->en |
-		   (0x00078000) /* mipmap related? */);
+	so_data  (so, NV40TCL_TEX_ENABLE_ENABLE | ps->en);
 	so_data  (so, txs);
-	so_data  (so, ps->filt | 0x3fd6 /*voodoo*/);
+	so_data  (so, ps->filt | 0x2000 /*voodoo*/);
 	so_data  (so, (pt->width[0] << NV40TCL_TEX_SIZE0_W_SHIFT) |
 		       pt->height[0]);
 	so_data  (so, ps->bcol);
diff --git a/src/mesa/pipe/nv40/nv40_state.c b/src/mesa/pipe/nv40/nv40_state.c
index 80e94737ef..bcd244528d 100644
--- a/src/mesa/pipe/nv40/nv40_state.c
+++ b/src/mesa/pipe/nv40/nv40_state.c
@@ -193,6 +193,20 @@ nv40_sampler_state_create(struct pipe_context *pipe,
 
 	ps->filt = filter;
 
+	{
+		float limit;
+
+		limit = CLAMP(cso->lod_bias, -16.0, 15.0);
+		ps->filt |= (int)(cso->lod_bias * 256.0) & 0x1fff;
+
+		limit = CLAMP(cso->max_lod, 0.0, 15.0);
+		ps->en |= (int)(limit * 256.0) << 7;
+
+		limit = CLAMP(cso->min_lod, 0.0, 15.0);
+		ps->en |= (int)(limit * 256.0) << 19;
+	}
+
+
 	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
 		switch (cso->compare_func) {
 		case PIPE_FUNC_NEVER:
-- 
cgit v1.2.3


From 4c239ec2d76a7cb1c4ce0a782a30639bb061705d Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Fri, 15 Feb 2008 00:14:15 +1100
Subject: nv40: always copy 4 float immediates for now.

---
 src/mesa/pipe/nv40/nv40_fragprog.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/mesa/pipe/nv40/nv40_fragprog.c b/src/mesa/pipe/nv40/nv40_fragprog.c
index cb7b4a5e70..cc637f5dae 100644
--- a/src/mesa/pipe/nv40/nv40_fragprog.c
+++ b/src/mesa/pipe/nv40/nv40_fragprog.c
@@ -710,14 +710,15 @@ nv40_fragprog_translate(struct nv40_context *nv40,
 		{
 			struct tgsi_full_immediate *imm;
 			float vals[4];
-			int i;
 			
 			imm = &parse.FullToken.FullImmediate;
 			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
 			assert(fpc->nr_imm < MAX_IMM);
 
-			for (i = 0; i < (imm->Immediate.Size - 1); i++)
-				vals[i] = imm->u.ImmediateFloat32[i].Float;
+			vals[0] = imm->u.ImmediateFloat32[0].Float;
+			vals[1] = imm->u.ImmediateFloat32[1].Float;
+			vals[2] = imm->u.ImmediateFloat32[2].Float;
+			vals[3] = imm->u.ImmediateFloat32[3].Float;
 			fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals);
 		}
 			break;
-- 
cgit v1.2.3


From 8d13f55d2c8483148f0a8786a50daf6e890690a3 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Fri, 15 Feb 2008 13:54:07 +1100
Subject: nv30: fix build

---
 src/mesa/pipe/nv30/nv30_fragtex.c | 2 +-
 src/mesa/pipe/nv30/nv30_miptree.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/pipe/nv30/nv30_fragtex.c b/src/mesa/pipe/nv30/nv30_fragtex.c
index e75b1f7f28..575f968cc5 100644
--- a/src/mesa/pipe/nv30/nv30_fragtex.c
+++ b/src/mesa/pipe/nv30/nv30_fragtex.c
@@ -95,7 +95,7 @@ nv30_fragtex_build(struct nv30_context *nv30, int unit)
 	}
 
 	txf  = tf->format << 8;
-	txf |= (pt->last_level - pt->first_level + 1) << 16;
+	txf |= (pt->last_level + 1) << 16;
 	txf |= log2i(pt->width[0]) << 20;
 	txf |= log2i(pt->height[0]) << 24;
 	txf |= log2i(pt->depth[0]) << 28;
diff --git a/src/mesa/pipe/nv30/nv30_miptree.c b/src/mesa/pipe/nv30/nv30_miptree.c
index 75e9b993c1..475f1be8ad 100644
--- a/src/mesa/pipe/nv30/nv30_miptree.c
+++ b/src/mesa/pipe/nv30/nv30_miptree.c
@@ -23,7 +23,7 @@ nv30_miptree_layout(struct nv30_miptree *nv30mt)
 		nr_faces = 1;
 	}
 	
-	for (l = pt->first_level; l <= pt->last_level; l++) {
+	for (l = 0; l <= pt->last_level; l++) {
 		pt->width[l] = width;
 		pt->height[l] = height;
 		pt->depth[l] = depth;
@@ -44,7 +44,7 @@ nv30_miptree_layout(struct nv30_miptree *nv30mt)
 	}
 
 	for (f = 0; f < nr_faces; f++) {
-		for (l = pt->first_level; l <= pt->last_level; l++) {
+		for (l = 0; l <= pt->last_level; l++) {
 			nv30mt->level[l].image_offset[f] = offset;
 			offset += nv30mt->level[l].pitch * pt->height[l];
 		}
@@ -88,7 +88,7 @@ nv30_miptree_release(struct pipe_context *pipe, struct pipe_texture **pt)
 		int l;
 
 		pipe_buffer_reference(ws, &nv30mt->buffer, NULL);
-		for (l = mt->first_level; l <= mt->last_level; l++) {
+		for (l = 0; l <= mt->last_level; l++) {
 			if (nv30mt->level[l].image_offset)
 				free(nv30mt->level[l].image_offset);
 		}
-- 
cgit v1.2.3


From 86bba420231766a908c6b204b0df036f6a5bf08d Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Fri, 15 Feb 2008 13:55:55 +1100
Subject: nv30: more interface updates

---
 src/mesa/pipe/nv30/nv30_dma.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/mesa/pipe/nv30/nv30_dma.h b/src/mesa/pipe/nv30/nv30_dma.h
index 6eff6b4290..f8bc6eca76 100644
--- a/src/mesa/pipe/nv30/nv30_dma.h
+++ b/src/mesa/pipe/nv30/nv30_dma.h
@@ -35,10 +35,9 @@
 
 #define OUT_RELOC(bo,data,flags,vor,tor) do {                                  \
 	nv30->nvws->push_reloc(nv30->nvws->channel,                            \
-			       nv30->nvws->channel->pushbuf->cur,              \
+			       nv30->nvws->channel->pushbuf->cur++,            \
 			       (struct nouveau_bo *)(bo),                      \
 			       (data), (flags), (vor), (tor));                 \
-	OUT_RING(0);                                                           \
 } while(0)
 
 /* Raw data + flags depending on FB/TT buffer */
-- 
cgit v1.2.3


From 01fccea190cf07b41a675e95d82af8ab5228b176 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Fri, 15 Feb 2008 03:48:32 +0100
Subject: nouveau: nv30 fixes.

---
 src/mesa/pipe/nv30/nv30_context.c  | 2 +-
 src/mesa/pipe/nv30/nv30_state.c    | 5 ++++-
 src/mesa/pipe/nv30/nv30_vertprog.c | 8 +++-----
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/mesa/pipe/nv30/nv30_context.c b/src/mesa/pipe/nv30/nv30_context.c
index c56f918ad9..eef49fbcc2 100644
--- a/src/mesa/pipe/nv30/nv30_context.c
+++ b/src/mesa/pipe/nv30/nv30_context.c
@@ -255,7 +255,7 @@ nv30_init_hwctx(struct nv30_context *nv30, int rankine_class)
 	int w=4096;
 	int h=4096;
 	int pitch=4096*4;
-	BEGIN_RING(rankine, NV34TCL_VIEWPORT_HORIZ, 5);
+	BEGIN_RING(rankine, NV34TCL_RT_HORIZ, 5);
 	OUT_RING  (w<<16);
 	OUT_RING  (h<<16);
 	OUT_RING  (0x148); /* format */
diff --git a/src/mesa/pipe/nv30/nv30_state.c b/src/mesa/pipe/nv30/nv30_state.c
index c29a644809..abc22eacae 100644
--- a/src/mesa/pipe/nv30/nv30_state.c
+++ b/src/mesa/pipe/nv30/nv30_state.c
@@ -604,7 +604,7 @@ nv30_set_framebuffer_state(struct pipe_context *pipe,
 
 	if (rt_enable & NV34TCL_RT_ENABLE_COLOR0) {
 		BEGIN_RING(rankine, NV34TCL_COLOR0_PITCH, 1);
-		OUT_RING  (rt[0]->pitch * rt[0]->cpp);
+		OUT_RING  ( (rt[0]->pitch * rt[0]->cpp) | ( (zeta->pitch * zeta->cpp) << 16) );
 		nv30->rt[0] = rt[0]->buffer;
 	}
 
@@ -615,6 +615,9 @@ nv30_set_framebuffer_state(struct pipe_context *pipe,
 	}
 
 	if (zeta_format) {
+		/* XXX allocate LMA */
+/*		BEGIN_RING(rankine, NV34TCL_LMA_DEPTH_OFFSET, 1);
+		OUT_RING(0);*/
 		BEGIN_RING(rankine, NV34TCL_ZETA_PITCH, 1);
 		OUT_RING  (zeta->pitch * zeta->cpp);
 		nv30->zeta = zeta->buffer;
diff --git a/src/mesa/pipe/nv30/nv30_vertprog.c b/src/mesa/pipe/nv30/nv30_vertprog.c
index b712049fa7..41957b67c4 100644
--- a/src/mesa/pipe/nv30/nv30_vertprog.c
+++ b/src/mesa/pipe/nv30/nv30_vertprog.c
@@ -751,7 +751,7 @@ nv30_vertprog_bind(struct nv30_context *nv30, struct nv30_vertex_program *vp)
 		}
 #endif
 		BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1);
-		OUT_RING  (vp->exec->start);
+		OUT_RING  (/*vp->exec->start*/0);
 		for (i = 0; i < vp->nr_insns; i++) {
 			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_INST(0), 4);
 			OUT_RINGp (vp->insns[i].data, 4);
@@ -759,10 +759,8 @@ nv30_vertprog_bind(struct nv30_context *nv30, struct nv30_vertex_program *vp)
 	}
 
 	BEGIN_RING(rankine, NV34TCL_VP_START_FROM_ID, 1);
-	OUT_RING  (vp->exec->start);
-	BEGIN_RING(rankine, NV34TCL_VP_ATTRIB_EN, 2);
-	OUT_RING  (vp->ir);
-	OUT_RING  (vp->or);
+//	OUT_RING  (vp->exec->start);
+	OUT_RING  (0);
 
 	nv30->vertprog.active = vp;
 }
-- 
cgit v1.2.3


From 0a653bef05fb3627fdd1857bfa8c3a1ebe08a4b7 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Fri, 15 Feb 2008 04:23:46 +0100
Subject: nouveau: more nv30 fixes, still doesn't work as well as before.

---
 configs/linux-dri                 | 2 +-
 src/mesa/pipe/nv30/nv30_context.c | 2 ++
 src/mesa/pipe/nv30/nv30_state.c   | 8 ++------
 src/mesa/pipe/nv30/nv30_vbo.c     | 8 ++++----
 4 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/configs/linux-dri b/configs/linux-dri
index 494b0aab8e..c7eb7112c0 100644
--- a/configs/linux-dri
+++ b/configs/linux-dri
@@ -66,4 +66,4 @@ WINDOW_SYSTEM=dri
 
 # gamma are missing because they have not been converted to use the new
 # interface.
-DRI_DIRS = intel_winsys nouveau_winsys
+DRI_DIRS = nouveau_winsys
diff --git a/src/mesa/pipe/nv30/nv30_context.c b/src/mesa/pipe/nv30/nv30_context.c
index eef49fbcc2..d12aab85d8 100644
--- a/src/mesa/pipe/nv30/nv30_context.c
+++ b/src/mesa/pipe/nv30/nv30_context.c
@@ -71,6 +71,8 @@ nv30_get_paramf(struct pipe_context *pipe, int param)
 		return 16.0;
 	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
 		return 4.0;
+	case PIPE_CAP_BITMAP_TEXCOORD_BIAS:
+		return 0.0;
 	default:
 		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
 		return 0.0;
diff --git a/src/mesa/pipe/nv30/nv30_state.c b/src/mesa/pipe/nv30/nv30_state.c
index abc22eacae..992afe033e 100644
--- a/src/mesa/pipe/nv30/nv30_state.c
+++ b/src/mesa/pipe/nv30/nv30_state.c
@@ -614,12 +614,8 @@ nv30_set_framebuffer_state(struct pipe_context *pipe,
 		nv30->rt[1] = rt[1]->buffer;
 	}
 
-	if (zeta_format) {
-		/* XXX allocate LMA */
-/*		BEGIN_RING(rankine, NV34TCL_LMA_DEPTH_OFFSET, 1);
-		OUT_RING(0);*/
-		BEGIN_RING(rankine, NV34TCL_ZETA_PITCH, 1);
-		OUT_RING  (zeta->pitch * zeta->cpp);
+	if (zeta_format)
+	{
 		nv30->zeta = zeta->buffer;
 	}
 
diff --git a/src/mesa/pipe/nv30/nv30_vbo.c b/src/mesa/pipe/nv30/nv30_vbo.c
index e6c50d3820..173a6e8fd7 100644
--- a/src/mesa/pipe/nv30/nv30_vbo.c
+++ b/src/mesa/pipe/nv30/nv30_vbo.c
@@ -241,9 +241,9 @@ nv30_draw_elements_u08(struct nv30_context *nv30, void *ib,
 	}
 
 	while (count) {
-		push = MIN2(count, 2046);
+		push = MIN2(count, 2047 * 2);
 
-		BEGIN_RING_NI(rankine, NV40TCL_VB_ELEMENT_U16, push);
+		BEGIN_RING_NI(rankine, NV40TCL_VB_ELEMENT_U16, push >> 1);
 		for (i = 0; i < push; i+=2)
 			OUT_RING((elts[i+1] << 16) | elts[i]);
 
@@ -266,9 +266,9 @@ nv30_draw_elements_u16(struct nv30_context *nv30, void *ib,
 	}
 
 	while (count) {
-		push = MIN2(count, 2046);
+		push = MIN2(count, 2047 * 2);
 
-		BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push);
+		BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
 		for (i = 0; i < push; i+=2)
 			OUT_RING((elts[i+1] << 16) | elts[i]);
 
-- 
cgit v1.2.3


From e82eabaf5e6cb91984476a991ec24e8105989dc4 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Sat, 16 Feb 2008 02:30:56 +1100
Subject: nv40: fix non-debug builds + start on obeying portability guidelines.

---
 src/mesa/pipe/nouveau/nouveau_stateobj.h | 22 ++++++++++--------
 src/mesa/pipe/nv40/nv40_context.c        |  4 ++--
 src/mesa/pipe/nv40/nv40_context.h        |  2 +-
 src/mesa/pipe/nv40/nv40_fragprog.c       |  2 +-
 src/mesa/pipe/nv40/nv40_miptree.c        |  4 ++--
 src/mesa/pipe/nv40/nv40_query.c          |  4 ++--
 src/mesa/pipe/nv40/nv40_state.c          |  6 ++---
 src/mesa/pipe/nv40/nv40_vbo.c            | 39 ++++++++++++++++++++++++--------
 src/mesa/pipe/nv40/nv40_vertprog.c       |  4 ++--
 9 files changed, 54 insertions(+), 33 deletions(-)

diff --git a/src/mesa/pipe/nouveau/nouveau_stateobj.h b/src/mesa/pipe/nouveau/nouveau_stateobj.h
index 58167a24de..07c31b014a 100644
--- a/src/mesa/pipe/nouveau/nouveau_stateobj.h
+++ b/src/mesa/pipe/nouveau/nouveau_stateobj.h
@@ -1,6 +1,8 @@
 #ifndef __NOUVEAU_STATEOBJ_H__
 #define __NOUVEAU_STATEOBJ_H__
 
+#include "pipe/p_util.h"
+
 struct nouveau_stateobj_reloc {
 	struct pipe_buffer *bo;
 
@@ -24,15 +26,15 @@ struct nouveau_stateobj {
 	unsigned cur_reloc;
 };
 
-static inline struct nouveau_stateobj *
+static INLINE struct nouveau_stateobj *
 so_new(unsigned push, unsigned reloc)
 {
 	struct nouveau_stateobj *so;
 
-	so = malloc(sizeof(struct nouveau_stateobj));
+	so = MALLOC(sizeof(struct nouveau_stateobj));
 	so->refcount = 1;
-	so->push = malloc(sizeof(unsigned) * push);
-	so->reloc = malloc(sizeof(struct nouveau_stateobj_reloc) * reloc);
+	so->push = MALLOC(sizeof(unsigned) * push);
+	so->reloc = MALLOC(sizeof(struct nouveau_stateobj_reloc) * reloc);
 
 	so->cur = so->push;
 	so->cur_reloc = so->cur_packet = 0;
@@ -40,7 +42,7 @@ so_new(unsigned push, unsigned reloc)
 	return so;
 }
 
-static inline void
+static INLINE void
 so_ref(struct nouveau_stateobj *ref, struct nouveau_stateobj **pso)
 {
 	struct nouveau_stateobj *so;
@@ -61,14 +63,14 @@ so_ref(struct nouveau_stateobj *ref, struct nouveau_stateobj **pso)
 	}
 }
 
-static inline void
+static INLINE void
 so_data(struct nouveau_stateobj *so, unsigned data)
 {
 	(*so->cur++) = (data);
 	so->cur_packet += 4;
 }
 
-static inline void
+static INLINE void
 so_method(struct nouveau_stateobj *so, struct nouveau_grobj *gr,
 	  unsigned mthd, unsigned size)
 {
@@ -76,7 +78,7 @@ so_method(struct nouveau_stateobj *so, struct nouveau_grobj *gr,
 	so_data(so, (gr->subc << 13) | (size << 18) | mthd);
 }
 
-static inline void
+static INLINE void
 so_reloc(struct nouveau_stateobj *so, struct pipe_buffer *bo,
 	 unsigned data, unsigned flags, unsigned vor, unsigned tor)
 {
@@ -92,7 +94,7 @@ so_reloc(struct nouveau_stateobj *so, struct pipe_buffer *bo,
 	so_data(so, data);
 }
 
-static inline void
+static INLINE void
 so_emit(struct nouveau_winsys *nvws, struct nouveau_stateobj *so)
 {
 	struct nouveau_pushbuf *pb = nvws->channel->pushbuf;
@@ -113,7 +115,7 @@ so_emit(struct nouveau_winsys *nvws, struct nouveau_stateobj *so)
 	pb->cur += nr;
 }
 
-static inline void
+static INLINE void
 so_emit_reloc_markers(struct nouveau_winsys *nvws, struct nouveau_stateobj *so)
 {
 	struct nouveau_pushbuf *pb = nvws->channel->pushbuf;
diff --git a/src/mesa/pipe/nv40/nv40_context.c b/src/mesa/pipe/nv40/nv40_context.c
index a8a2eaf215..6e86ca0081 100644
--- a/src/mesa/pipe/nv40/nv40_context.c
+++ b/src/mesa/pipe/nv40/nv40_context.c
@@ -154,7 +154,7 @@ nv40_channel_init(struct pipe_winsys *ws, struct nouveau_winsys *nvws,
 		return NULL;
 	}
 
-	cnv40 = calloc(1, sizeof(struct nv40_channel_context));
+	cnv40 = CALLOC(1, sizeof(struct nv40_channel_context));
 	if (!cnv40)
 		return NULL;
 	cnv40->chipset = chipset;
@@ -274,7 +274,7 @@ nv40_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws,
 {
 	struct nv40_context *nv40;
 
-	nv40 = calloc(1, sizeof(struct nv40_context));
+	nv40 = CALLOC(1, sizeof(struct nv40_context));
 	if (!nv40)
 		return NULL;
 
diff --git a/src/mesa/pipe/nv40/nv40_context.h b/src/mesa/pipe/nv40/nv40_context.h
index d7c9ee7851..cf2a14405a 100644
--- a/src/mesa/pipe/nv40/nv40_context.h
+++ b/src/mesa/pipe/nv40/nv40_context.h
@@ -100,7 +100,7 @@ struct nv40_context {
 	struct pipe_vertex_element vtxelt[PIPE_ATTRIB_MAX];
 };
 
-static inline struct nv40_context *
+static INLINE struct nv40_context *
 nv40_context(struct pipe_context *pipe)
 {
 	return (struct nv40_context *)pipe;
diff --git a/src/mesa/pipe/nv40/nv40_fragprog.c b/src/mesa/pipe/nv40/nv40_fragprog.c
index cc637f5dae..7487fb896f 100644
--- a/src/mesa/pipe/nv40/nv40_fragprog.c
+++ b/src/mesa/pipe/nv40/nv40_fragprog.c
@@ -675,7 +675,7 @@ nv40_fragprog_translate(struct nv40_context *nv40,
 	struct tgsi_parse_context parse;
 	struct nv40_fpc *fpc = NULL;
 
-	fpc = calloc(1, sizeof(struct nv40_fpc));
+	fpc = CALLOC(1, sizeof(struct nv40_fpc));
 	if (!fpc)
 		return;
 	fpc->fp = fp;
diff --git a/src/mesa/pipe/nv40/nv40_miptree.c b/src/mesa/pipe/nv40/nv40_miptree.c
index 1b3c27dc45..92e6b3a43d 100644
--- a/src/mesa/pipe/nv40/nv40_miptree.c
+++ b/src/mesa/pipe/nv40/nv40_miptree.c
@@ -35,7 +35,7 @@ nv40_miptree_layout(struct nv40_miptree *nv40mt)
 		nv40mt->level[l].pitch = (nv40mt->level[l].pitch + 63) & ~63;
 
 		nv40mt->level[l].image_offset =
-			calloc(nr_faces, sizeof(unsigned));
+			CALLOC(nr_faces, sizeof(unsigned));
 
 		width  = MAX2(1, width  >> 1);
 		height = MAX2(1, height >> 1);
@@ -59,7 +59,7 @@ nv40_miptree_create(struct pipe_context *pipe, const struct pipe_texture *pt)
 	struct pipe_winsys *ws = pipe->winsys;
 	struct nv40_miptree *mt;
 
-	mt = malloc(sizeof(struct nv40_miptree));
+	mt = MALLOC(sizeof(struct nv40_miptree));
 	if (!mt)
 		return NULL;
 	mt->base = *pt;
diff --git a/src/mesa/pipe/nv40/nv40_query.c b/src/mesa/pipe/nv40/nv40_query.c
index eb305e6444..8bca2788b9 100644
--- a/src/mesa/pipe/nv40/nv40_query.c
+++ b/src/mesa/pipe/nv40/nv40_query.c
@@ -9,7 +9,7 @@ struct nv40_query {
 	uint64_t result;
 };
 
-static inline struct nv40_query *
+static INLINE struct nv40_query *
 nv40_query(struct pipe_query *pipe)
 {
 	return (struct nv40_query *)pipe;
@@ -20,7 +20,7 @@ nv40_query_create(struct pipe_context *pipe, unsigned query_type)
 {
 	struct nv40_query *q;
 
-	q = calloc(1, sizeof(struct nv40_query));
+	q = CALLOC(1, sizeof(struct nv40_query));
 	q->type = query_type;
 
 	return (struct pipe_query *)q;
diff --git a/src/mesa/pipe/nv40/nv40_state.c b/src/mesa/pipe/nv40/nv40_state.c
index bcd244528d..713f31dbb1 100644
--- a/src/mesa/pipe/nv40/nv40_state.c
+++ b/src/mesa/pipe/nv40/nv40_state.c
@@ -112,7 +112,7 @@ nv40_sampler_state_create(struct pipe_context *pipe,
 	struct nv40_sampler_state *ps;
 	uint32_t filter = 0;
 
-	ps = malloc(sizeof(struct nv40_sampler_state));
+	ps = MALLOC(sizeof(struct nv40_sampler_state));
 
 	ps->fmt = 0;
 	if (!cso->normalized_coords)
@@ -455,7 +455,7 @@ nv40_vp_state_create(struct pipe_context *pipe,
 {
 	struct nv40_vertex_program *vp;
 
-	vp = calloc(1, sizeof(struct nv40_vertex_program));
+	vp = CALLOC(1, sizeof(struct nv40_vertex_program));
 	vp->pipe = cso;
 
 	return (void *)vp;
@@ -487,7 +487,7 @@ nv40_fp_state_create(struct pipe_context *pipe,
 {
 	struct nv40_fragment_program *fp;
 
-	fp = calloc(1, sizeof(struct nv40_fragment_program));
+	fp = CALLOC(1, sizeof(struct nv40_fragment_program));
 	fp->pipe = cso;
 
 	return (void *)fp;
diff --git a/src/mesa/pipe/nv40/nv40_vbo.c b/src/mesa/pipe/nv40/nv40_vbo.c
index 4e9cdb4585..a18d0f9b25 100644
--- a/src/mesa/pipe/nv40/nv40_vbo.c
+++ b/src/mesa/pipe/nv40/nv40_vbo.c
@@ -30,7 +30,8 @@ nv40_vbo_type(uint format)
 	case PIPE_FORMAT_TYPE_UNORM:
 		return NV40TCL_VTXFMT_TYPE_UBYTE;
 	default:
-		assert(0);
+		NOUVEAU_ERR("Unknown format 0x%08x\n", format);
+		return NV40TCL_VTXFMT_TYPE_FLOAT;
 	}
 }
 
@@ -188,8 +189,13 @@ nv40_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 	unsigned nr;
+	boolean ret;
 
-	assert(nv40_vbo_validate_state(nv40, NULL, 0));
+	ret = nv40_vbo_validate_state(nv40, NULL, 0);
+	if (!ret) {
+		NOUVEAU_ERR("state validate failed\n");
+		return FALSE;
+	}
 
 	BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
 	OUT_RING  (nvgl_primitive(mode));
@@ -290,19 +296,26 @@ nv40_draw_elements_u32(struct nv40_context *nv40, void *ib,
 }
 
 static boolean
-nv40_draw_elements_inline(struct pipe_context *pipe,
+nv40_draw_elements_INLINE(struct pipe_context *pipe,
 			  struct pipe_buffer *ib, unsigned ib_size,
 			  unsigned mode, unsigned start, unsigned count)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct pipe_winsys *ws = pipe->winsys;
+	boolean ret;
 	void *map;
 
-	assert(nv40_vbo_validate_state(nv40, NULL, 0));
+	ret = nv40_vbo_validate_state(nv40, NULL, 0);
+	if (!ret) {
+		NOUVEAU_ERR("state validate failed\n");
+		return FALSE;
+	}
 
 	map = ws->buffer_map(ws, ib, PIPE_BUFFER_USAGE_CPU_READ);
-	if (!ib)
-		assert(0);
+	if (!ib) {
+		NOUVEAU_ERR("failed mapping ib\n");
+		return FALSE;
+	}
 
 	BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
 	OUT_RING  (nvgl_primitive(mode));
@@ -318,7 +331,7 @@ nv40_draw_elements_inline(struct pipe_context *pipe,
 		nv40_draw_elements_u32(nv40, map, start, count);
 		break;
 	default:
-		assert(0);
+		NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size);
 		break;
 	}
 
@@ -337,6 +350,7 @@ nv40_draw_elements_vbo(struct pipe_context *pipe,
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 	unsigned nr, type;
+	boolean ret;
 
 	switch (ib_size) {
 	case 2:
@@ -346,10 +360,15 @@ nv40_draw_elements_vbo(struct pipe_context *pipe,
 		type = NV40TCL_IDXBUF_FORMAT_TYPE_U32;
 		break;
 	default:
-		assert(0);
+		NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size);
+		return FALSE;
 	}
 
-	assert(nv40_vbo_validate_state(nv40, ib, type));
+	ret = nv40_vbo_validate_state(nv40, ib, type);
+	if (!ret) {
+		NOUVEAU_ERR("failed state validation\n");
+		return FALSE;
+	}
 
 	BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
 	OUT_RING  (nvgl_primitive(mode));
@@ -391,7 +410,7 @@ nv40_draw_elements(struct pipe_context *pipe,
 	 * to be support on any chipset for 8-bit indices.
 	 */
 	if (nv40->hw->curie->grclass == NV44TCL || indexSize == 1) {
-		nv40_draw_elements_inline(pipe, indexBuffer, indexSize,
+		nv40_draw_elements_INLINE(pipe, indexBuffer, indexSize,
 					  mode, start, count);
 	} else {
 		nv40_draw_elements_vbo(pipe, indexBuffer, indexSize,
diff --git a/src/mesa/pipe/nv40/nv40_vertprog.c b/src/mesa/pipe/nv40/nv40_vertprog.c
index 415b3c70c7..d57e3ca350 100644
--- a/src/mesa/pipe/nv40/nv40_vertprog.c
+++ b/src/mesa/pipe/nv40/nv40_vertprog.c
@@ -551,7 +551,7 @@ nv40_vertprog_prepare(struct nv40_vpc *vpc)
 	tgsi_parse_free(&p);
 
 	if (nr_imm) {
-		vpc->imm = calloc(nr_imm, sizeof(struct nv40_sreg));
+		vpc->imm = CALLOC(nr_imm, sizeof(struct nv40_sreg));
 		assert(vpc->imm);
 	}
 
@@ -565,7 +565,7 @@ nv40_vertprog_translate(struct nv40_context *nv40,
 	struct tgsi_parse_context parse;
 	struct nv40_vpc *vpc = NULL;
 
-	vpc = calloc(1, sizeof(struct nv40_vpc));
+	vpc = CALLOC(1, sizeof(struct nv40_vpc));
 	if (!vpc)
 		return;
 	vpc->vp = vp;
-- 
cgit v1.2.3


From 6e3e5ba3bb25183efafcf78d6794fc50236c2835 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Sat, 16 Feb 2008 02:42:06 +1100
Subject: nv40: over-zealous s/

---
 src/mesa/pipe/nv40/nv40_vbo.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/pipe/nv40/nv40_vbo.c b/src/mesa/pipe/nv40/nv40_vbo.c
index a18d0f9b25..6b1ac65b49 100644
--- a/src/mesa/pipe/nv40/nv40_vbo.c
+++ b/src/mesa/pipe/nv40/nv40_vbo.c
@@ -296,7 +296,7 @@ nv40_draw_elements_u32(struct nv40_context *nv40, void *ib,
 }
 
 static boolean
-nv40_draw_elements_INLINE(struct pipe_context *pipe,
+nv40_draw_elements_inline(struct pipe_context *pipe,
 			  struct pipe_buffer *ib, unsigned ib_size,
 			  unsigned mode, unsigned start, unsigned count)
 {
@@ -410,7 +410,7 @@ nv40_draw_elements(struct pipe_context *pipe,
 	 * to be support on any chipset for 8-bit indices.
 	 */
 	if (nv40->hw->curie->grclass == NV44TCL || indexSize == 1) {
-		nv40_draw_elements_INLINE(pipe, indexBuffer, indexSize,
+		nv40_draw_elements_inline(pipe, indexBuffer, indexSize,
 					  mode, start, count);
 	} else {
 		nv40_draw_elements_vbo(pipe, indexBuffer, indexSize,
-- 
cgit v1.2.3


From 4032ff3889021089debce1c43a0bb984b121cbf6 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Sat, 16 Feb 2008 02:54:11 +0100
Subject: nv30: fixes.

---
 src/mesa/pipe/nv30/nv30_context.h  |  2 +-
 src/mesa/pipe/nv30/nv30_dma.h      | 65 --------------------------------------
 src/mesa/pipe/nv30/nv30_fragprog.c |  5 +--
 src/mesa/pipe/nv30/nv30_fragtex.c  |  2 +-
 src/mesa/pipe/nv30/nv30_miptree.c  |  2 +-
 src/mesa/pipe/nv30/nv30_query.c    |  3 +-
 src/mesa/pipe/nv30/nv30_state.c    |  8 ++---
 src/mesa/pipe/nv30/nv30_vbo.c      | 39 +++++++++++++++++------
 src/mesa/pipe/nv30/nv30_vertprog.c |  5 +--
 9 files changed, 44 insertions(+), 87 deletions(-)
 delete mode 100644 src/mesa/pipe/nv30/nv30_dma.h

diff --git a/src/mesa/pipe/nv30/nv30_context.h b/src/mesa/pipe/nv30/nv30_context.h
index d2262c5065..f6c6954599 100644
--- a/src/mesa/pipe/nv30/nv30_context.h
+++ b/src/mesa/pipe/nv30/nv30_context.h
@@ -83,7 +83,7 @@ struct nv30_context {
 	struct pipe_vertex_element vtxelt[PIPE_ATTRIB_MAX];
 };
 
-static inline struct nv30_context *
+static INLINE struct nv30_context *
 nv30_context(struct pipe_context *pipe)
 {
 	return (struct nv30_context *)pipe;
diff --git a/src/mesa/pipe/nv30/nv30_dma.h b/src/mesa/pipe/nv30/nv30_dma.h
deleted file mode 100644
index f8bc6eca76..0000000000
--- a/src/mesa/pipe/nv30/nv30_dma.h
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef __NV30_DMA_H__
-#define __NV30_DMA_H__
-
-#include "pipe/nouveau/nouveau_winsys.h"
-
-#define OUT_RING(data) do {                                                    \
-	(*nv30->nvws->channel->pushbuf->cur++) = (data);                       \
-} while(0)
-
-#define OUT_RINGp(src,size) do {                                               \
-	memcpy(nv30->nvws->channel->pushbuf->cur, (src), (size) * 4);          \
-	nv30->nvws->channel->pushbuf->cur += (size);                           \
-} while(0)
-
-#define OUT_RINGf(data) do {                                                   \
-	union { float v; uint32_t u; } c;                                      \
-	c.v = (data);                                                          \
-	OUT_RING(c.u);                                                         \
-} while(0)
-
-#define BEGIN_RING(obj,mthd,size) do {                                         \
-	if (nv30->nvws->channel->pushbuf->remaining < ((size) + 1))            \
-		nv30->nvws->push_flush(nv30->nvws->channel, ((size) + 1));     \
-	OUT_RING((nv30->obj->subc << 13) | ((size) << 18) | (mthd));           \
-	nv30->nvws->channel->pushbuf->remaining -= ((size) + 1);               \
-} while(0)
-
-#define BEGIN_RING_NI(obj,mthd,size) do {                                      \
-	BEGIN_RING(obj, (mthd) | 0x40000000, (size));                          \
-} while(0)
-
-#define FIRE_RING() do {                                                       \
-	nv30->nvws->push_flush(nv30->nvws->channel, 0);                        \
-} while(0)
-
-#define OUT_RELOC(bo,data,flags,vor,tor) do {                                  \
-	nv30->nvws->push_reloc(nv30->nvws->channel,                            \
-			       nv30->nvws->channel->pushbuf->cur++,            \
-			       (struct nouveau_bo *)(bo),                      \
-			       (data), (flags), (vor), (tor));                 \
-} while(0)
-
-/* Raw data + flags depending on FB/TT buffer */
-#define OUT_RELOCd(bo,data,flags,vor,tor) do {                                 \
-	OUT_RELOC((bo), (data), (flags) | NOUVEAU_BO_OR, (vor), (tor));        \
-} while(0)
-
-/* FB/TT object handle */
-#define OUT_RELOCo(bo,flags) do {                                              \
-	OUT_RELOC((bo), 0, (flags) | NOUVEAU_BO_OR,                            \
-		  nv30->nvws->channel->vram->handle,                           \
-		  nv30->nvws->channel->gart->handle);                          \
-} while(0)
-
-/* Low 32-bits of offset */
-#define OUT_RELOCl(bo,delta,flags) do {                                        \
-	OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_LOW, 0, 0);              \
-} while(0)
-
-/* High 32-bits of offset */
-#define OUT_RELOCh(bo,delta,flags) do {                                        \
-	OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_HIGH, 0, 0);             \
-} while(0)
-
-#endif
diff --git a/src/mesa/pipe/nv30/nv30_fragprog.c b/src/mesa/pipe/nv30/nv30_fragprog.c
index 0233873d92..0db1ac868c 100644
--- a/src/mesa/pipe/nv30/nv30_fragprog.c
+++ b/src/mesa/pipe/nv30/nv30_fragprog.c
@@ -1,6 +1,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
+#include "pipe/p_util.h"
 
 #include "pipe/p_shader_tokens.h"
 #include "pipe/tgsi/util/tgsi_parse.h"
@@ -675,7 +676,7 @@ nv30_fragprog_translate(struct nv30_context *nv30,
 	struct tgsi_parse_context parse;
 	struct nv30_fpc *fpc = NULL;
 
-	fpc = calloc(1, sizeof(struct nv30_fpc));
+	fpc = CALLOC(1, sizeof(struct nv30_fpc));
 	if (!fpc)
 		return;
 	fpc->fp = fp;
@@ -716,7 +717,7 @@ nv30_fragprog_translate(struct nv30_context *nv30,
 			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
 			assert(fpc->nr_imm < MAX_IMM);
 
-			for (i = 0; i < imm->Immediate.Size; i++)
+			for (i = 0; i < 4; i++)
 				vals[i] = imm->u.ImmediateFloat32[i].Float;
 			fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals);
 		}
diff --git a/src/mesa/pipe/nv30/nv30_fragtex.c b/src/mesa/pipe/nv30/nv30_fragtex.c
index 575f968cc5..45ee6db8d6 100644
--- a/src/mesa/pipe/nv30/nv30_fragtex.c
+++ b/src/mesa/pipe/nv30/nv30_fragtex.c
@@ -1,6 +1,6 @@
 #include "nv30_context.h"
 
-static inline int log2i(int i)
+static INLINE int log2i(int i)
 {
 	int r = 0;
 
diff --git a/src/mesa/pipe/nv30/nv30_miptree.c b/src/mesa/pipe/nv30/nv30_miptree.c
index 475f1be8ad..5fb89f4cfd 100644
--- a/src/mesa/pipe/nv30/nv30_miptree.c
+++ b/src/mesa/pipe/nv30/nv30_miptree.c
@@ -35,7 +35,7 @@ nv30_miptree_layout(struct nv30_miptree *nv30mt)
 		nv30mt->level[l].pitch = (nv30mt->level[l].pitch + 63) & ~63;
 
 		nv30mt->level[l].image_offset =
-			calloc(nr_faces, sizeof(unsigned));
+			CALLOC(nr_faces, sizeof(unsigned));
 
 		width  = MAX2(1, width  >> 1);
 		height = MAX2(1, height >> 1);
diff --git a/src/mesa/pipe/nv30/nv30_query.c b/src/mesa/pipe/nv30/nv30_query.c
index ea74c0f5f1..71fdcfa24d 100644
--- a/src/mesa/pipe/nv30/nv30_query.c
+++ b/src/mesa/pipe/nv30/nv30_query.c
@@ -1,4 +1,5 @@
 #include "pipe/p_context.h"
+#include "pipe/p_util.h"
 
 #include "nv30_context.h"
 
@@ -20,7 +21,7 @@ nv30_query_create(struct pipe_context *pipe, unsigned query_type)
 {
 	struct nv30_query *q;
 
-	q = calloc(1, sizeof(struct nv30_query));
+	q = CALLOC(1, sizeof(struct nv30_query));
 	q->type = query_type;
 
 	return (struct pipe_query *)q;
diff --git a/src/mesa/pipe/nv30/nv30_state.c b/src/mesa/pipe/nv30/nv30_state.c
index 992afe033e..53368561e0 100644
--- a/src/mesa/pipe/nv30/nv30_state.c
+++ b/src/mesa/pipe/nv30/nv30_state.c
@@ -420,9 +420,9 @@ nv30_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso)
 
 	BEGIN_RING(rankine, NV34TCL_DEPTH_FUNC, 3);
 	OUT_RINGp ((uint32_t *)&hw->depth, 3);
-	BEGIN_RING(rankine, NV34TCL_STENCIL_FRONT_ENABLE, 16);
-	OUT_RINGp ((uint32_t *)&hw->stencil.front, 8);
+	BEGIN_RING(rankine, NV34TCL_STENCIL_BACK_ENABLE, 16);
 	OUT_RINGp ((uint32_t *)&hw->stencil.back, 8);
+	OUT_RINGp ((uint32_t *)&hw->stencil.front, 8);
 	BEGIN_RING(rankine, NV34TCL_ALPHA_FUNC_ENABLE, 3);
 	OUT_RINGp ((uint32_t *)&hw->alpha.enabled, 3);
 }
@@ -439,7 +439,7 @@ nv30_vp_state_create(struct pipe_context *pipe,
 {
 	struct nv30_vertex_program *vp;
 
-	vp = calloc(1, sizeof(struct nv30_vertex_program));
+	vp = CALLOC(1, sizeof(struct nv30_vertex_program));
 	vp->pipe = cso;
 
 	return (void *)vp;
@@ -471,7 +471,7 @@ nv30_fp_state_create(struct pipe_context *pipe,
 {
 	struct nv30_fragment_program *fp;
 
-	fp = calloc(1, sizeof(struct nv30_fragment_program));
+	fp = CALLOC(1, sizeof(struct nv30_fragment_program));
 	fp->pipe = cso;
 
 	return (void *)fp;
diff --git a/src/mesa/pipe/nv30/nv30_vbo.c b/src/mesa/pipe/nv30/nv30_vbo.c
index 173a6e8fd7..414cf55ac8 100644
--- a/src/mesa/pipe/nv30/nv30_vbo.c
+++ b/src/mesa/pipe/nv30/nv30_vbo.c
@@ -30,7 +30,8 @@ nv30_vbo_type(uint format)
 	case PIPE_FORMAT_TYPE_UNORM:
 		return NV34TCL_VERTEX_ARRAY_FORMAT_TYPE_UBYTE;
 	default:
-		assert(0);
+		NOUVEAU_ERR("Unknown format 0x%08x\n", format);
+		return NV40TCL_VTXFMT_TYPE_FLOAT;
 	}
 }
 
@@ -194,8 +195,13 @@ nv30_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
 	unsigned nr;
+	boolean ret;
 
-	assert(nv30_vbo_validate_state(nv30, NULL, 0));
+	ret = nv30_vbo_validate_state(nv30, NULL, 0);
+	if (!ret) {
+		NOUVEAU_ERR("state validate failed\n");
+		return FALSE;
+	}
 
 	BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
 	OUT_RING  (nvgl_primitive(mode));
@@ -302,13 +308,20 @@ nv30_draw_elements_inline(struct pipe_context *pipe,
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
 	struct pipe_winsys *ws = pipe->winsys;
+	boolean ret;
 	void *map;
 
-	assert(nv30_vbo_validate_state(nv30, NULL, 0));
+	ret =  nv30_vbo_validate_state(nv30, NULL, 0);
+	if (!ret) {
+		NOUVEAU_ERR("state validate failed\n");
+		return FALSE;
+	}
 
 	map = ws->buffer_map(ws, ib, PIPE_BUFFER_USAGE_CPU_READ);
-	if (!ib)
-		assert(0);
+	if (!ib) {
+		NOUVEAU_ERR("failed mapping ib\n");
+		return FALSE;
+	}
 
 	BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
 	OUT_RING  (nvgl_primitive(mode));
@@ -324,7 +337,7 @@ nv30_draw_elements_inline(struct pipe_context *pipe,
 		nv30_draw_elements_u32(nv30, map, start, count);
 		break;
 	default:
-		assert(0);
+		NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size);
 		break;
 	}
 
@@ -343,6 +356,7 @@ nv30_draw_elements_vbo(struct pipe_context *pipe,
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
 	unsigned nr, type;
+	boolean ret;
 
 	switch (ib_size) {
 	case 2:
@@ -352,10 +366,15 @@ nv30_draw_elements_vbo(struct pipe_context *pipe,
 		type = NV40TCL_IDXBUF_FORMAT_TYPE_U32;
 		break;
 	default:
-		assert(0);
+		NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size);
+		return FALSE;
 	}
 
-	assert(nv30_vbo_validate_state(nv30, ib, type));
+	ret = nv30_vbo_validate_state(nv30, ib, type);
+	if (!ret) {
+		NOUVEAU_ERR("failed state validation\n");
+		return FALSE;
+	}
 
 	BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
 	OUT_RING  (nvgl_primitive(mode));
@@ -391,10 +410,10 @@ nv30_draw_elements(struct pipe_context *pipe,
 		   struct pipe_buffer *indexBuffer, unsigned indexSize,
 		   unsigned mode, unsigned start, unsigned count)
 {
-	if (indexSize != 1) {
+/*	if (indexSize != 1) {
 		nv30_draw_elements_vbo(pipe, indexBuffer, indexSize,
 				       mode, start, count);
-	} else {
+	} else */{
 		nv30_draw_elements_inline(pipe, indexBuffer, indexSize,
 					  mode, start, count);
 	}
diff --git a/src/mesa/pipe/nv30/nv30_vertprog.c b/src/mesa/pipe/nv30/nv30_vertprog.c
index 41957b67c4..c96210d3fa 100644
--- a/src/mesa/pipe/nv30/nv30_vertprog.c
+++ b/src/mesa/pipe/nv30/nv30_vertprog.c
@@ -1,6 +1,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
+#include "pipe/p_util.h"
 
 #include "pipe/p_shader_tokens.h"
 #include "pipe/tgsi/util/tgsi_parse.h"
@@ -539,7 +540,7 @@ nv30_vertprog_prepare(struct nv30_vpc *vpc)
 	tgsi_parse_free(&p);
 
 	if (nr_imm) {
-		vpc->imm = calloc(nr_imm, sizeof(struct nv30_sreg));
+		vpc->imm = CALLOC(nr_imm, sizeof(struct nv30_sreg));
 		assert(vpc->imm);
 	}
 
@@ -553,7 +554,7 @@ nv30_vertprog_translate(struct nv30_context *nv30,
 	struct tgsi_parse_context parse;
 	struct nv30_vpc *vpc = NULL;
 
-	vpc = calloc(1, sizeof(struct nv30_vpc));
+	vpc = CALLOC(1, sizeof(struct nv30_vpc));
 	if (!vpc)
 		return;
 	vpc->vp = vp;
-- 
cgit v1.2.3


From 4d9c19d2f7eef263b49485b6e65be9afbe58363a Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Sat, 16 Feb 2008 03:04:28 +0100
Subject: nv30: fixes.

---
 src/mesa/pipe/nv30/nv30_vbo.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/pipe/nv30/nv30_vbo.c b/src/mesa/pipe/nv30/nv30_vbo.c
index 414cf55ac8..57fb9bc8a5 100644
--- a/src/mesa/pipe/nv30/nv30_vbo.c
+++ b/src/mesa/pipe/nv30/nv30_vbo.c
@@ -241,7 +241,7 @@ nv30_draw_elements_u08(struct nv30_context *nv30, void *ib,
 	int push, i;
 
 	if (count & 1) {
-		BEGIN_RING(rankine, NV40TCL_VB_ELEMENT_U32, 1);
+		BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1);
 		OUT_RING  (elts[0]);
 		elts++; count--;
 	}
@@ -249,7 +249,7 @@ nv30_draw_elements_u08(struct nv30_context *nv30, void *ib,
 	while (count) {
 		push = MIN2(count, 2047 * 2);
 
-		BEGIN_RING_NI(rankine, NV40TCL_VB_ELEMENT_U16, push >> 1);
+		BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
 		for (i = 0; i < push; i+=2)
 			OUT_RING((elts[i+1] << 16) | elts[i]);
 
-- 
cgit v1.2.3