From 26add9288c88108e3485ffc57c51ea9bdc0ee719 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Sat, 16 Feb 2008 17:23:12 +1100
Subject: nouveau: match gallium code reorginisation.

That was... fun..
---
 src/gallium/drivers/nv40/nv40_shader.h | 554 +++++++++++++++++++++++++++++++++
 1 file changed, 554 insertions(+)
 create mode 100644 src/gallium/drivers/nv40/nv40_shader.h

(limited to 'src/gallium/drivers/nv40/nv40_shader.h')

diff --git a/src/gallium/drivers/nv40/nv40_shader.h b/src/gallium/drivers/nv40/nv40_shader.h
new file mode 100644
index 0000000000..5909c70713
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_shader.h
@@ -0,0 +1,554 @@
+#ifndef __NV40_SHADER_H__
+#define __NV40_SHADER_H__
+
+/* Vertex programs instruction set
+ *
+ * The NV40 instruction set is very similar to NV30.  Most fields are in
+ * a slightly different position in the instruction however.
+ *
+ * Merged instructions
+ *     In some cases it is possible to put two instructions into one opcode
+ *     slot.  The rules for when this is OK are not entirely clear to me yet.
+ *
+ *     There are separate writemasks and dest temp register fields for each
+ *     grouping of instructions.  There is however only one field with the
+ *     ID of a result register.  Writing to temp/result regs is selected by
+ *     setting VEC_RESULT/SCA_RESULT.
+ *
+ * Temporary registers
+ *     The source/dest temp register fields have been extended by 1 bit, to
+ *     give a total of 32 temporary registers.
+ *
+ * Relative Addressing
+ *     NV40 can use an address register to index into vertex attribute regs.
+ *     This is done by putting the offset value into INPUT_SRC and setting
+ *     the INDEX_INPUT flag.
+ *
+ * Conditional execution (see NV_vertex_program{2,3} for details)
+ *     There is a second condition code register on NV40, its use is enabled
+ *     by setting the COND_REG_SELECT_1 flag.
+ *
+ * Texture lookup
+ *     TODO
+ */
+
+/* ---- OPCODE BITS 127:96 / data DWORD 0 --- */
+#define NV40_VP_INST_VEC_RESULT                                        (1 << 30)
+/* uncertain.. */
+#define NV40_VP_INST_COND_UPDATE_ENABLE                  ((1 << 14) | (1 << 29))
+/* use address reg as index into attribs */
+#define NV40_VP_INST_INDEX_INPUT                                       (1 << 27)
+#define NV40_VP_INST_COND_REG_SELECT_1                                 (1 << 25)
+#define NV40_VP_INST_ADDR_REG_SELECT_1                                 (1 << 24)
+#define NV40_VP_INST_SRC2_ABS                                          (1 << 23)
+#define NV40_VP_INST_SRC1_ABS                                          (1 << 22)
+#define NV40_VP_INST_SRC0_ABS                                          (1 << 21)
+#define NV40_VP_INST_VEC_DEST_TEMP_SHIFT                                      15
+#define NV40_VP_INST_VEC_DEST_TEMP_MASK                             (0x1F << 15)
+#define NV40_VP_INST_COND_TEST_ENABLE                                  (1 << 13)
+#define NV40_VP_INST_COND_SHIFT                                               10
+#define NV40_VP_INST_COND_MASK                                       (0x7 << 10)
+#    define NV40_VP_INST_COND_FL                                               0
+#    define NV40_VP_INST_COND_LT                                               1
+#    define NV40_VP_INST_COND_EQ                                               2
+#    define NV40_VP_INST_COND_LE                                               3
+#    define NV40_VP_INST_COND_GT                                               4
+#    define NV40_VP_INST_COND_NE                                               5
+#    define NV40_VP_INST_COND_GE                                               6
+#    define NV40_VP_INST_COND_TR                                               7
+#define NV40_VP_INST_COND_SWZ_X_SHIFT                                          8
+#define NV40_VP_INST_COND_SWZ_X_MASK                                    (3 << 8)
+#define NV40_VP_INST_COND_SWZ_Y_SHIFT                                          6
+#define NV40_VP_INST_COND_SWZ_Y_MASK                                    (3 << 6)
+#define NV40_VP_INST_COND_SWZ_Z_SHIFT                                          4
+#define NV40_VP_INST_COND_SWZ_Z_MASK                                    (3 << 4)
+#define NV40_VP_INST_COND_SWZ_W_SHIFT                                          2
+#define NV40_VP_INST_COND_SWZ_W_MASK                                    (3 << 2)
+#define NV40_VP_INST_COND_SWZ_ALL_SHIFT                                        2
+#define NV40_VP_INST_COND_SWZ_ALL_MASK                               (0xFF << 2)
+#define NV40_VP_INST_ADDR_SWZ_SHIFT                                            0
+#define NV40_VP_INST_ADDR_SWZ_MASK                                   (0x03 << 0)
+#define NV40_VP_INST0_KNOWN ( \
+                NV40_VP_INST_INDEX_INPUT | \
+                NV40_VP_INST_COND_REG_SELECT_1 | \
+                NV40_VP_INST_ADDR_REG_SELECT_1 | \
+                NV40_VP_INST_SRC2_ABS | \
+                NV40_VP_INST_SRC1_ABS | \
+                NV40_VP_INST_SRC0_ABS | \
+                NV40_VP_INST_VEC_DEST_TEMP_MASK | \
+                NV40_VP_INST_COND_TEST_ENABLE | \
+                NV40_VP_INST_COND_MASK | \
+                NV40_VP_INST_COND_SWZ_ALL_MASK | \
+                NV40_VP_INST_ADDR_SWZ_MASK)
+
+/* ---- OPCODE BITS 95:64 / data DWORD 1 --- */
+#define NV40_VP_INST_VEC_OPCODE_SHIFT                                         22
+#define NV40_VP_INST_VEC_OPCODE_MASK                                (0x1F << 22)
+#    define NV40_VP_INST_OP_NOP                                             0x00
+#    define NV40_VP_INST_OP_MOV                                             0x01
+#    define NV40_VP_INST_OP_MUL                                             0x02
+#    define NV40_VP_INST_OP_ADD                                             0x03
+#    define NV40_VP_INST_OP_MAD                                             0x04
+#    define NV40_VP_INST_OP_DP3                                             0x05
+#    define NV40_VP_INST_OP_DPH                                             0x06
+#    define NV40_VP_INST_OP_DP4                                             0x07
+#    define NV40_VP_INST_OP_DST                                             0x08
+#    define NV40_VP_INST_OP_MIN                                             0x09
+#    define NV40_VP_INST_OP_MAX                                             0x0A
+#    define NV40_VP_INST_OP_SLT                                             0x0B
+#    define NV40_VP_INST_OP_SGE                                             0x0C
+#    define NV40_VP_INST_OP_ARL                                             0x0D
+#    define NV40_VP_INST_OP_FRC                                             0x0E
+#    define NV40_VP_INST_OP_FLR                                             0x0F
+#    define NV40_VP_INST_OP_SEQ                                             0x10
+#    define NV40_VP_INST_OP_SFL                                             0x11
+#    define NV40_VP_INST_OP_SGT                                             0x12
+#    define NV40_VP_INST_OP_SLE                                             0x13
+#    define NV40_VP_INST_OP_SNE                                             0x14
+#    define NV40_VP_INST_OP_STR                                             0x15
+#    define NV40_VP_INST_OP_SSG                                             0x16
+#    define NV40_VP_INST_OP_ARR                                             0x17
+#    define NV40_VP_INST_OP_ARA                                             0x18
+#    define NV40_VP_INST_OP_TXL                                             0x19
+#define NV40_VP_INST_SCA_OPCODE_SHIFT                                         27
+#define NV40_VP_INST_SCA_OPCODE_MASK                               (0x1FU << 27) /* U: field reaches bit 31 */
+#    define NV40_VP_INST_OP_NOP                                             0x00
+#    define NV40_VP_INST_OP_MOV                                             0x01
+#    define NV40_VP_INST_OP_RCP                                             0x02
+#    define NV40_VP_INST_OP_RCC                                             0x03
+#    define NV40_VP_INST_OP_RSQ                                             0x04
+#    define NV40_VP_INST_OP_EXP                                             0x05
+#    define NV40_VP_INST_OP_LOG                                             0x06
+#    define NV40_VP_INST_OP_LIT                                             0x07
+#    define NV40_VP_INST_OP_BRA                                             0x09
+#    define NV40_VP_INST_OP_CAL                                             0x0B
+#    define NV40_VP_INST_OP_RET                                             0x0C
+#    define NV40_VP_INST_OP_LG2                                             0x0D
+#    define NV40_VP_INST_OP_EX2                                             0x0E
+#    define NV40_VP_INST_OP_SIN                                             0x0F
+#    define NV40_VP_INST_OP_COS                                             0x10
+#    define NV40_VP_INST_OP_PUSHA                                           0x13
+#    define NV40_VP_INST_OP_POPA                                            0x14
+#define NV40_VP_INST_CONST_SRC_SHIFT                                          12
+#define NV40_VP_INST_CONST_SRC_MASK                                 (0xFF << 12)
+#define NV40_VP_INST_INPUT_SRC_SHIFT                                           8
+#define NV40_VP_INST_INPUT_SRC_MASK                                  (0x0F << 8)
+#    define NV40_VP_INST_IN_POS                                                0
+#    define NV40_VP_INST_IN_WEIGHT                                             1
+#    define NV40_VP_INST_IN_NORMAL                                             2
+#    define NV40_VP_INST_IN_COL0                                               3
+#    define NV40_VP_INST_IN_COL1                                               4
+#    define NV40_VP_INST_IN_FOGC                                               5
+#    define NV40_VP_INST_IN_TC0                                                8
+#    define NV40_VP_INST_IN_TC(n)                                        (8+(n)) /* TC0 + n */
+#define NV40_VP_INST_SRC0H_SHIFT                                               0
+#define NV40_VP_INST_SRC0H_MASK                                      (0xFF << 0)
+#define NV40_VP_INST1_KNOWN ( \
+                NV40_VP_INST_VEC_OPCODE_MASK | \
+                NV40_VP_INST_SCA_OPCODE_MASK | \
+                NV40_VP_INST_CONST_SRC_MASK  | \
+                NV40_VP_INST_INPUT_SRC_MASK  | \
+                NV40_VP_INST_SRC0H_MASK \
+                )
+
+/* ---- OPCODE BITS 63:32 / data DWORD 2 --- */
+#define NV40_VP_INST_SRC0L_SHIFT                                              23
+#define NV40_VP_INST_SRC0L_MASK                                   (0x1FFU << 23) /* U: field reaches bit 31 */
+#define NV40_VP_INST_SRC1_SHIFT                                                6
+#define NV40_VP_INST_SRC1_MASK                                    (0x1FFFF << 6)
+#define NV40_VP_INST_SRC2H_SHIFT                                               0
+#define NV40_VP_INST_SRC2H_MASK                                      (0x3F << 0)
+#define NV40_VP_INST_IADDRH_SHIFT                                              0
+#define NV40_VP_INST_IADDRH_MASK                                     (0x1F << 0)
+
+/* ---- OPCODE BITS 31:0 / data DWORD 3 --- */
+#define NV40_VP_INST_IADDRL_SHIFT                                             29
+#define NV40_VP_INST_IADDRL_MASK                                      (7U << 29) /* U: field reaches bit 31 */
+#define NV40_VP_INST_SRC2L_SHIFT                                              21
+#define NV40_VP_INST_SRC2L_MASK                                   (0x7FFU << 21) /* U: field reaches bit 31 */
+#define NV40_VP_INST_SCA_WRITEMASK_SHIFT                                      17
+#define NV40_VP_INST_SCA_WRITEMASK_MASK                              (0xF << 17)
+#    define NV40_VP_INST_SCA_WRITEMASK_X                               (1 << 20)
+#    define NV40_VP_INST_SCA_WRITEMASK_Y                               (1 << 19)
+#    define NV40_VP_INST_SCA_WRITEMASK_Z                               (1 << 18)
+#    define NV40_VP_INST_SCA_WRITEMASK_W                               (1 << 17)
+#define NV40_VP_INST_VEC_WRITEMASK_SHIFT                                      13
+#define NV40_VP_INST_VEC_WRITEMASK_MASK                              (0xF << 13)
+#    define NV40_VP_INST_VEC_WRITEMASK_X                               (1 << 16)
+#    define NV40_VP_INST_VEC_WRITEMASK_Y                               (1 << 15)
+#    define NV40_VP_INST_VEC_WRITEMASK_Z                               (1 << 14)
+#    define NV40_VP_INST_VEC_WRITEMASK_W                               (1 << 13)
+#define NV40_VP_INST_SCA_RESULT                                        (1 << 12)
+#define NV40_VP_INST_SCA_DEST_TEMP_SHIFT                                       7
+#define NV40_VP_INST_SCA_DEST_TEMP_MASK                              (0x1F << 7)
+#define NV40_VP_INST_DEST_SHIFT                                                2
+#define NV40_VP_INST_DEST_MASK                                         (31 << 2)
+#    define NV40_VP_INST_DEST_POS                                              0
+#    define NV40_VP_INST_DEST_COL0                                             1
+#    define NV40_VP_INST_DEST_COL1                                             2
+#    define NV40_VP_INST_DEST_BFC0                                             3
+#    define NV40_VP_INST_DEST_BFC1                                             4
+#    define NV40_VP_INST_DEST_FOGC                                             5
+#    define NV40_VP_INST_DEST_PSZ                                              6
+#    define NV40_VP_INST_DEST_TC0                                              7
+#    define NV40_VP_INST_DEST_TC(n)                                      (7+(n)) /* DEST_TC0 + n */
+#    define NV40_VP_INST_DEST_TEMP                                          0x1F
+#define NV40_VP_INST_INDEX_CONST                                        (1 << 1)
+#define NV40_VP_INST_LAST                                               (1 << 0)
+#define NV40_VP_INST3_KNOWN ( \
+                NV40_VP_INST_SRC2L_MASK |\
+                NV40_VP_INST_SCA_WRITEMASK_MASK |\
+                NV40_VP_INST_VEC_WRITEMASK_MASK |\
+                NV40_VP_INST_SCA_DEST_TEMP_MASK |\
+                NV40_VP_INST_DEST_MASK |\
+                NV40_VP_INST_INDEX_CONST)
+
+/* Useful to split the source selection regs into their pieces */
+#define NV40_VP_SRC0_HIGH_SHIFT                                                9
+#define NV40_VP_SRC0_HIGH_MASK                                        0x0001FE00
+#define NV40_VP_SRC0_LOW_MASK                                         0x000001FF
+#define NV40_VP_SRC2_HIGH_SHIFT                                               11
+#define NV40_VP_SRC2_HIGH_MASK                                        0x0001F800
+#define NV40_VP_SRC2_LOW_MASK                                         0x000007FF
+
+/* Source selection - these are the bits you fill NV40_VP_INST_SRCn with */
+#define NV40_VP_SRC_NEGATE                                             (1 << 16)
+#define NV40_VP_SRC_SWZ_X_SHIFT                                               14
+#define NV40_VP_SRC_SWZ_X_MASK                                         (3 << 14)
+#define NV40_VP_SRC_SWZ_Y_SHIFT                                               12
+#define NV40_VP_SRC_SWZ_Y_MASK                                         (3 << 12)
+#define NV40_VP_SRC_SWZ_Z_SHIFT                                               10
+#define NV40_VP_SRC_SWZ_Z_MASK                                         (3 << 10)
+#define NV40_VP_SRC_SWZ_W_SHIFT                                                8
+#define NV40_VP_SRC_SWZ_W_MASK                                          (3 << 8)
+#define NV40_VP_SRC_SWZ_ALL_SHIFT                                              8
+#define NV40_VP_SRC_SWZ_ALL_MASK                                     (0xFF << 8)
+#define NV40_VP_SRC_TEMP_SRC_SHIFT                                             2
+#define NV40_VP_SRC_TEMP_SRC_MASK                                    (0x1F << 2)
+#define NV40_VP_SRC_REG_TYPE_SHIFT                                             0
+#define NV40_VP_SRC_REG_TYPE_MASK                                       (3 << 0)
+#    define NV40_VP_SRC_REG_TYPE_UNK0                                          0
+#    define NV40_VP_SRC_REG_TYPE_TEMP                                          1
+#    define NV40_VP_SRC_REG_TYPE_INPUT                                         2
+#    define NV40_VP_SRC_REG_TYPE_CONST                                         3
+
+
+/*
+ * Each fragment program opcode appears to be comprised of 4 32-bit values.
+ *
+ *         0 - Opcode, output reg/mask, ATTRIB source
+ *         1 - Source 0
+ *         2 - Source 1
+ *         3 - Source 2
+ *
+ * There appears to be no special difference between result regs and temp regs.
+ *                 result.color == R0.xyzw
+ *                 result.depth == R1.z
+ * When the fragprog contains instructions to write depth,
+ * NV30_TCL_PRIMITIVE_3D_UNK1D78=0 otherwise it is set to 1.
+ *
+ * Constants are inserted directly after the instruction that uses them.
+ * 
+ * It appears that it's not possible to use two input registers in one
+ * instruction as the input sourcing is done in the instruction dword
+ * and not the source selection dwords.  As such instructions such as:
+ * 
+ *                 ADD result.color, fragment.color, fragment.texcoord[0];
+ *
+ * must be split into two MOV's and then an ADD (nvidia does this) but
+ * I'm not sure why it's not just one MOV and then source the second input
+ * in the ADD instruction..
+ *
+ * Negation of the full source is done with NV40_FP_REG_NEGATE, arbitrary
+ * negation requires multiplication with a const.
+ *
+ * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO and
+ * SWIZZLE_ONE.
+ *
+ * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as
+ * SWIZZLE_ZERO is implemented simply by not writing to the relevant components
+ * of the destination.
+ *
+ * Looping
+ *   Loops appear to be fairly expensive on NV40 at least, the proprietary
+ *   driver goes to a lot of effort to avoid using the native looping
+ *   instructions.  If the total number of *executed* instructions between
+ *   REP/ENDREP or LOOP/ENDLOOP is <=500, the driver will unroll the loop.
+ *   The maximum loop count is 255.
+ *
+ * Conditional execution
+ *   TODO
+ * 
+ * Non-native instructions:
+ *         LIT
+ *         LRP - MAD+MAD
+ *         SUB - ADD, negate second source
+ *         RSQ - LG2 + EX2
+ *         POW - LG2 + MUL + EX2
+ *         SCS - COS + SIN
+ *         XPD
+ *         DP2 - MUL + ADD
+ *         NRM
+ */
+
+//== Opcode / Destination selection ==
+#define NV40_FP_OP_PROGRAM_END                                          (1 << 0)
+#define NV40_FP_OP_OUT_REG_SHIFT                                               1
+#define NV40_FP_OP_OUT_REG_MASK                                        (63 << 1)
+/* Needs to be set when writing outputs to get expected result.. */
+#define NV40_FP_OP_OUT_REG_HALF                                         (1 << 7)
+#define NV40_FP_OP_COND_WRITE_ENABLE                                    (1 << 8)
+#define NV40_FP_OP_OUTMASK_SHIFT                                               9
+#define NV40_FP_OP_OUTMASK_MASK                                       (0xF << 9)
+#    define NV40_FP_OP_OUT_X                                            (1 << 9)
+#    define NV40_FP_OP_OUT_Y                                            (1 <<10)
+#    define NV40_FP_OP_OUT_Z                                            (1 <<11)
+#    define NV40_FP_OP_OUT_W                                            (1 <<12)
+/* Uncertain about these, especially the input_src values.. it's possible that
+ * they can be dynamically changed.
+ */
+#define NV40_FP_OP_INPUT_SRC_SHIFT                                            13
+#define NV40_FP_OP_INPUT_SRC_MASK                                     (15 << 13)
+#    define NV40_FP_OP_INPUT_SRC_POSITION                                    0x0
+#    define NV40_FP_OP_INPUT_SRC_COL0                                        0x1
+#    define NV40_FP_OP_INPUT_SRC_COL1                                        0x2
+#    define NV40_FP_OP_INPUT_SRC_FOGC                                        0x3
+#    define NV40_FP_OP_INPUT_SRC_TC0                                         0x4
+#    define NV40_FP_OP_INPUT_SRC_TC(n)                               (0x4 + (n)) /* INPUT_SRC_TC0 + n */
+#    define NV40_FP_OP_INPUT_SRC_FACING                                      0xE
+#define NV40_FP_OP_TEX_UNIT_SHIFT                                             17
+#define NV40_FP_OP_TEX_UNIT_MASK                                     (0xF << 17)
+#define NV40_FP_OP_PRECISION_SHIFT                                            22
+#define NV40_FP_OP_PRECISION_MASK                                      (3 << 22)
+#   define NV40_FP_PRECISION_FP32                                              0
+#   define NV40_FP_PRECISION_FP16                                              1
+#   define NV40_FP_PRECISION_FX12                                              2
+#define NV40_FP_OP_OPCODE_SHIFT                                               24
+#define NV40_FP_OP_OPCODE_MASK                                      (0x3F << 24)
+#        define NV40_FP_OP_OPCODE_NOP                                       0x00
+#        define NV40_FP_OP_OPCODE_MOV                                       0x01
+#        define NV40_FP_OP_OPCODE_MUL                                       0x02
+#        define NV40_FP_OP_OPCODE_ADD                                       0x03
+#        define NV40_FP_OP_OPCODE_MAD                                       0x04
+#        define NV40_FP_OP_OPCODE_DP3                                       0x05
+#        define NV40_FP_OP_OPCODE_DP4                                       0x06
+#        define NV40_FP_OP_OPCODE_DST                                       0x07
+#        define NV40_FP_OP_OPCODE_MIN                                       0x08
+#        define NV40_FP_OP_OPCODE_MAX                                       0x09
+#        define NV40_FP_OP_OPCODE_SLT                                       0x0A
+#        define NV40_FP_OP_OPCODE_SGE                                       0x0B
+#        define NV40_FP_OP_OPCODE_SLE                                       0x0C
+#        define NV40_FP_OP_OPCODE_SGT                                       0x0D
+#        define NV40_FP_OP_OPCODE_SNE                                       0x0E
+#        define NV40_FP_OP_OPCODE_SEQ                                       0x0F
+#        define NV40_FP_OP_OPCODE_FRC                                       0x10
+#        define NV40_FP_OP_OPCODE_FLR                                       0x11
+#        define NV40_FP_OP_OPCODE_KIL                                       0x12
+#        define NV40_FP_OP_OPCODE_PK4B                                      0x13
+#        define NV40_FP_OP_OPCODE_UP4B                                      0x14
+/* DDX/DDY can only write to XY */
+#        define NV40_FP_OP_OPCODE_DDX                                       0x15
+#        define NV40_FP_OP_OPCODE_DDY                                       0x16
+#        define NV40_FP_OP_OPCODE_TEX                                       0x17
+#        define NV40_FP_OP_OPCODE_TXP                                       0x18
+#        define NV40_FP_OP_OPCODE_TXD                                       0x19
+#        define NV40_FP_OP_OPCODE_RCP                                       0x1A
+#        define NV40_FP_OP_OPCODE_EX2                                       0x1C
+#        define NV40_FP_OP_OPCODE_LG2                                       0x1D
+#        define NV40_FP_OP_OPCODE_STR                                       0x20
+#        define NV40_FP_OP_OPCODE_SFL                                       0x21
+#        define NV40_FP_OP_OPCODE_COS                                       0x22
+#        define NV40_FP_OP_OPCODE_SIN                                       0x23
+#        define NV40_FP_OP_OPCODE_PK2H                                      0x24
+#        define NV40_FP_OP_OPCODE_UP2H                                      0x25
+#        define NV40_FP_OP_OPCODE_PK4UB                                     0x27
+#        define NV40_FP_OP_OPCODE_UP4UB                                     0x28
+#        define NV40_FP_OP_OPCODE_PK2US                                     0x29
+#        define NV40_FP_OP_OPCODE_UP2US                                     0x2A
+#        define NV40_FP_OP_OPCODE_DP2A                                      0x2E
+#        define NV40_FP_OP_OPCODE_TXL                                       0x2F
+#        define NV40_FP_OP_OPCODE_TXB                                       0x31
+#        define NV40_FP_OP_OPCODE_DIV                                       0x3A
+#        define NV40_FP_OP_OPCODE_UNK_LIT                                   0x3C
+/* The use of these instructions appears to be indicated by bit 31 of DWORD 2.*/
+#        define NV40_FP_OP_BRA_OPCODE_BRK                                    0x0
+#        define NV40_FP_OP_BRA_OPCODE_CAL                                    0x1
+#        define NV40_FP_OP_BRA_OPCODE_IF                                     0x2
+#        define NV40_FP_OP_BRA_OPCODE_LOOP                                   0x3
+#        define NV40_FP_OP_BRA_OPCODE_REP                                    0x4
+#        define NV40_FP_OP_BRA_OPCODE_RET                                    0x5
+#define NV40_FP_OP_OUT_SAT                                            (1U << 31) /* U: this is bit 31 */
+
+/* high order bits of SRC0 */
+#define NV40_FP_OP_OUT_ABS                                             (1 << 29)
+#define NV40_FP_OP_COND_SWZ_W_SHIFT                                           27
+#define NV40_FP_OP_COND_SWZ_W_MASK                                     (3 << 27)
+#define NV40_FP_OP_COND_SWZ_Z_SHIFT                                           25
+#define NV40_FP_OP_COND_SWZ_Z_MASK                                     (3 << 25)
+#define NV40_FP_OP_COND_SWZ_Y_SHIFT                                           23
+#define NV40_FP_OP_COND_SWZ_Y_MASK                                     (3 << 23)
+#define NV40_FP_OP_COND_SWZ_X_SHIFT                                           21
+#define NV40_FP_OP_COND_SWZ_X_MASK                                     (3 << 21)
+#define NV40_FP_OP_COND_SWZ_ALL_SHIFT                                         21
+#define NV40_FP_OP_COND_SWZ_ALL_MASK                                (0xFF << 21)
+#define NV40_FP_OP_COND_SHIFT                                                 18
+#define NV40_FP_OP_COND_MASK                                        (0x07 << 18)
+#        define NV40_FP_OP_COND_FL                                             0
+#        define NV40_FP_OP_COND_LT                                             1
+#        define NV40_FP_OP_COND_EQ                                             2
+#        define NV40_FP_OP_COND_LE                                             3
+#        define NV40_FP_OP_COND_GT                                             4
+#        define NV40_FP_OP_COND_NE                                             5
+#        define NV40_FP_OP_COND_GE                                             6
+#        define NV40_FP_OP_COND_TR                                             7
+
+/* high order bits of SRC1 */
+#define NV40_FP_OP_OPCODE_IS_BRANCH                                   (1U << 31) /* U: this is bit 31 */
+#define NV40_FP_OP_DST_SCALE_SHIFT                                            28
+#define NV40_FP_OP_DST_SCALE_MASK                                      (3 << 28) /* NOTE(review): 2-bit mask, but the INV_* scale codes (5..7) need a third bit - confirm */
+#define NV40_FP_OP_DST_SCALE_1X                                                0
+#define NV40_FP_OP_DST_SCALE_2X                                                1
+#define NV40_FP_OP_DST_SCALE_4X                                                2
+#define NV40_FP_OP_DST_SCALE_8X                                                3
+#define NV40_FP_OP_DST_SCALE_INV_2X                                            5
+#define NV40_FP_OP_DST_SCALE_INV_4X                                            6
+#define NV40_FP_OP_DST_SCALE_INV_8X                                            7
+
+/* SRC1 LOOP */
+#define NV40_FP_OP_LOOP_INCR_SHIFT                                            19
+#define NV40_FP_OP_LOOP_INCR_MASK                                   (0xFF << 19)
+#define NV40_FP_OP_LOOP_INDEX_SHIFT                                           10
+#define NV40_FP_OP_LOOP_INDEX_MASK                                  (0xFF << 10)
+#define NV40_FP_OP_LOOP_COUNT_SHIFT                                            2
+#define NV40_FP_OP_LOOP_COUNT_MASK                                   (0xFF << 2)
+
+/* SRC1 IF */
+#define NV40_FP_OP_ELSE_ID_SHIFT                                               2
+#define NV40_FP_OP_ELSE_ID_MASK                                      (0xFF << 2)
+
+/* SRC1 CAL */
+#define NV40_FP_OP_IADDR_SHIFT                                                 2
+#define NV40_FP_OP_IADDR_MASK                                        (0xFF << 2)
+
+/* SRC1 REP
+ *   I have no idea why there are 3 count values here..  but they
+ *   have always been filled with the same value in my tests so
+ *   far..
+ */
+#define NV40_FP_OP_REP_COUNT1_SHIFT                                            2
+#define NV40_FP_OP_REP_COUNT1_MASK                                   (0xFF << 2)
+#define NV40_FP_OP_REP_COUNT2_SHIFT                                           10
+#define NV40_FP_OP_REP_COUNT2_MASK                                  (0xFF << 10)
+#define NV40_FP_OP_REP_COUNT3_SHIFT                                           19
+#define NV40_FP_OP_REP_COUNT3_MASK                                  (0xFF << 19)
+
+/* SRC2 REP/IF */
+#define NV40_FP_OP_END_ID_SHIFT                                                2
+#define NV40_FP_OP_END_ID_MASK                                       (0xFF << 2)
+
+// SRC2 high-order
+#define NV40_FP_OP_INDEX_INPUT                                         (1 << 30)
+#define NV40_FP_OP_ADDR_INDEX_SHIFT                                           19
+#define NV40_FP_OP_ADDR_INDEX_MASK                                   (0xF << 19)
+
+//== Register selection ==
+#define NV40_FP_REG_TYPE_SHIFT                                                 0
+#define NV40_FP_REG_TYPE_MASK                                           (3 << 0)
+#        define NV40_FP_REG_TYPE_TEMP                                          0
+#        define NV40_FP_REG_TYPE_INPUT                                         1
+#        define NV40_FP_REG_TYPE_CONST                                         2
+#define NV40_FP_REG_SRC_SHIFT                                                  2
+#define NV40_FP_REG_SRC_MASK                                           (63 << 2)
+#define NV40_FP_REG_SRC_HALF                                            (1 << 8)
+#define NV40_FP_REG_SWZ_ALL_SHIFT                                              9
+#define NV40_FP_REG_SWZ_ALL_MASK                                      (255 << 9)
+#define NV40_FP_REG_SWZ_X_SHIFT                                                9
+#define NV40_FP_REG_SWZ_X_MASK                                          (3 << 9)
+#define NV40_FP_REG_SWZ_Y_SHIFT                                               11
+#define NV40_FP_REG_SWZ_Y_MASK                                         (3 << 11)
+#define NV40_FP_REG_SWZ_Z_SHIFT                                               13
+#define NV40_FP_REG_SWZ_Z_MASK                                         (3 << 13)
+#define NV40_FP_REG_SWZ_W_SHIFT                                               15
+#define NV40_FP_REG_SWZ_W_MASK                                         (3 << 15)
+#        define NV40_FP_SWIZZLE_X                                              0
+#        define NV40_FP_SWIZZLE_Y                                              1
+#        define NV40_FP_SWIZZLE_Z                                              2
+#        define NV40_FP_SWIZZLE_W                                              3
+#define NV40_FP_REG_NEGATE                                             (1 << 17)
+
+#define NV40SR_NONE	0	/* NV40SR_*: presumably the values held in nv40_sreg.type - confirm against callers */
+#define NV40SR_OUTPUT	1
+#define NV40SR_INPUT	2
+#define NV40SR_TEMP	3
+#define NV40SR_CONST	4
+
+struct nv40_sreg {	/* register descriptor; built by nv40_sr() and passed around by value */
+	int type;	/* stored verbatim from nv40_sr(type, ...); presumably an NV40SR_* value */
+	int index;	/* stored verbatim from nv40_sr(..., index) */
+
+	int dst_scale;	/* nv40_sr() default: DEF_SCALE; replaced by nv40_sr_scale() */
+
+	int negate;	/* nv40_sr() default: 0; toggled by nv40_sr_neg() */
+	int abs;	/* nv40_sr() default: 0; set to 1 by nv40_sr_abs() */
+	int swz[4];	/* nv40_sr() default: { 0, 1, 2, 3 }; remapped by nv40_sr_swz() */
+
+	int cc_update;	/* nv40_sr() default: 0 */
+	int cc_update_reg;	/* nv40_sr() default: 0 */
+	int cc_test;	/* nv40_sr() default: DEF_CTEST */
+	int cc_test_reg;	/* nv40_sr() default: 0 */
+	int cc_swz[4];	/* nv40_sr() default: { 0, 1, 2, 3 } */
+};
+
+/* Build a register descriptor for (type, index) with all modifiers at default. */
+static INLINE struct nv40_sreg
+nv40_sr(int type, int index)
+{
+	struct nv40_sreg reg = { 0 };	/* negate, abs, cc_update* and cc_test_reg stay 0 */
+	int c;
+
+	reg.type = type;
+	reg.index = index;
+	reg.dst_scale = DEF_SCALE;
+	reg.cc_test = DEF_CTEST;
+	for (c = 0; c < 4; c++) {
+		/* both swizzles start out as the identity mapping 0, 1, 2, 3 */
+		reg.swz[c] = c;
+		reg.cc_swz[c] = c;
+	}
+	return reg;
+}
+
+/* Re-swizzle src: component i of the result reads src.swz[i-th selector]. */
+static INLINE struct nv40_sreg
+nv40_sr_swz(struct nv40_sreg src, int x, int y, int z, int w)
+{
+	const int pick[4] = { src.swz[x], src.swz[y], src.swz[z], src.swz[w] };
+	src.swz[SWZ_X] = pick[0];	/* src is a by-value copy, so edit it in place */
+	src.swz[SWZ_Y] = pick[1];
+	src.swz[SWZ_Z] = pick[2];
+	src.swz[SWZ_W] = pick[3];
+	return src;
+}
+
+/* Return src with its negate flag toggled; applying it twice cancels out. */
+static INLINE struct nv40_sreg nv40_sr_neg(struct nv40_sreg src)
+{
+	src.negate = (src.negate == 0);	/* by-value copy: caller's register is untouched */
+	return src;
+}
+
+/* Return src with its abs flag set to 1; calling it again changes nothing. */
+static INLINE struct nv40_sreg nv40_sr_abs(struct nv40_sreg src)
+{
+	src.abs = 1;	/* by-value copy: caller's register is untouched */
+	return src;
+}
+
+/* Return src with dst_scale replaced by scale; every other field is kept. */
+static INLINE struct nv40_sreg nv40_sr_scale(struct nv40_sreg src, int scale)
+{
+	src.dst_scale = scale;	/* by-value copy: caller's register is untouched */
+	return src;
+}
+
+#endif
-- 
cgit v1.2.3


From e1cf3f00e546f814effd25e9ccd072c941366444 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Thu, 13 Mar 2008 18:29:56 +1100
Subject: nv40: simple swtnl path (half broken, but getting there)

---
 src/gallium/drivers/nv40/nv40_context.c        |   6 +-
 src/gallium/drivers/nv40/nv40_context.h        |  25 +-
 src/gallium/drivers/nv40/nv40_draw.c           | 334 +++++++++++++++++++++++--
 src/gallium/drivers/nv40/nv40_fragprog.c       |   3 +-
 src/gallium/drivers/nv40/nv40_shader.h         |   2 +
 src/gallium/drivers/nv40/nv40_state.c          |  27 +-
 src/gallium/drivers/nv40/nv40_state.h          |   4 +
 src/gallium/drivers/nv40/nv40_state_clip.c     |   8 +-
 src/gallium/drivers/nv40/nv40_state_emit.c     | 130 +++++++---
 src/gallium/drivers/nv40/nv40_state_viewport.c |  45 +++-
 src/gallium/drivers/nv40/nv40_vbo.c            |  22 +-
 src/gallium/drivers/nv40/nv40_vertprog.c       |  16 +-
 12 files changed, 531 insertions(+), 91 deletions(-)

(limited to 'src/gallium/drivers/nv40/nv40_shader.h')

diff --git a/src/gallium/drivers/nv40/nv40_context.c b/src/gallium/drivers/nv40/nv40_context.c
index 203c843a01..58627443b8 100644
--- a/src/gallium/drivers/nv40/nv40_context.c
+++ b/src/gallium/drivers/nv40/nv40_context.c
@@ -74,8 +74,12 @@ nv40_create(struct pipe_screen *pscreen, unsigned pctx_id)
 	nv40_init_state_functions(nv40);
 	nv40_init_miptree_functions(nv40);
 
+	/* Create, configure, and install fallback swtnl path */
 	nv40->draw = draw_create();
-	assert(nv40->draw);
+	draw_wide_point_threshold(nv40->draw, 9999999.0);
+	draw_wide_line_threshold(nv40->draw, 9999999.0);
+	draw_enable_line_stipple(nv40->draw, FALSE);
+	draw_enable_point_sprites(nv40->draw, FALSE);
 	draw_set_rasterize_stage(nv40->draw, nv40_draw_render_stage(nv40));
 
 	return &nv40->pipe;
diff --git a/src/gallium/drivers/nv40/nv40_context.h b/src/gallium/drivers/nv40/nv40_context.h
index 100c678187..02ca20b801 100644
--- a/src/gallium/drivers/nv40/nv40_context.h
+++ b/src/gallium/drivers/nv40/nv40_context.h
@@ -116,7 +116,20 @@ struct nv40_context {
 
 	/* HW state derived from pipe states */
 	struct nv40_state state;
-	unsigned fallback;
+	struct {
+		struct nv40_vertex_program *vertprog;
+
+		unsigned nr_attribs;
+		unsigned hw[PIPE_MAX_SHADER_INPUTS];
+		unsigned draw[PIPE_MAX_SHADER_INPUTS];
+		unsigned emit[PIPE_MAX_SHADER_INPUTS];
+	} swtnl;
+
+	enum {
+		HW, SWTNL, SWRAST
+	} render_mode;
+	unsigned fallback_swtnl;
+	unsigned fallback_swrast;
 
 	/* Context state */
 	unsigned dirty;
@@ -166,6 +179,10 @@ extern void nv40_screen_init_miptree_functions(struct pipe_screen *pscreen);
 
 /* nv40_draw.c */
 extern struct draw_stage *nv40_draw_render_stage(struct nv40_context *nv40);
+extern boolean nv40_draw_elements_swtnl(struct pipe_context *pipe,
+					struct pipe_buffer *idxbuf,
+					unsigned ib_size, unsigned mode,
+					unsigned start, unsigned count);
 
 /* nv40_vertprog.c */
 extern void nv40_vertprog_destroy(struct nv40_context *,
@@ -179,8 +196,9 @@ extern void nv40_fragprog_destroy(struct nv40_context *,
 extern void nv40_fragtex_bind(struct nv40_context *);
 
 /* nv40_state.c and friends */
-extern void nv40_emit_hw_state(struct nv40_context *nv40);
-extern void nv40_state_tex_update(struct nv40_context *nv40);
+extern boolean nv40_state_validate(struct nv40_context *nv40);
+extern boolean nv40_state_validate_swtnl(struct nv40_context *nv40);
+extern void nv40_state_emit(struct nv40_context *nv40);
 extern struct nv40_state_entry nv40_state_clip;
 extern struct nv40_state_entry nv40_state_rasterizer;
 extern struct nv40_state_entry nv40_state_scissor;
@@ -194,6 +212,7 @@ extern struct nv40_state_entry nv40_state_viewport;
 extern struct nv40_state_entry nv40_state_framebuffer;
 extern struct nv40_state_entry nv40_state_fragtex;
 extern struct nv40_state_entry nv40_state_vbo;
+extern struct nv40_state_entry nv40_state_vtxfmt;
 
 /* nv40_vbo.c */
 extern boolean nv40_draw_arrays(struct pipe_context *, unsigned mode,
diff --git a/src/gallium/drivers/nv40/nv40_draw.c b/src/gallium/drivers/nv40/nv40_draw.c
index a39bb85e99..ce0e0bc6f2 100644
--- a/src/gallium/drivers/nv40/nv40_draw.c
+++ b/src/gallium/drivers/nv40/nv40_draw.c
@@ -1,62 +1,350 @@
-#include "draw/draw_private.h"
 #include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_private.h"
 
 #include "nv40_context.h"
+#define NV40_SHADER_NO_FUCKEDNESS
+#include "nv40_shader.h"
+
+/* Simple, but crappy, swtnl path; hopefully we won't need to hit this very
+ * often at all.  Uses "quadro style" vertex submission + a fixed vertex
+ * layout to avoid the need to generate a vertex program or vtxfmt.
+ */
 
-struct nv40_draw_stage {
-	struct draw_stage draw;
+struct nv40_render_stage {
+	struct draw_stage stage;
 	struct nv40_context *nv40;
+	unsigned prim;
 };
 
+/* Recover the nv40 render stage from the generic draw_stage pointer
+ * handed to the stage callbacks.  The cast relies on 'stage' being the
+ * first member of struct nv40_render_stage.
+ */
+static INLINE struct nv40_render_stage *
+nv40_render_stage(struct draw_stage *stage)
+{
+	struct nv40_render_stage *rs = (struct nv40_render_stage *)stage;
+
+	return rs;
+}
+
+/* Push one vertex produced by the draw module into the command stream
+ * as immediate-mode attribute methods.
+ *
+ * The layout comes from nv40->swtnl: for each of the nr_attribs entries,
+ * draw[] is the index into v->data, hw[] the hardware attribute number
+ * and emit[] the EMIT_* format to send it in.  An unknown format trips
+ * the assert.
+ */
+static INLINE void
+nv40_render_vertex(struct nv40_context *nv40, const struct vertex_header *v)
+{
+	unsigned a;
+
+	for (a = 0; a < nv40->swtnl.nr_attribs; a++) {
+		const unsigned slot = nv40->swtnl.draw[a];
+		const unsigned attr = nv40->swtnl.hw[a];
+		const unsigned fmt = nv40->swtnl.emit[a];
+
+		/* Nothing is sent for omitted attributes. */
+		if (fmt == EMIT_OMIT)
+			continue;
+
+		switch (fmt) {
+		case EMIT_1F:
+			/* NOTE(review): 0x1e40 has no named macro here;
+			 * presumably the base of the 1-float attribute
+			 * methods - confirm against the method headers.
+			 */
+			BEGIN_RING(curie, 0x1e40 + (attr * 4), 1);
+			OUT_RING  (fui(v->data[slot][0]));
+			break;
+		case EMIT_2F:
+			BEGIN_RING(curie, NV40TCL_VTX_ATTR_2F_X(attr), 2);
+			OUT_RING  (fui(v->data[slot][0]));
+			OUT_RING  (fui(v->data[slot][1]));
+			break;
+		case EMIT_3F:
+			BEGIN_RING(curie, NV40TCL_VTX_ATTR_3F_X(attr), 3);
+			OUT_RING  (fui(v->data[slot][0]));
+			OUT_RING  (fui(v->data[slot][1]));
+			OUT_RING  (fui(v->data[slot][2]));
+			break;
+		case EMIT_4F:
+			BEGIN_RING(curie, NV40TCL_VTX_ATTR_4F_X(attr), 4);
+			OUT_RING  (fui(v->data[slot][0]));
+			OUT_RING  (fui(v->data[slot][1]));
+			OUT_RING  (fui(v->data[slot][2]));
+			OUT_RING  (fui(v->data[slot][3]));
+			break;
+		case EMIT_4UB:
+			/* Four floats packed down to one dword of bytes.
+			 * NOTE(review): 0x1940 is likewise unnamed here;
+			 * presumably the 4-ubyte attribute method base.
+			 */
+			BEGIN_RING(curie, 0x1940 + (attr * 4), 1);
+			OUT_RING  (pack_ub4(float_to_ubyte(v->data[slot][0]),
+					    float_to_ubyte(v->data[slot][1]),
+					    float_to_ubyte(v->data[slot][2]),
+					    float_to_ubyte(v->data[slot][3])));
+			break;
+		default:
+			assert(0);
+			break;
+		}
+	}
+}
+
+/* Common back end for the point/line/tri stage callbacks: emit the
+ * 'count' vertices of 'prim' inside a BEGIN_END block of type 'mode'.
+ *
+ * rs->prim tracks the primitive type currently open on the hardware
+ * (NV40TCL_BEGIN_END_STOP when none is), so consecutive primitives of
+ * the same type share one BEGIN_END block.
+ */
+static INLINE void
+nv40_render_prim(struct draw_stage *stage, struct prim_header *prim,
+	       unsigned mode, unsigned count)
+{
+	struct nv40_render_stage *rs = nv40_render_stage(stage);
+	struct nv40_context *nv40 = rs->nv40;
+	struct nouveau_pushbuf *pb = nv40->nvws->channel->pushbuf;
+	/* Space wanted per call: 4xfloat32 vertices + potentially
+	 * 3 begin/end methods.
+	 */
+	const unsigned reserve = (count * 20) + 6;
+	unsigned vtx;
+
+	/* Not enough room left: flush and re-emit state.  This must only
+	 * happen between primitives - the check at the bottom of this
+	 * function is what is supposed to guarantee that.
+	 */
+	if (pb->remaining < reserve) {
+		if (rs->prim != NV40TCL_BEGIN_END_STOP) {
+			NOUVEAU_ERR("AIII, missed flush\n");
+			assert(0);
+		}
+		FIRE_RING();
+		nv40_state_emit(nv40);
+	}
+
+	/* Close whatever primitive type is open and start the new one */
+	if (rs->prim != mode) {
+		if (rs->prim != NV40TCL_BEGIN_END_STOP) {
+			BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+			OUT_RING  (NV40TCL_BEGIN_END_STOP);
+		}
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (mode);
+		rs->prim = mode;
+	}
+
+	/* Vertex data for this primitive */
+	for (vtx = 0; vtx < count; vtx++)
+		nv40_render_vertex(nv40, prim->v[vtx]);
+
+	/* If another primitive of this size might not fit, finish the
+	 * block now so the next call is free to flush.
+	 */
+	if (pb->remaining < reserve) {
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (NV40TCL_BEGIN_END_STOP);
+		rs->prim = NV40TCL_BEGIN_END_STOP;
+	}
+}
+
 static void
-nv40_draw_point(struct draw_stage *draw, struct prim_header *prim)
+nv40_render_point(struct draw_stage *draw, struct prim_header *prim)
 {
-	NOUVEAU_ERR("\n");
+	nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_POINTS, 1);
 }
 
 static void
-nv40_draw_line(struct draw_stage *draw, struct prim_header *prim)
+nv40_render_line(struct draw_stage *draw, struct prim_header *prim)
 {
-	NOUVEAU_ERR("\n");
+	nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_LINES, 2);
 }
 
 static void
-nv40_draw_tri(struct draw_stage *draw, struct prim_header *prim)
+nv40_render_tri(struct draw_stage *draw, struct prim_header *prim)
 {
-	NOUVEAU_ERR("\n");
+	nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_TRIANGLES, 3);
 }
 
 static void
-nv40_draw_flush(struct draw_stage *draw, unsigned flags)
+nv40_render_flush(struct draw_stage *draw, unsigned flags)
 {
+	struct nv40_render_stage *rs = nv40_render_stage(draw);
+	struct nv40_context *nv40 = rs->nv40;
+
+	if (rs->prim != NV40TCL_BEGIN_END_STOP) {
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (NV40TCL_BEGIN_END_STOP);
+		rs->prim = NV40TCL_BEGIN_END_STOP;
+	}
 }
 
 static void
-nv40_draw_reset_stipple_counter(struct draw_stage *draw)
+nv40_render_reset_stipple_counter(struct draw_stage *draw)
 {
-	NOUVEAU_ERR("\n");
 }
 
 static void
-nv40_draw_destroy(struct draw_stage *draw)
+nv40_render_destroy(struct draw_stage *draw)
 {
 	free(draw);
 }
 
+/* Append one pre-encoded move instruction to 'vp'.
+ *
+ *   dst  - destination register ID, OR'd into data[3] at bit 2
+ *   src  - input attribute number, OR'd into data[1] at bit 8 and
+ *          recorded in the vp->ir bitmask
+ *   vor  - bit number to set in the vp->or bitmask, or ~0 for none
+ *   mask - component writemask, OR'd into data[3] at bit 13
+ *
+ * If the instruction array cannot be grown, an error is logged and the
+ * program is left exactly as it was.
+ */
+static INLINE void
+emit_mov(struct nv40_vertex_program *vp,
+	 unsigned dst, unsigned src, unsigned vor, unsigned mask)
+{
+	struct nv40_vertex_program_exec *grown, *inst;
+
+	/* Grow through a temporary: assigning realloc()'s result straight
+	 * to vp->insns would lose the old array on failure, and the
+	 * instruction count must only go up once the slot really exists.
+	 */
+	grown = realloc(vp->insns, sizeof(*grown) * (vp->nr_insns + 1));
+	if (!grown) {
+		NOUVEAU_ERR("out of memory growing swtnl vertex program\n");
+		return;
+	}
+	vp->insns = grown;
+	inst = &vp->insns[vp->nr_insns++];
+
+	inst->data[0] = 0x401f9c6c;
+	inst->data[1] = 0x0040000d | (src << 8);
+	inst->data[2] = 0x8106c083;
+	inst->data[3] = 0x6041ff80 | (dst << 2) | (mask << 13);
+	inst->const_index = -1;
+	inst->has_branch_offset = FALSE;
+
+	vp->ir |= (1 << src);
+	if (vor != ~0U)
+		vp->or |= (1 << vor);
+}
+
+/* Build the fixed vertex program used by the swtnl path.  It consists
+ * solely of moves from input attributes to result registers:
+ *
+ *   attr 0      -> position
+ *   attr 3      -> front and back primary colour
+ *   attr 4      -> front and back secondary colour
+ *   attr 5      -> fog coordinate (writemask 0x8 only)
+ *   attr 8 + i  -> texture coordinate i, for i in 0..7
+ *
+ * The program is marked as already translated.  'nv40' is not used.
+ * Returns NULL if the program structure cannot be allocated.
+ */
+static struct nv40_vertex_program *
+create_drawvp(struct nv40_context *nv40)
+{
+	struct nv40_vertex_program *vp = CALLOC_STRUCT(nv40_vertex_program);
+	unsigned i;
+
+	if (!vp)
+		return NULL;
+
+	emit_mov(vp, NV40_VP_INST_DEST_POS, 0, ~0, 0xf);
+	emit_mov(vp, NV40_VP_INST_DEST_COL0, 3, 0, 0xf);
+	emit_mov(vp, NV40_VP_INST_DEST_COL1, 4, 1, 0xf);
+	emit_mov(vp, NV40_VP_INST_DEST_BFC0, 3, 2, 0xf);
+	emit_mov(vp, NV40_VP_INST_DEST_BFC1, 4, 3, 0xf);
+	emit_mov(vp, NV40_VP_INST_DEST_FOGC, 5, 4, 0x8);
+	for (i = 0; i < 8; i++)
+		emit_mov(vp, NV40_VP_INST_DEST_TC(i), 8 + i, 14 + i, 0xf);
+
+	/* Set bit 0 of the final instruction's last dword (presumably the
+	 * end-of-program marker - confirm against the opcode definitions).
+	 * Never index insns[] with an empty program.
+	 */
+	if (vp->nr_insns)
+		vp->insns[vp->nr_insns - 1].data[3] |= 1;
+	vp->translated = TRUE;
+	return vp;
+}
+
 struct draw_stage *
 nv40_draw_render_stage(struct nv40_context *nv40)
 {
-	struct nv40_draw_stage *nv40draw = CALLOC_STRUCT(nv40_draw_stage);
+	struct nv40_render_stage *render = CALLOC_STRUCT(nv40_render_stage);
+
+	if (!nv40->swtnl.vertprog)
+		nv40->swtnl.vertprog = create_drawvp(nv40);
+
+	render->nv40 = nv40;
+	render->stage.draw = nv40->draw;
+	render->stage.point = nv40_render_point;
+	render->stage.line = nv40_render_line;
+	render->stage.tri = nv40_render_tri;
+	render->stage.flush = nv40_render_flush;
+	render->stage.reset_stipple_counter = nv40_render_reset_stipple_counter;
+	render->stage.destroy = nv40_render_destroy;
+
+	return &render->stage;
+}
+
+boolean
+nv40_draw_elements_swtnl(struct pipe_context *pipe,
+			 struct pipe_buffer *idxbuf, unsigned idxbuf_size,
+			 unsigned mode, unsigned start, unsigned count)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct pipe_winsys *ws = pipe->winsys;
+	unsigned i;
+	void *map;
+
+	if (!nv40_state_validate_swtnl(nv40))
+		return FALSE;
+	nv40_state_emit(nv40);
 
-	nv40draw->nv40 = nv40;
-	nv40draw->draw.draw = nv40->draw;
-	nv40draw->draw.point = nv40_draw_point;
-	nv40draw->draw.line = nv40_draw_line;
-	nv40draw->draw.tri = nv40_draw_tri;
-	nv40draw->draw.flush = nv40_draw_flush;
-	nv40draw->draw.reset_stipple_counter = nv40_draw_reset_stipple_counter;
-	nv40draw->draw.destroy = nv40_draw_destroy;
+	for (i = 0; i < PIPE_ATTRIB_MAX; i++) {
+		if (!nv40->vtxbuf[i].buffer)
+			continue;
+		map = ws->buffer_map(ws, nv40->vtxbuf[i].buffer,
+				     PIPE_BUFFER_USAGE_CPU_READ);
+		draw_set_mapped_vertex_buffer(nv40->draw, i, map);
+	}
 
-	return &nv40draw->draw;
+	if (idxbuf) {
+		map = ws->buffer_map(ws, idxbuf, PIPE_BUFFER_USAGE_CPU_READ);
+		draw_set_mapped_element_buffer(nv40->draw, idxbuf_size, map);
+	} else {
+		draw_set_mapped_element_buffer(nv40->draw, 0, NULL);
+	}
+
+	if (nv40->constbuf[PIPE_SHADER_VERTEX]) {
+		map = ws->buffer_map(ws, nv40->constbuf[PIPE_SHADER_VERTEX],
+				     PIPE_BUFFER_USAGE_CPU_READ);
+		draw_set_mapped_constant_buffer(nv40->draw, map);
+	}
+
+	draw_arrays(nv40->draw, mode, start, count);
+
+	for (i = 0; i < PIPE_ATTRIB_MAX; i++) {
+		if (!nv40->vtxbuf[i].buffer)
+			continue;
+		ws->buffer_unmap(ws, nv40->vtxbuf[i].buffer);
+	}
+
+	if (idxbuf)
+		ws->buffer_unmap(ws, idxbuf);
+
+	if (nv40->constbuf[PIPE_SHADER_VERTEX])
+		ws->buffer_unmap(ws, nv40->constbuf[PIPE_SHADER_VERTEX]);
+
+	draw_flush(nv40->draw);
+	pipe->flush(pipe, 0);
+
+	return TRUE;
 }
 
+/* Append one entry to the swtnl vertex layout in nv40->swtnl.
+ *
+ *   hw       - hardware attribute number the data is sent to
+ *   emit     - EMIT_* format used to send it
+ *   semantic - TGSI semantic name of the draw vertex shader output
+ *   index    - TGSI semantic index of that output
+ *
+ * The draw module is asked which of its vertex shader outputs carries
+ * the given semantic, and that index is stored alongside hw/emit.
+ */
+static INLINE void
+emit_attrib(struct nv40_context *nv40, unsigned hw, unsigned emit,
+	    unsigned semantic, unsigned index)
+{
+	const unsigned vs_out =
+		draw_find_vs_output(nv40->draw, semantic, index);
+	const unsigned slot = nv40->swtnl.nr_attribs;
+
+	nv40->swtnl.nr_attribs = slot + 1;
+	nv40->swtnl.draw[slot] = vs_out;
+	nv40->swtnl.emit[slot] = emit;
+	nv40->swtnl.hw[slot] = hw;
+}
+
+/* Rebuild the swtnl vertex layout (nv40->swtnl.{hw,emit,draw}[]) from
+ * the inputs the bound fragment program reads.
+ *
+ * Attributes are laid out in a fixed order: colours 0-1 (hw attribs
+ * 3-4, as 4 ubytes), generics 0-7 (hw attribs 8-15, as 4 floats), fog
+ * (hw attrib 5, as 1 float), and position last (hw attrib 0, as
+ * 4 floats).  Position is emitted whether or not the fragment program
+ * reads it.
+ *
+ * Always returns FALSE.
+ */
+static boolean
+nv40_state_vtxfmt_validate(struct nv40_context *nv40)
+{
+	struct nv40_fragment_program *fp = nv40->fragprog;
+	unsigned col_mask = 0, tc_mask = 0, want_fog = 0, n;
+
+	/* Collect which colours/generics/fog the fragprog consumes */
+	for (n = 0; n < fp->info.num_inputs; n++) {
+		const unsigned sem_idx = fp->info.input_semantic_index[n];
+
+		switch (fp->info.input_semantic_name[n]) {
+		case TGSI_SEMANTIC_POSITION:
+			break;
+		case TGSI_SEMANTIC_COLOR:
+			col_mask |= (1 << sem_idx);
+			break;
+		case TGSI_SEMANTIC_GENERIC:
+			tc_mask |= (1 << sem_idx);
+			break;
+		case TGSI_SEMANTIC_FOG:
+			want_fog = 1;
+			break;
+		default:
+			assert(0);
+		}
+	}
+
+	/* Start a fresh layout; emit_attrib() appends to it */
+	nv40->swtnl.nr_attribs = 0;
+
+	for (n = 0; n < 2; n++) {
+		if (col_mask & (1 << n))
+			emit_attrib(nv40, 3 + n, EMIT_4UB,
+				    TGSI_SEMANTIC_COLOR, n);
+	}
+
+	for (n = 0; n < 8; n++) {
+		if (tc_mask & (1 << n))
+			emit_attrib(nv40, 8 + n, EMIT_4F,
+				    TGSI_SEMANTIC_GENERIC, n);
+	}
+
+	if (want_fog)
+		emit_attrib(nv40, 5, EMIT_1F, TGSI_SEMANTIC_FOG, 0);
+
+	/* Position goes out last */
+	emit_attrib(nv40, 0, EMIT_4F, TGSI_SEMANTIC_POSITION, 0);
+
+	return FALSE;
+}
+
+/* State entry for the swtnl vertex layout.  Its validate hook is
+ * nv40_state_vtxfmt_validate, and its pipe dirty mask names the vertex
+ * array and fragment program state bits; no hw dirty bit is set.
+ */
+struct nv40_state_entry nv40_state_vtxfmt = {
+	.validate = nv40_state_vtxfmt_validate,
+	.dirty = {
+		.pipe = NV40_NEW_ARRAYS | NV40_NEW_FRAGPROG,
+		.hw = 0
+	}
+};
+
diff --git a/src/gallium/drivers/nv40/nv40_fragprog.c b/src/gallium/drivers/nv40/nv40_fragprog.c
index 953f9cd908..82dbcd3eef 100644
--- a/src/gallium/drivers/nv40/nv40_fragprog.c
+++ b/src/gallium/drivers/nv40/nv40_fragprog.c
@@ -797,9 +797,10 @@ nv40_fragprog_validate(struct nv40_context *nv40)
 	if (fp->translated)
 		goto update_constants;
 
+	nv40->fallback_swrast &= ~NV40_NEW_FRAGPROG;
 	nv40_fragprog_translate(nv40, fp);
 	if (!fp->translated) {
-		nv40->fallback |= NV40_FALLBACK_RAST;
+		nv40->fallback_swrast |= NV40_NEW_FRAGPROG;
 		return FALSE;
 	}
 
diff --git a/src/gallium/drivers/nv40/nv40_shader.h b/src/gallium/drivers/nv40/nv40_shader.h
index 5909c70713..854dccf548 100644
--- a/src/gallium/drivers/nv40/nv40_shader.h
+++ b/src/gallium/drivers/nv40/nv40_shader.h
@@ -476,6 +476,7 @@
 #        define NV40_FP_SWIZZLE_W                                              3
 #define NV40_FP_REG_NEGATE                                             (1 << 17)
 
+#ifndef NV40_SHADER_NO_FUCKEDNESS
 #define NV40SR_NONE	0
 #define NV40SR_OUTPUT	1
 #define NV40SR_INPUT	2
@@ -550,5 +551,6 @@ nv40_sr_scale(struct nv40_sreg src, int scale)
 	src.dst_scale = scale;
 	return src;
 }
+#endif
 
 #endif
diff --git a/src/gallium/drivers/nv40/nv40_state.c b/src/gallium/drivers/nv40/nv40_state.c
index 321d5de041..3eafbece30 100644
--- a/src/gallium/drivers/nv40/nv40_state.c
+++ b/src/gallium/drivers/nv40/nv40_state.c
@@ -3,6 +3,8 @@
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 
+#include "draw/draw_context.h"
+
 #include "nv40_context.h"
 #include "nv40_state.h"
 
@@ -345,7 +347,7 @@ nv40_rasterizer_state_create(struct pipe_context *pipe,
 			so_data(so, NV40TCL_CULL_FACE_FRONT_AND_BACK);
 			break;
 		default:
-			so_data(so, 0);
+			so_data(so, NV40TCL_CULL_FACE_BACK);
 			break;
 		}
 		so_data(so, NV40TCL_FRONT_FACE_CCW);
@@ -363,13 +365,13 @@ nv40_rasterizer_state_create(struct pipe_context *pipe,
 			so_data(so, NV40TCL_CULL_FACE_FRONT_AND_BACK);
 			break;
 		default:
-			so_data(so, 0);
+			so_data(so, NV40TCL_CULL_FACE_BACK);
 			break;
 		}
 		so_data(so, NV40TCL_FRONT_FACE_CW);
 	}
 	so_data(so, cso->poly_smooth ? 1 : 0);
-	so_data(so, cso->cull_mode != PIPE_WINDING_NONE ? 1 : 0);
+	so_data(so, (cso->cull_mode != PIPE_WINDING_NONE) ? 1 : 0);
 
 	so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
 	so_data  (so, cso->poly_stipple_enable ? 1 : 0);
@@ -419,6 +421,9 @@ static void
 nv40_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_rasterizer_state *rsso = hwcso;
+
+	draw_set_rasterizer_state(nv40->draw, &rsso->pipe);
 
 	nv40->rasterizer = hwcso;
 	nv40->dirty |= NV40_NEW_RAST;
@@ -508,10 +513,12 @@ static void *
 nv40_vp_state_create(struct pipe_context *pipe,
 		     const struct pipe_shader_state *cso)
 {
+	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nv40_vertex_program *vp;
 
 	vp = CALLOC(1, sizeof(struct nv40_vertex_program));
 	vp->pipe = *cso;
+	vp->draw = draw_create_vertex_shader(nv40->draw, &vp->pipe);
 
 	return (void *)vp;
 }
@@ -520,6 +527,9 @@ static void
 nv40_vp_state_bind(struct pipe_context *pipe, void *hwcso)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_vertex_program *vp = hwcso;
+
+	draw_bind_vertex_shader(nv40->draw, vp ? vp->draw : NULL);
 
 	nv40->vertprog = hwcso;
 	nv40->dirty |= NV40_NEW_VERTPROG;
@@ -531,6 +541,7 @@ nv40_vp_state_delete(struct pipe_context *pipe, void *hwcso)
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nv40_vertex_program *vp = hwcso;
 
+	draw_delete_vertex_shader(nv40->draw, vp->draw);
 	nv40_vertprog_destroy(nv40, vp);
 	FREE(vp);
 }
@@ -544,6 +555,8 @@ nv40_fp_state_create(struct pipe_context *pipe,
 	fp = CALLOC(1, sizeof(struct nv40_fragment_program));
 	fp->pipe = *cso;
 
+	tgsi_scan_shader(fp->pipe.tokens, &fp->info);
+
 	return (void *)fp;
 }
 
@@ -582,6 +595,8 @@ nv40_set_clip_state(struct pipe_context *pipe,
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 
+	draw_set_clip_state(nv40->draw, clip);
+
 	nv40->clip = *clip;
 	nv40->dirty |= NV40_NEW_UCP;
 }
@@ -638,6 +653,8 @@ nv40_set_viewport_state(struct pipe_context *pipe,
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 
+	draw_set_viewport_state(nv40->draw, vpt);
+
 	nv40->viewport = *vpt;
 	nv40->dirty |= NV40_NEW_VIEWPORT;
 }
@@ -648,6 +665,8 @@ nv40_set_vertex_buffer(struct pipe_context *pipe, unsigned index,
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 
+	draw_set_vertex_buffer(nv40->draw, index, vb);
+
 	nv40->vtxbuf[index] = *vb;
 	nv40->dirty |= NV40_NEW_ARRAYS;
 }
@@ -658,6 +677,8 @@ nv40_set_vertex_element(struct pipe_context *pipe, unsigned index,
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 
+	draw_set_vertex_element(nv40->draw, index, ve);
+
 	nv40->vtxelt[index] = *ve;
 	nv40->dirty |= NV40_NEW_ARRAYS;
 }
diff --git a/src/gallium/drivers/nv40/nv40_state.h b/src/gallium/drivers/nv40/nv40_state.h
index a02ea0c878..ab2866eb7a 100644
--- a/src/gallium/drivers/nv40/nv40_state.h
+++ b/src/gallium/drivers/nv40/nv40_state.h
@@ -2,6 +2,7 @@
 #define __NV40_STATE_H__
 
 #include "pipe/p_state.h"
+#include "tgsi/util/tgsi_scan.h"
 
 struct nv40_sampler_state {
 	uint32_t fmt;
@@ -25,6 +26,8 @@ struct nv40_vertex_program_data {
 struct nv40_vertex_program {
 	struct pipe_shader_state pipe;
 
+	struct draw_vertex_shader *draw;
+
 	boolean translated;
 	struct nv40_vertex_program_exec *insns;
 	unsigned nr_insns;
@@ -49,6 +52,7 @@ struct nv40_fragment_program_data {
 
 struct nv40_fragment_program {
 	struct pipe_shader_state pipe;
+	struct tgsi_shader_info info;
 
 	boolean translated;
 	unsigned samplers;
diff --git a/src/gallium/drivers/nv40/nv40_state_clip.c b/src/gallium/drivers/nv40/nv40_state_clip.c
index 93e690161f..c52390f9ed 100644
--- a/src/gallium/drivers/nv40/nv40_state_clip.c
+++ b/src/gallium/drivers/nv40/nv40_state_clip.c
@@ -3,8 +3,12 @@
 static boolean
 nv40_state_clip_validate(struct nv40_context *nv40)
 {
-	if (nv40->clip.nr)
-		nv40->fallback |= NV40_FALLBACK_TNL;
+
+	if (nv40->render_mode == HW) {
+		nv40->fallback_swtnl &= ~NV40_NEW_UCP;
+		if (nv40->clip.nr)
+			nv40->fallback_swtnl |= NV40_NEW_UCP;
+	}
 
 	return FALSE;
 }
diff --git a/src/gallium/drivers/nv40/nv40_state_emit.c b/src/gallium/drivers/nv40/nv40_state_emit.c
index 9f268640e0..056238cc83 100644
--- a/src/gallium/drivers/nv40/nv40_state_emit.c
+++ b/src/gallium/drivers/nv40/nv40_state_emit.c
@@ -1,5 +1,6 @@
 #include "nv40_context.h"
 #include "nv40_state.h"
+#include "draw/draw_context.h"
 
 static struct nv40_state_entry *render_states[] = {
 	&nv40_state_framebuffer,
@@ -18,15 +19,27 @@ static struct nv40_state_entry *render_states[] = {
 	NULL
 };
 
+/* NULL-terminated list of state entries validated for the swtnl path;
+ * nv40_state_validate_swtnl() hands it to nv40_state_do_validate(),
+ * which walks it in order.  The list ends with nv40_state_vtxfmt and
+ * contains no nv40_state_vbo entry.
+ */
+static struct nv40_state_entry *swtnl_states[] = {
+	&nv40_state_framebuffer,
+	&nv40_state_rasterizer,
+	&nv40_state_clip,
+	&nv40_state_scissor,
+	&nv40_state_stipple,
+	&nv40_state_fragprog,
+	&nv40_state_fragtex,
+	&nv40_state_vertprog,
+	&nv40_state_blend,
+	&nv40_state_blend_colour,
+	&nv40_state_zsa,
+	&nv40_state_viewport,
+	&nv40_state_vtxfmt,
+	NULL
+};
+
 static void
-nv40_state_validate(struct nv40_context *nv40)
+nv40_state_do_validate(struct nv40_context *nv40,
+		       struct nv40_state_entry **states)
 {
-	struct nv40_state_entry **states = render_states;
-	unsigned last_fallback;
-
-	last_fallback = nv40->fallback;
-	nv40->fallback = 0;
-
 	while (*states) {
 		struct nv40_state_entry *e = *states;
 
@@ -38,32 +51,15 @@ nv40_state_validate(struct nv40_context *nv40)
 		states++;
 	}
 	nv40->dirty = 0;
-
-	if (nv40->fallback & NV40_FALLBACK_TNL &&
-	    !(last_fallback & NV40_FALLBACK_TNL)) {
-		NOUVEAU_ERR("XXX: hwtnl->swtnl\n");
-	} else
-	if (last_fallback & NV40_FALLBACK_TNL &&
-	    !(nv40->fallback & NV40_FALLBACK_TNL)) {
-		NOUVEAU_ERR("XXX: swtnl->hwtnl\n");
-	}
-
-	if (nv40->fallback & NV40_FALLBACK_RAST &&
-	    !(last_fallback & NV40_FALLBACK_RAST)) {
-		NOUVEAU_ERR("XXX: hwrast->swrast\n");
-	} else
-	if (last_fallback & NV40_FALLBACK_RAST &&
-	    !(nv40->fallback & NV40_FALLBACK_RAST)) {
-		NOUVEAU_ERR("XXX: swrast->hwrast\n");
-	}
 }
 
-static void
+void
 nv40_state_emit(struct nv40_context *nv40)
 {
 	struct nv40_state *state = &nv40->state;
 	struct nv40_screen *screen = nv40->screen;
 	unsigned i, samplers;
+	uint64 states;
 
 	if (nv40->pctx_id != screen->cur_pctx) {
 		for (i = 0; i < NV40_STATE_MAX; i++) {
@@ -74,14 +70,24 @@ nv40_state_emit(struct nv40_context *nv40)
 		screen->cur_pctx = nv40->pctx_id;
 	}
 
-	while (state->dirty) {
-		unsigned idx = ffsll(state->dirty) - 1;
+	for (i = 0, states = state->dirty; states; i++) {
+		if (!(states & (1ULL << i)))
+			continue;
+		so_ref (state->hw[i], &nv40->screen->state[i]);
+		so_emit(nv40->nvws, nv40->screen->state[i]);
+		states &= ~(1ULL << i);
+	}
 
-		so_ref (state->hw[idx], &nv40->screen->state[idx]);
-		so_emit(nv40->nvws, nv40->screen->state[idx]);
-		state->dirty &= ~(1ULL << idx);
+	if (state->dirty & ((1ULL << NV40_STATE_FRAGPROG) |
+			    (1ULL << NV40_STATE_FRAGTEX0))) {
+		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
+		OUT_RING  (2);
+		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
+		OUT_RING  (1);
 	}
 
+	state->dirty = 0;
+
 	so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_FB]);
 	for (i = 0, samplers = state->fp_samplers; i < 16 && samplers; i++) {
 		if (!(samplers & (1 << i)))
@@ -91,18 +97,62 @@ nv40_state_emit(struct nv40_context *nv40)
 		samplers &= ~(1ULL << i);
 	}
 	so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_FRAGPROG]);
-	so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_VTXBUF]);
+	if (state->hw[NV40_STATE_VTXBUF] && nv40->render_mode == HW)
+		so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_VTXBUF]);
 }
 
-void
-nv40_emit_hw_state(struct nv40_context *nv40)
+boolean
+nv40_state_validate(struct nv40_context *nv40)
 {
-	nv40_state_validate(nv40);
-	nv40_state_emit(nv40);
+	boolean was_sw = nv40->fallback_swtnl ? TRUE : FALSE;
+
+	if (nv40->render_mode != HW) {
+		/* Don't even bother trying to go back to hw if none
+		 * of the states that caused swtnl previously have changed.
+		 */
+		if ((nv40->fallback_swtnl & nv40->dirty)
+				!= nv40->fallback_swtnl)
+			return FALSE;
+
+		/* Attempt to go to hwtnl again */
+		nv40->pipe.flush(&nv40->pipe, 0);
+		nv40->dirty |= (NV40_NEW_VIEWPORT |
+				NV40_NEW_VERTPROG |
+				NV40_NEW_ARRAYS |
+				NV40_NEW_UCP);
+		nv40->render_mode = HW;
+	}
+
+	nv40_state_do_validate(nv40, render_states);
+	if (nv40->fallback_swtnl || nv40->fallback_swrast)
+		return FALSE;
+	
+	if (was_sw)
+		NOUVEAU_ERR("swtnl->hw\n");
+
+	return TRUE;
+}
+
+boolean
+nv40_state_validate_swtnl(struct nv40_context *nv40)
+{
+	/* Setup for swtnl */
+	if (nv40->render_mode == HW) {
+		NOUVEAU_ERR("hw->swtnl 0x%08x\n", nv40->fallback_swtnl);
+		nv40->pipe.flush(&nv40->pipe, 0);
+		nv40->dirty |= (NV40_NEW_VIEWPORT |
+				NV40_NEW_VERTPROG |
+				NV40_NEW_ARRAYS |
+				NV40_NEW_UCP);
+		nv40->render_mode = SWTNL;
+	}
+
+	nv40_state_do_validate(nv40, swtnl_states);
+	if (nv40->fallback_swrast) {
+		NOUVEAU_ERR("swtnl->swrast 0x%08x\n", nv40->fallback_swrast);
+		return FALSE;
+	}
 
-	BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
-	OUT_RING  (2);
-	BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
-	OUT_RING  (1);
+	return TRUE;
 }
 
diff --git a/src/gallium/drivers/nv40/nv40_state_viewport.c b/src/gallium/drivers/nv40/nv40_state_viewport.c
index 3a32533907..9e5c7a72a7 100644
--- a/src/gallium/drivers/nv40/nv40_state_viewport.c
+++ b/src/gallium/drivers/nv40/nv40_state_viewport.c
@@ -3,18 +3,43 @@
 static boolean
 nv40_state_viewport_validate(struct nv40_context *nv40)
 {
-	struct nouveau_stateobj *so = so_new(9, 0);
+	struct nouveau_stateobj *so = so_new(11, 0);
 	struct pipe_viewport_state *vpt = &nv40->viewport;
 
-	so_method(so, nv40->screen->curie, NV40TCL_VIEWPORT_TRANSLATE_X, 8);
-	so_data  (so, fui(vpt->translate[0]));
-	so_data  (so, fui(vpt->translate[1]));
-	so_data  (so, fui(vpt->translate[2]));
-	so_data  (so, fui(vpt->translate[3]));
-	so_data  (so, fui(vpt->scale[0]));
-	so_data  (so, fui(vpt->scale[1]));
-	so_data  (so, fui(vpt->scale[2]));
-	so_data  (so, fui(vpt->scale[3]));
+	if (nv40->render_mode == HW) {
+		so_method(so, nv40->screen->curie,
+			  NV40TCL_VIEWPORT_TRANSLATE_X, 8);
+		so_data  (so, fui(vpt->translate[0]));
+		so_data  (so, fui(vpt->translate[1]));
+		so_data  (so, fui(vpt->translate[2]));
+		so_data  (so, fui(vpt->translate[3]));
+		so_data  (so, fui(vpt->scale[0]));
+		so_data  (so, fui(vpt->scale[1]));
+		so_data  (so, fui(vpt->scale[2]));
+		so_data  (so, fui(vpt->scale[3]));
+		so_method(so, nv40->screen->curie, 0x1d78, 1);
+		so_data  (so, 1);
+	} else {
+		so_method(so, nv40->screen->curie,
+			  NV40TCL_VIEWPORT_TRANSLATE_X, 8);
+		so_data  (so, fui(0.0));
+		so_data  (so, fui(0.0));
+		so_data  (so, fui(0.0));
+		so_data  (so, fui(0.0));
+		so_data  (so, fui(1.0));
+		so_data  (so, fui(1.0));
+		so_data  (so, fui(1.0));
+		so_data  (so, fui(0.0));
+		/* Not entirely certain what this is yet.  The DDX uses this
+		 * value also as it fixes rendering when you pass
+		 * pre-transformed vertices to the GPU.  My best guess is that
+		 * this bypasses some culling/clipping stage.  Might be worth
+		 * noting that points/lines are unaffected by whatever this
+		 * value fixes, only filled polygons are affected.
+		 */
+		so_method(so, nv40->screen->curie, 0x1d78, 1);
+		so_data  (so, 0x110);
+	}
 
 	so_ref(so, &nv40->state.hw[NV40_STATE_VIEWPORT]);
 	return TRUE;
diff --git a/src/gallium/drivers/nv40/nv40_vbo.c b/src/gallium/drivers/nv40/nv40_vbo.c
index f16afc23b8..fad423fdf8 100644
--- a/src/gallium/drivers/nv40/nv40_vbo.c
+++ b/src/gallium/drivers/nv40/nv40_vbo.c
@@ -8,6 +8,8 @@
 #include "nouveau/nouveau_channel.h"
 #include "nouveau/nouveau_pushbuf.h"
 
+#define FORCE_SWTNL 0
+
 static INLINE int
 nv40_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp)
 {
@@ -165,7 +167,11 @@ nv40_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
 	unsigned nr;
 
 	nv40_vbo_set_idxbuf(nv40, NULL, 0);
-	nv40_emit_hw_state(nv40);
+	if (FORCE_SWTNL || !nv40_state_validate(nv40)) {
+		return nv40_draw_elements_swtnl(pipe, NULL, 0,
+						mode, start, count);
+	}
+	nv40_state_emit(nv40);
 
 	BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
 	OUT_RING  (nvgl_primitive(mode));
@@ -274,7 +280,7 @@ nv40_draw_elements_inline(struct pipe_context *pipe,
 	struct pipe_winsys *ws = pipe->winsys;
 	void *map;
 
-	nv40_emit_hw_state(nv40);
+	nv40_state_emit(nv40);
 
 	map = ws->buffer_map(ws, ib, PIPE_BUFFER_USAGE_CPU_READ);
 	if (!ib) {
@@ -315,7 +321,7 @@ nv40_draw_elements_vbo(struct pipe_context *pipe,
 	struct nv40_context *nv40 = nv40_context(pipe);
 	unsigned nr;
 
-	nv40_emit_hw_state(nv40);
+	nv40_state_emit(nv40);
 
 	BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
 	OUT_RING  (nvgl_primitive(mode));
@@ -352,8 +358,16 @@ nv40_draw_elements(struct pipe_context *pipe,
 		   unsigned mode, unsigned start, unsigned count)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
+	boolean idxbuf;
+
+	idxbuf = nv40_vbo_set_idxbuf(nv40, indexBuffer, indexSize);
+	if (FORCE_SWTNL || !nv40_state_validate(nv40)) {
+		return nv40_draw_elements_swtnl(pipe, NULL, 0,
+						mode, start, count);
+	}
+	nv40_state_emit(nv40);
 
-	if (nv40_vbo_set_idxbuf(nv40, indexBuffer, indexSize)) {
+	if (idxbuf) {
 		nv40_draw_elements_vbo(pipe, mode, start, count);
 	} else {
 		nv40_draw_elements_inline(pipe, indexBuffer, indexSize,
diff --git a/src/gallium/drivers/nv40/nv40_vertprog.c b/src/gallium/drivers/nv40/nv40_vertprog.c
index 3d730c1a32..9f1ee575ce 100644
--- a/src/gallium/drivers/nv40/nv40_vertprog.c
+++ b/src/gallium/drivers/nv40/nv40_vertprog.c
@@ -634,21 +634,29 @@ out_err:
 static boolean
 nv40_vertprog_validate(struct nv40_context *nv40)
 { 
-	struct nv40_vertex_program *vp = nv40->vertprog;
-	struct pipe_buffer *constbuf =
-		nv40->constbuf[PIPE_SHADER_VERTEX];
 	struct nouveau_winsys *nvws = nv40->nvws;
 	struct pipe_winsys *ws = nv40->pipe.winsys;
+	struct nv40_vertex_program *vp;
+	struct pipe_buffer *constbuf;
 	boolean upload_code = FALSE, upload_data = FALSE;
 	int i;
 
+	if (nv40->render_mode == HW) {
+		vp = nv40->vertprog;
+		constbuf = nv40->constbuf[PIPE_SHADER_VERTEX];
+	} else {
+		vp = nv40->swtnl.vertprog;
+		constbuf = NULL;
+	}
+
 	/* Translate TGSI shader into hw bytecode */
 	if (vp->translated)
 		goto check_gpu_resources;
 
+	nv40->fallback_swtnl &= ~NV40_NEW_VERTPROG;
 	nv40_vertprog_translate(nv40, vp);
 	if (!vp->translated) {
-		nv40->fallback |= NV40_FALLBACK_TNL;
+		nv40->fallback_swtnl |= NV40_NEW_VERTPROG;
 		return FALSE;
 	}
 
-- 
cgit v1.2.3