From 26add9288c88108e3485ffc57c51ea9bdc0ee719 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Sat, 16 Feb 2008 17:23:12 +1100 Subject: nouveau: match gallium code reorginisation. That was... fun.. --- src/gallium/drivers/nv40/nv40_shader.h | 554 +++++++++++++++++++++++++++++++++ 1 file changed, 554 insertions(+) create mode 100644 src/gallium/drivers/nv40/nv40_shader.h (limited to 'src/gallium/drivers/nv40/nv40_shader.h') diff --git a/src/gallium/drivers/nv40/nv40_shader.h b/src/gallium/drivers/nv40/nv40_shader.h new file mode 100644 index 0000000000..5909c70713 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_shader.h @@ -0,0 +1,554 @@ +#ifndef __NV40_SHADER_H__ +#define __NV40_SHADER_H__ + +/* Vertex programs instruction set + * + * The NV40 instruction set is very similar to NV30. Most fields are in + * a slightly different position in the instruction however. + * + * Merged instructions + * In some cases it is possible to put two instructions into one opcode + * slot. The rules for when this is OK is not entirely clear to me yet. + * + * There are separate writemasks and dest temp register fields for each + * grouping of instructions. There is however only one field with the + * ID of a result register. Writing to temp/result regs is selected by + * setting VEC_RESULT/SCA_RESULT. + * + * Temporary registers + * The source/dest temp register fields have been extended by 1 bit, to + * give a total of 32 temporary registers. + * + * Relative Addressing + * NV40 can use an address register to index into vertex attribute regs. + * This is done by putting the offset value into INPUT_SRC and setting + * the INDEX_INPUT flag. + * + * Conditional execution (see NV_vertex_program{2,3} for details) + * There is a second condition code register on NV40, it's use is enabled + * by setting the COND_REG_SELECT_1 flag. + * + * Texture lookup + * TODO + */ + +/* ---- OPCODE BITS 127:96 / data DWORD 0 --- */ +#define NV40_VP_INST_VEC_RESULT (1 << 30) +/* uncertain.. */ +#define NV40_VP_INST_COND_UPDATE_ENABLE ((1 << 14)|1<<29) +/* use address reg as index into attribs */ +#define NV40_VP_INST_INDEX_INPUT (1 << 27) +#define NV40_VP_INST_COND_REG_SELECT_1 (1 << 25) +#define NV40_VP_INST_ADDR_REG_SELECT_1 (1 << 24) +#define NV40_VP_INST_SRC2_ABS (1 << 23) +#define NV40_VP_INST_SRC1_ABS (1 << 22) +#define NV40_VP_INST_SRC0_ABS (1 << 21) +#define NV40_VP_INST_VEC_DEST_TEMP_SHIFT 15 +#define NV40_VP_INST_VEC_DEST_TEMP_MASK (0x1F << 15) +#define NV40_VP_INST_COND_TEST_ENABLE (1 << 13) +#define NV40_VP_INST_COND_SHIFT 10 +#define NV40_VP_INST_COND_MASK (0x7 << 10) +# define NV40_VP_INST_COND_FL 0 +# define NV40_VP_INST_COND_LT 1 +# define NV40_VP_INST_COND_EQ 2 +# define NV40_VP_INST_COND_LE 3 +# define NV40_VP_INST_COND_GT 4 +# define NV40_VP_INST_COND_NE 5 +# define NV40_VP_INST_COND_GE 6 +# define NV40_VP_INST_COND_TR 7 +#define NV40_VP_INST_COND_SWZ_X_SHIFT 8 +#define NV40_VP_INST_COND_SWZ_X_MASK (3 << 8) +#define NV40_VP_INST_COND_SWZ_Y_SHIFT 6 +#define NV40_VP_INST_COND_SWZ_Y_MASK (3 << 6) +#define NV40_VP_INST_COND_SWZ_Z_SHIFT 4 +#define NV40_VP_INST_COND_SWZ_Z_MASK (3 << 4) +#define NV40_VP_INST_COND_SWZ_W_SHIFT 2 +#define NV40_VP_INST_COND_SWZ_W_MASK (3 << 2) +#define NV40_VP_INST_COND_SWZ_ALL_SHIFT 2 +#define NV40_VP_INST_COND_SWZ_ALL_MASK (0xFF << 2) +#define NV40_VP_INST_ADDR_SWZ_SHIFT 0 +#define NV40_VP_INST_ADDR_SWZ_MASK (0x03 << 0) +#define NV40_VP_INST0_KNOWN ( \ + NV40_VP_INST_INDEX_INPUT | \ + NV40_VP_INST_COND_REG_SELECT_1 | \ + NV40_VP_INST_ADDR_REG_SELECT_1 | \ + NV40_VP_INST_SRC2_ABS | \ + NV40_VP_INST_SRC1_ABS | \ + NV40_VP_INST_SRC0_ABS | \ + NV40_VP_INST_VEC_DEST_TEMP_MASK | \ + NV40_VP_INST_COND_TEST_ENABLE | \ + NV40_VP_INST_COND_MASK | \ + NV40_VP_INST_COND_SWZ_ALL_MASK | \ + NV40_VP_INST_ADDR_SWZ_MASK) + +/* ---- OPCODE BITS 95:64 / data DWORD 1 --- */ +#define NV40_VP_INST_VEC_OPCODE_SHIFT 22 +#define NV40_VP_INST_VEC_OPCODE_MASK (0x1F << 22) +# define NV40_VP_INST_OP_NOP 0x00 +# define NV40_VP_INST_OP_MOV 0x01 +# define NV40_VP_INST_OP_MUL 0x02 +# define NV40_VP_INST_OP_ADD 0x03 +# define NV40_VP_INST_OP_MAD 0x04 +# define NV40_VP_INST_OP_DP3 0x05 +# define NV40_VP_INST_OP_DPH 0x06 +# define NV40_VP_INST_OP_DP4 0x07 +# define NV40_VP_INST_OP_DST 0x08 +# define NV40_VP_INST_OP_MIN 0x09 +# define NV40_VP_INST_OP_MAX 0x0A +# define NV40_VP_INST_OP_SLT 0x0B +# define NV40_VP_INST_OP_SGE 0x0C +# define NV40_VP_INST_OP_ARL 0x0D +# define NV40_VP_INST_OP_FRC 0x0E +# define NV40_VP_INST_OP_FLR 0x0F +# define NV40_VP_INST_OP_SEQ 0x10 +# define NV40_VP_INST_OP_SFL 0x11 +# define NV40_VP_INST_OP_SGT 0x12 +# define NV40_VP_INST_OP_SLE 0x13 +# define NV40_VP_INST_OP_SNE 0x14 +# define NV40_VP_INST_OP_STR 0x15 +# define NV40_VP_INST_OP_SSG 0x16 +# define NV40_VP_INST_OP_ARR 0x17 +# define NV40_VP_INST_OP_ARA 0x18 +# define NV40_VP_INST_OP_TXL 0x19 +#define NV40_VP_INST_SCA_OPCODE_SHIFT 27 +#define NV40_VP_INST_SCA_OPCODE_MASK (0x1F << 27) +# define NV40_VP_INST_OP_NOP 0x00 +# define NV40_VP_INST_OP_MOV 0x01 +# define NV40_VP_INST_OP_RCP 0x02 +# define NV40_VP_INST_OP_RCC 0x03 +# define NV40_VP_INST_OP_RSQ 0x04 +# define NV40_VP_INST_OP_EXP 0x05 +# define NV40_VP_INST_OP_LOG 0x06 +# define NV40_VP_INST_OP_LIT 0x07 +# define NV40_VP_INST_OP_BRA 0x09 +# define NV40_VP_INST_OP_CAL 0x0B +# define NV40_VP_INST_OP_RET 0x0C +# define NV40_VP_INST_OP_LG2 0x0D +# define NV40_VP_INST_OP_EX2 0x0E +# define NV40_VP_INST_OP_SIN 0x0F +# define NV40_VP_INST_OP_COS 0x10 +# define NV40_VP_INST_OP_PUSHA 0x13 +# define NV40_VP_INST_OP_POPA 0x14 +#define NV40_VP_INST_CONST_SRC_SHIFT 12 +#define NV40_VP_INST_CONST_SRC_MASK (0xFF << 12) +#define NV40_VP_INST_INPUT_SRC_SHIFT 8 +#define NV40_VP_INST_INPUT_SRC_MASK (0x0F << 8) +# define NV40_VP_INST_IN_POS 0 +# define NV40_VP_INST_IN_WEIGHT 1 +# define NV40_VP_INST_IN_NORMAL 2 +# define NV40_VP_INST_IN_COL0 3 +# define NV40_VP_INST_IN_COL1 4 +# define NV40_VP_INST_IN_FOGC 5 +# define NV40_VP_INST_IN_TC0 8 +# define NV40_VP_INST_IN_TC(n) (8+n) +#define NV40_VP_INST_SRC0H_SHIFT 0 +#define NV40_VP_INST_SRC0H_MASK (0xFF << 0) +#define NV40_VP_INST1_KNOWN ( \ + NV40_VP_INST_VEC_OPCODE_MASK | \ + NV40_VP_INST_SCA_OPCODE_MASK | \ + NV40_VP_INST_CONST_SRC_MASK | \ + NV40_VP_INST_INPUT_SRC_MASK | \ + NV40_VP_INST_SRC0H_MASK \ + ) + +/* ---- OPCODE BITS 63:32 / data DWORD 2 --- */ +#define NV40_VP_INST_SRC0L_SHIFT 23 +#define NV40_VP_INST_SRC0L_MASK (0x1FF << 23) +#define NV40_VP_INST_SRC1_SHIFT 6 +#define NV40_VP_INST_SRC1_MASK (0x1FFFF << 6) +#define NV40_VP_INST_SRC2H_SHIFT 0 +#define NV40_VP_INST_SRC2H_MASK (0x3F << 0) +#define NV40_VP_INST_IADDRH_SHIFT 0 +#define NV40_VP_INST_IADDRH_MASK (0x1F << 0) + +/* ---- OPCODE BITS 31:0 / data DWORD 3 --- */ +#define NV40_VP_INST_IADDRL_SHIFT 29 +#define NV40_VP_INST_IADDRL_MASK (7 << 29) +#define NV40_VP_INST_SRC2L_SHIFT 21 +#define NV40_VP_INST_SRC2L_MASK (0x7FF << 21) +#define NV40_VP_INST_SCA_WRITEMASK_SHIFT 17 +#define NV40_VP_INST_SCA_WRITEMASK_MASK (0xF << 17) +# define NV40_VP_INST_SCA_WRITEMASK_X (1 << 20) +# define NV40_VP_INST_SCA_WRITEMASK_Y (1 << 19) +# define NV40_VP_INST_SCA_WRITEMASK_Z (1 << 18) +# define NV40_VP_INST_SCA_WRITEMASK_W (1 << 17) +#define NV40_VP_INST_VEC_WRITEMASK_SHIFT 13 +#define NV40_VP_INST_VEC_WRITEMASK_MASK (0xF << 13) +# define NV40_VP_INST_VEC_WRITEMASK_X (1 << 16) +# define NV40_VP_INST_VEC_WRITEMASK_Y (1 << 15) +# define NV40_VP_INST_VEC_WRITEMASK_Z (1 << 14) +# define NV40_VP_INST_VEC_WRITEMASK_W (1 << 13) +#define NV40_VP_INST_SCA_RESULT (1 << 12) +#define NV40_VP_INST_SCA_DEST_TEMP_SHIFT 7 +#define NV40_VP_INST_SCA_DEST_TEMP_MASK (0x1F << 7) +#define NV40_VP_INST_DEST_SHIFT 2 +#define NV40_VP_INST_DEST_MASK (31 << 2) +# define NV40_VP_INST_DEST_POS 0 +# define NV40_VP_INST_DEST_COL0 1 +# define NV40_VP_INST_DEST_COL1 2 +# define NV40_VP_INST_DEST_BFC0 3 +# define NV40_VP_INST_DEST_BFC1 4 +# define NV40_VP_INST_DEST_FOGC 5 +# define NV40_VP_INST_DEST_PSZ 6 +# define NV40_VP_INST_DEST_TC0 7 +# define NV40_VP_INST_DEST_TC(n) (7+n) +# define NV40_VP_INST_DEST_TEMP 0x1F +#define NV40_VP_INST_INDEX_CONST (1 << 1) +#define NV40_VP_INST_LAST (1 << 0) +#define NV40_VP_INST3_KNOWN ( \ + NV40_VP_INST_SRC2L_MASK |\ + NV40_VP_INST_SCA_WRITEMASK_MASK |\ + NV40_VP_INST_VEC_WRITEMASK_MASK |\ + NV40_VP_INST_SCA_DEST_TEMP_MASK |\ + NV40_VP_INST_DEST_MASK |\ + NV40_VP_INST_INDEX_CONST) + +/* Useful to split the source selection regs into their pieces */ +#define NV40_VP_SRC0_HIGH_SHIFT 9 +#define NV40_VP_SRC0_HIGH_MASK 0x0001FE00 +#define NV40_VP_SRC0_LOW_MASK 0x000001FF +#define NV40_VP_SRC2_HIGH_SHIFT 11 +#define NV40_VP_SRC2_HIGH_MASK 0x0001F800 +#define NV40_VP_SRC2_LOW_MASK 0x000007FF + +/* Source selection - these are the bits you fill NV40_VP_INST_SRCn with */ +#define NV40_VP_SRC_NEGATE (1 << 16) +#define NV40_VP_SRC_SWZ_X_SHIFT 14 +#define NV40_VP_SRC_SWZ_X_MASK (3 << 14) +#define NV40_VP_SRC_SWZ_Y_SHIFT 12 +#define NV40_VP_SRC_SWZ_Y_MASK (3 << 12) +#define NV40_VP_SRC_SWZ_Z_SHIFT 10 +#define NV40_VP_SRC_SWZ_Z_MASK (3 << 10) +#define NV40_VP_SRC_SWZ_W_SHIFT 8 +#define NV40_VP_SRC_SWZ_W_MASK (3 << 8) +#define NV40_VP_SRC_SWZ_ALL_SHIFT 8 +#define NV40_VP_SRC_SWZ_ALL_MASK (0xFF << 8) +#define NV40_VP_SRC_TEMP_SRC_SHIFT 2 +#define NV40_VP_SRC_TEMP_SRC_MASK (0x1F << 2) +#define NV40_VP_SRC_REG_TYPE_SHIFT 0 +#define NV40_VP_SRC_REG_TYPE_MASK (3 << 0) +# define NV40_VP_SRC_REG_TYPE_UNK0 0 +# define NV40_VP_SRC_REG_TYPE_TEMP 1 +# define NV40_VP_SRC_REG_TYPE_INPUT 2 +# define NV40_VP_SRC_REG_TYPE_CONST 3 + + +/* + * Each fragment program opcode appears to be comprised of 4 32-bit values. + * + * 0 - Opcode, output reg/mask, ATTRIB source + * 1 - Source 0 + * 2 - Source 1 + * 3 - Source 2 + * + * There appears to be no special difference between result regs and temp regs. + * result.color == R0.xyzw + * result.depth == R1.z + * When the fragprog contains instructions to write depth, + * NV30_TCL_PRIMITIVE_3D_UNK1D78=0 otherwise it is set to 1. + * + * Constants are inserted directly after the instruction that uses them. + * + * It appears that it's not possible to use two input registers in one + * instruction as the input sourcing is done in the instruction dword + * and not the source selection dwords. As such instructions such as: + * + * ADD result.color, fragment.color, fragment.texcoord[0]; + * + * must be split into two MOV's and then an ADD (nvidia does this) but + * I'm not sure why it's not just one MOV and then source the second input + * in the ADD instruction.. + * + * Negation of the full source is done with NV30_FP_REG_NEGATE, arbitrary + * negation requires multiplication with a const. + * + * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO and + * SWIZZLE_ONE. + * + * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as + * SWIZZLE_ZERO is implemented simply by not writing to the relevant components + * of the destination. + * + * Looping + * Loops appear to be fairly expensive on NV40 at least, the proprietary + * driver goes to a lot of effort to avoid using the native looping + * instructions. If the total number of *executed* instructions between + * REP/ENDREP or LOOP/ENDLOOP is <=500, the driver will unroll the loop. + * The maximum loop count is 255. + * + * Conditional execution + * TODO + * + * Non-native instructions: + * LIT + * LRP - MAD+MAD + * SUB - ADD, negate second source + * RSQ - LG2 + EX2 + * POW - LG2 + MUL + EX2 + * SCS - COS + SIN + * XPD + * DP2 - MUL + ADD + * NRM + */ + +//== Opcode / Destination selection == +#define NV40_FP_OP_PROGRAM_END (1 << 0) +#define NV40_FP_OP_OUT_REG_SHIFT 1 +#define NV40_FP_OP_OUT_REG_MASK (63 << 1) +/* Needs to be set when writing outputs to get expected result.. */ +#define NV40_FP_OP_OUT_REG_HALF (1 << 7) +#define NV40_FP_OP_COND_WRITE_ENABLE (1 << 8) +#define NV40_FP_OP_OUTMASK_SHIFT 9 +#define NV40_FP_OP_OUTMASK_MASK (0xF << 9) +# define NV40_FP_OP_OUT_X (1 << 9) +# define NV40_FP_OP_OUT_Y (1 <<10) +# define NV40_FP_OP_OUT_Z (1 <<11) +# define NV40_FP_OP_OUT_W (1 <<12) +/* Uncertain about these, especially the input_src values.. it's possible that + * they can be dynamically changed. + */ +#define NV40_FP_OP_INPUT_SRC_SHIFT 13 +#define NV40_FP_OP_INPUT_SRC_MASK (15 << 13) +# define NV40_FP_OP_INPUT_SRC_POSITION 0x0 +# define NV40_FP_OP_INPUT_SRC_COL0 0x1 +# define NV40_FP_OP_INPUT_SRC_COL1 0x2 +# define NV40_FP_OP_INPUT_SRC_FOGC 0x3 +# define NV40_FP_OP_INPUT_SRC_TC0 0x4 +# define NV40_FP_OP_INPUT_SRC_TC(n) (0x4 + n) +# define NV40_FP_OP_INPUT_SRC_FACING 0xE +#define NV40_FP_OP_TEX_UNIT_SHIFT 17 +#define NV40_FP_OP_TEX_UNIT_MASK (0xF << 17) +#define NV40_FP_OP_PRECISION_SHIFT 22 +#define NV40_FP_OP_PRECISION_MASK (3 << 22) +# define NV40_FP_PRECISION_FP32 0 +# define NV40_FP_PRECISION_FP16 1 +# define NV40_FP_PRECISION_FX12 2 +#define NV40_FP_OP_OPCODE_SHIFT 24 +#define NV40_FP_OP_OPCODE_MASK (0x3F << 24) +# define NV40_FP_OP_OPCODE_NOP 0x00 +# define NV40_FP_OP_OPCODE_MOV 0x01 +# define NV40_FP_OP_OPCODE_MUL 0x02 +# define NV40_FP_OP_OPCODE_ADD 0x03 +# define NV40_FP_OP_OPCODE_MAD 0x04 +# define NV40_FP_OP_OPCODE_DP3 0x05 +# define NV40_FP_OP_OPCODE_DP4 0x06 +# define NV40_FP_OP_OPCODE_DST 0x07 +# define NV40_FP_OP_OPCODE_MIN 0x08 +# define NV40_FP_OP_OPCODE_MAX 0x09 +# define NV40_FP_OP_OPCODE_SLT 0x0A +# define NV40_FP_OP_OPCODE_SGE 0x0B +# define NV40_FP_OP_OPCODE_SLE 0x0C +# define NV40_FP_OP_OPCODE_SGT 0x0D +# define NV40_FP_OP_OPCODE_SNE 0x0E +# define NV40_FP_OP_OPCODE_SEQ 0x0F +# define NV40_FP_OP_OPCODE_FRC 0x10 +# define NV40_FP_OP_OPCODE_FLR 0x11 +# define NV40_FP_OP_OPCODE_KIL 0x12 +# define NV40_FP_OP_OPCODE_PK4B 0x13 +# define NV40_FP_OP_OPCODE_UP4B 0x14 +/* DDX/DDY can only write to XY */ +# define NV40_FP_OP_OPCODE_DDX 0x15 +# define NV40_FP_OP_OPCODE_DDY 0x16 +# define NV40_FP_OP_OPCODE_TEX 0x17 +# define NV40_FP_OP_OPCODE_TXP 0x18 +# define NV40_FP_OP_OPCODE_TXD 0x19 +# define NV40_FP_OP_OPCODE_RCP 0x1A +# define NV40_FP_OP_OPCODE_EX2 0x1C +# define NV40_FP_OP_OPCODE_LG2 0x1D +# define NV40_FP_OP_OPCODE_STR 0x20 +# define NV40_FP_OP_OPCODE_SFL 0x21 +# define NV40_FP_OP_OPCODE_COS 0x22 +# define NV40_FP_OP_OPCODE_SIN 0x23 +# define NV40_FP_OP_OPCODE_PK2H 0x24 +# define NV40_FP_OP_OPCODE_UP2H 0x25 +# define NV40_FP_OP_OPCODE_PK4UB 0x27 +# define NV40_FP_OP_OPCODE_UP4UB 0x28 +# define NV40_FP_OP_OPCODE_PK2US 0x29 +# define NV40_FP_OP_OPCODE_UP2US 0x2A +# define NV40_FP_OP_OPCODE_DP2A 0x2E +# define NV40_FP_OP_OPCODE_TXL 0x2F +# define NV40_FP_OP_OPCODE_TXB 0x31 +# define NV40_FP_OP_OPCODE_DIV 0x3A +# define NV40_FP_OP_OPCODE_UNK_LIT 0x3C +/* The use of these instructions appears to be indicated by bit 31 of DWORD 2.*/ +# define NV40_FP_OP_BRA_OPCODE_BRK 0x0 +# define NV40_FP_OP_BRA_OPCODE_CAL 0x1 +# define NV40_FP_OP_BRA_OPCODE_IF 0x2 +# define NV40_FP_OP_BRA_OPCODE_LOOP 0x3 +# define NV40_FP_OP_BRA_OPCODE_REP 0x4 +# define NV40_FP_OP_BRA_OPCODE_RET 0x5 +#define NV40_FP_OP_OUT_SAT (1 << 31) + +/* high order bits of SRC0 */ +#define NV40_FP_OP_OUT_ABS (1 << 29) +#define NV40_FP_OP_COND_SWZ_W_SHIFT 27 +#define NV40_FP_OP_COND_SWZ_W_MASK (3 << 27) +#define NV40_FP_OP_COND_SWZ_Z_SHIFT 25 +#define NV40_FP_OP_COND_SWZ_Z_MASK (3 << 25) +#define NV40_FP_OP_COND_SWZ_Y_SHIFT 23 +#define NV40_FP_OP_COND_SWZ_Y_MASK (3 << 23) +#define NV40_FP_OP_COND_SWZ_X_SHIFT 21 +#define NV40_FP_OP_COND_SWZ_X_MASK (3 << 21) +#define NV40_FP_OP_COND_SWZ_ALL_SHIFT 21 +#define NV40_FP_OP_COND_SWZ_ALL_MASK (0xFF << 21) +#define NV40_FP_OP_COND_SHIFT 18 +#define NV40_FP_OP_COND_MASK (0x07 << 18) +# define NV40_FP_OP_COND_FL 0 +# define NV40_FP_OP_COND_LT 1 +# define NV40_FP_OP_COND_EQ 2 +# define NV40_FP_OP_COND_LE 3 +# define NV40_FP_OP_COND_GT 4 +# define NV40_FP_OP_COND_NE 5 +# define NV40_FP_OP_COND_GE 6 +# define NV40_FP_OP_COND_TR 7 + +/* high order bits of SRC1 */ +#define NV40_FP_OP_OPCODE_IS_BRANCH (1<<31) +#define NV40_FP_OP_DST_SCALE_SHIFT 28 +#define NV40_FP_OP_DST_SCALE_MASK (3 << 28) +#define NV40_FP_OP_DST_SCALE_1X 0 +#define NV40_FP_OP_DST_SCALE_2X 1 +#define NV40_FP_OP_DST_SCALE_4X 2 +#define NV40_FP_OP_DST_SCALE_8X 3 +#define NV40_FP_OP_DST_SCALE_INV_2X 5 +#define NV40_FP_OP_DST_SCALE_INV_4X 6 +#define NV40_FP_OP_DST_SCALE_INV_8X 7 + +/* SRC1 LOOP */ +#define NV40_FP_OP_LOOP_INCR_SHIFT 19 +#define NV40_FP_OP_LOOP_INCR_MASK (0xFF << 19) +#define NV40_FP_OP_LOOP_INDEX_SHIFT 10 +#define NV40_FP_OP_LOOP_INDEX_MASK (0xFF << 10) +#define NV40_FP_OP_LOOP_COUNT_SHIFT 2 +#define NV40_FP_OP_LOOP_COUNT_MASK (0xFF << 2) + +/* SRC1 IF */ +#define NV40_FP_OP_ELSE_ID_SHIFT 2 +#define NV40_FP_OP_ELSE_ID_MASK (0xFF << 2) + +/* SRC1 CAL */ +#define NV40_FP_OP_IADDR_SHIFT 2 +#define NV40_FP_OP_IADDR_MASK (0xFF << 2) + +/* SRC1 REP + * I have no idea why there are 3 count values here.. but they + * have always been filled with the same value in my tests so + * far.. + */ +#define NV40_FP_OP_REP_COUNT1_SHIFT 2 +#define NV40_FP_OP_REP_COUNT1_MASK (0xFF << 2) +#define NV40_FP_OP_REP_COUNT2_SHIFT 10 +#define NV40_FP_OP_REP_COUNT2_MASK (0xFF << 10) +#define NV40_FP_OP_REP_COUNT3_SHIFT 19 +#define NV40_FP_OP_REP_COUNT3_MASK (0xFF << 19) + +/* SRC2 REP/IF */ +#define NV40_FP_OP_END_ID_SHIFT 2 +#define NV40_FP_OP_END_ID_MASK (0xFF << 2) + +// SRC2 high-order +#define NV40_FP_OP_INDEX_INPUT (1 << 30) +#define NV40_FP_OP_ADDR_INDEX_SHIFT 19 +#define NV40_FP_OP_ADDR_INDEX_MASK (0xF << 19) + +//== Register selection == +#define NV40_FP_REG_TYPE_SHIFT 0 +#define NV40_FP_REG_TYPE_MASK (3 << 0) +# define NV40_FP_REG_TYPE_TEMP 0 +# define NV40_FP_REG_TYPE_INPUT 1 +# define NV40_FP_REG_TYPE_CONST 2 +#define NV40_FP_REG_SRC_SHIFT 2 +#define NV40_FP_REG_SRC_MASK (63 << 2) +#define NV40_FP_REG_SRC_HALF (1 << 8) +#define NV40_FP_REG_SWZ_ALL_SHIFT 9 +#define NV40_FP_REG_SWZ_ALL_MASK (255 << 9) +#define NV40_FP_REG_SWZ_X_SHIFT 9 +#define NV40_FP_REG_SWZ_X_MASK (3 << 9) +#define NV40_FP_REG_SWZ_Y_SHIFT 11 +#define NV40_FP_REG_SWZ_Y_MASK (3 << 11) +#define NV40_FP_REG_SWZ_Z_SHIFT 13 +#define NV40_FP_REG_SWZ_Z_MASK (3 << 13) +#define NV40_FP_REG_SWZ_W_SHIFT 15 +#define NV40_FP_REG_SWZ_W_MASK (3 << 15) +# define NV40_FP_SWIZZLE_X 0 +# define NV40_FP_SWIZZLE_Y 1 +# define NV40_FP_SWIZZLE_Z 2 +# define NV40_FP_SWIZZLE_W 3 +#define NV40_FP_REG_NEGATE (1 << 17) + +#define NV40SR_NONE 0 +#define NV40SR_OUTPUT 1 +#define NV40SR_INPUT 2 +#define NV40SR_TEMP 3 +#define NV40SR_CONST 4 + +struct nv40_sreg { + int type; + int index; + + int dst_scale; + + int negate; + int abs; + int swz[4]; + + int cc_update; + int cc_update_reg; + int cc_test; + int cc_test_reg; + int cc_swz[4]; +}; + +static INLINE struct nv40_sreg +nv40_sr(int type, int index) +{ + struct nv40_sreg temp = { + .type = type, + .index = index, + .dst_scale = DEF_SCALE, + .abs = 0, + .negate = 0, + .swz = { 0, 1, 2, 3 }, + .cc_update = 0, + .cc_update_reg = 0, + .cc_test = DEF_CTEST, + .cc_test_reg = 0, + .cc_swz = { 0, 1, 2, 3 }, + }; + return temp; +} + +static INLINE struct nv40_sreg +nv40_sr_swz(struct nv40_sreg src, int x, int y, int z, int w) +{ + struct nv40_sreg dst = src; + + dst.swz[SWZ_X] = src.swz[x]; + dst.swz[SWZ_Y] = src.swz[y]; + dst.swz[SWZ_Z] = src.swz[z]; + dst.swz[SWZ_W] = src.swz[w]; + return dst; +} + +static INLINE struct nv40_sreg +nv40_sr_neg(struct nv40_sreg src) +{ + src.negate = !src.negate; + return src; +} + +static INLINE struct nv40_sreg +nv40_sr_abs(struct nv40_sreg src) +{ + src.abs = 1; + return src; +} + +static INLINE struct nv40_sreg +nv40_sr_scale(struct nv40_sreg src, int scale) +{ + src.dst_scale = scale; + return src; +} + +#endif -- cgit v1.2.3 From e1cf3f00e546f814effd25e9ccd072c941366444 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Thu, 13 Mar 2008 18:29:56 +1100 Subject: nv40: simple swtnl path (half broken, but getting there) --- src/gallium/drivers/nv40/nv40_context.c | 6 +- src/gallium/drivers/nv40/nv40_context.h | 25 +- src/gallium/drivers/nv40/nv40_draw.c | 334 +++++++++++++++++++++++-- src/gallium/drivers/nv40/nv40_fragprog.c | 3 +- src/gallium/drivers/nv40/nv40_shader.h | 2 + src/gallium/drivers/nv40/nv40_state.c | 27 +- src/gallium/drivers/nv40/nv40_state.h | 4 + src/gallium/drivers/nv40/nv40_state_clip.c | 8 +- src/gallium/drivers/nv40/nv40_state_emit.c | 130 +++++++--- src/gallium/drivers/nv40/nv40_state_viewport.c | 45 +++- src/gallium/drivers/nv40/nv40_vbo.c | 22 +- src/gallium/drivers/nv40/nv40_vertprog.c | 16 +- 12 files changed, 531 insertions(+), 91 deletions(-) (limited to 'src/gallium/drivers/nv40/nv40_shader.h') diff --git a/src/gallium/drivers/nv40/nv40_context.c b/src/gallium/drivers/nv40/nv40_context.c index 203c843a01..58627443b8 100644 --- a/src/gallium/drivers/nv40/nv40_context.c +++ b/src/gallium/drivers/nv40/nv40_context.c @@ -74,8 +74,12 @@ nv40_create(struct pipe_screen *pscreen, unsigned pctx_id) nv40_init_state_functions(nv40); nv40_init_miptree_functions(nv40); + /* Create, configure, and install fallback swtnl path */ nv40->draw = draw_create(); - assert(nv40->draw); + draw_wide_point_threshold(nv40->draw, 9999999.0); + draw_wide_line_threshold(nv40->draw, 9999999.0); + draw_enable_line_stipple(nv40->draw, FALSE); + draw_enable_point_sprites(nv40->draw, FALSE); draw_set_rasterize_stage(nv40->draw, nv40_draw_render_stage(nv40)); return &nv40->pipe; diff --git a/src/gallium/drivers/nv40/nv40_context.h b/src/gallium/drivers/nv40/nv40_context.h index 100c678187..02ca20b801 100644 --- a/src/gallium/drivers/nv40/nv40_context.h +++ b/src/gallium/drivers/nv40/nv40_context.h @@ -116,7 +116,20 @@ struct nv40_context { /* HW state derived from pipe states */ struct nv40_state state; - unsigned fallback; + struct { + struct nv40_vertex_program *vertprog; + + unsigned nr_attribs; + unsigned hw[PIPE_MAX_SHADER_INPUTS]; + unsigned draw[PIPE_MAX_SHADER_INPUTS]; + unsigned emit[PIPE_MAX_SHADER_INPUTS]; + } swtnl; + + enum { + HW, SWTNL, SWRAST + } render_mode; + unsigned fallback_swtnl; + unsigned fallback_swrast; /* Context state */ unsigned dirty; @@ -166,6 +179,10 @@ extern void nv40_screen_init_miptree_functions(struct pipe_screen *pscreen); /* nv40_draw.c */ extern struct draw_stage *nv40_draw_render_stage(struct nv40_context *nv40); +extern boolean nv40_draw_elements_swtnl(struct pipe_context *pipe, + struct pipe_buffer *idxbuf, + unsigned ib_size, unsigned mode, + unsigned start, unsigned count); /* nv40_vertprog.c */ extern void nv40_vertprog_destroy(struct nv40_context *, @@ -179,8 +196,9 @@ extern void nv40_fragprog_destroy(struct nv40_context *, extern void nv40_fragtex_bind(struct nv40_context *); /* nv40_state.c and friends */ -extern void nv40_emit_hw_state(struct nv40_context *nv40); -extern void nv40_state_tex_update(struct nv40_context *nv40); +extern boolean nv40_state_validate(struct nv40_context *nv40); +extern boolean nv40_state_validate_swtnl(struct nv40_context *nv40); +extern void nv40_state_emit(struct nv40_context *nv40); extern struct nv40_state_entry nv40_state_clip; extern struct nv40_state_entry nv40_state_rasterizer; extern struct nv40_state_entry nv40_state_scissor; @@ -194,6 +212,7 @@ extern struct nv40_state_entry nv40_state_viewport; extern struct nv40_state_entry nv40_state_framebuffer; extern struct nv40_state_entry nv40_state_fragtex; extern struct nv40_state_entry nv40_state_vbo; +extern struct nv40_state_entry nv40_state_vtxfmt; /* nv40_vbo.c */ extern boolean nv40_draw_arrays(struct pipe_context *, unsigned mode, diff --git a/src/gallium/drivers/nv40/nv40_draw.c b/src/gallium/drivers/nv40/nv40_draw.c index a39bb85e99..ce0e0bc6f2 100644 --- a/src/gallium/drivers/nv40/nv40_draw.c +++ b/src/gallium/drivers/nv40/nv40_draw.c @@ -1,62 +1,350 @@ -#include "draw/draw_private.h" #include "pipe/p_util.h" +#include "pipe/p_shader_tokens.h" + +#include "draw/draw_context.h" +#include "draw/draw_vertex.h" +#include "draw/draw_private.h" #include "nv40_context.h" +#define NV40_SHADER_NO_FUCKEDNESS +#include "nv40_shader.h" + +/* Simple, but crappy, swtnl path, hopefully we wont need to hit this very + * often at all. Uses "quadro style" vertex submission + a fixed vertex + * layout to avoid the need to generate a vertex program or vtxfmt. + */ -struct nv40_draw_stage { - struct draw_stage draw; +struct nv40_render_stage { + struct draw_stage stage; struct nv40_context *nv40; + unsigned prim; }; +static INLINE struct nv40_render_stage * +nv40_render_stage(struct draw_stage *stage) +{ + return (struct nv40_render_stage *)stage; +} + +static INLINE void +nv40_render_vertex(struct nv40_context *nv40, const struct vertex_header *v) +{ + unsigned i; + + for (i = 0; i < nv40->swtnl.nr_attribs; i++) { + unsigned idx = nv40->swtnl.draw[i]; + unsigned hw = nv40->swtnl.hw[i]; + + switch (nv40->swtnl.emit[i]) { + case EMIT_OMIT: + break; + case EMIT_1F: + BEGIN_RING(curie, 0x1e40 + (hw * 4), 1); + OUT_RING (fui(v->data[idx][0])); + break; + case EMIT_2F: + BEGIN_RING(curie, NV40TCL_VTX_ATTR_2F_X(hw), 2); + OUT_RING (fui(v->data[idx][0])); + OUT_RING (fui(v->data[idx][1])); + break; + case EMIT_3F: + BEGIN_RING(curie, NV40TCL_VTX_ATTR_3F_X(hw), 3); + OUT_RING (fui(v->data[idx][0])); + OUT_RING (fui(v->data[idx][1])); + OUT_RING (fui(v->data[idx][2])); + break; + case EMIT_4F: + BEGIN_RING(curie, NV40TCL_VTX_ATTR_4F_X(hw), 4); + OUT_RING (fui(v->data[idx][0])); + OUT_RING (fui(v->data[idx][1])); + OUT_RING (fui(v->data[idx][2])); + OUT_RING (fui(v->data[idx][3])); + break; + case EMIT_4UB: + BEGIN_RING(curie, 0x1940 + (hw * 4), 1); + OUT_RING (pack_ub4(float_to_ubyte(v->data[idx][0]), + float_to_ubyte(v->data[idx][1]), + float_to_ubyte(v->data[idx][2]), + float_to_ubyte(v->data[idx][3]))); + break; + default: + assert(0); + break; + } + } +} + +static INLINE void +nv40_render_prim(struct draw_stage *stage, struct prim_header *prim, + unsigned mode, unsigned count) +{ + struct nv40_render_stage *rs = nv40_render_stage(stage); + struct nv40_context *nv40 = rs->nv40; + struct nouveau_pushbuf *pb = nv40->nvws->channel->pushbuf; + unsigned i; + + /* Ensure there's room for 4xfloat32 + potentially 3 begin/end */ + if (pb->remaining < ((count * 20) + 6)) { + if (rs->prim != NV40TCL_BEGIN_END_STOP) { + NOUVEAU_ERR("AIII, missed flush\n"); + assert(0); + } + FIRE_RING(); + nv40_state_emit(nv40); + } + + /* Switch primitive modes if necessary */ + if (rs->prim != mode) { + if (rs->prim != NV40TCL_BEGIN_END_STOP) { + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (NV40TCL_BEGIN_END_STOP); + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (mode); + rs->prim = mode; + } + + /* Emit vertex data */ + for (i = 0; i < count; i++) + nv40_render_vertex(nv40, prim->v[i]); + + /* If it's likely we'll need to empty the push buffer soon, finish + * off the primitive now. + */ + if (pb->remaining < ((count * 20) + 6)) { + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (NV40TCL_BEGIN_END_STOP); + rs->prim = NV40TCL_BEGIN_END_STOP; + } +} + static void -nv40_draw_point(struct draw_stage *draw, struct prim_header *prim) +nv40_render_point(struct draw_stage *draw, struct prim_header *prim) { - NOUVEAU_ERR("\n"); + nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_POINTS, 1); } static void -nv40_draw_line(struct draw_stage *draw, struct prim_header *prim) +nv40_render_line(struct draw_stage *draw, struct prim_header *prim) { - NOUVEAU_ERR("\n"); + nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_LINES, 2); } static void -nv40_draw_tri(struct draw_stage *draw, struct prim_header *prim) +nv40_render_tri(struct draw_stage *draw, struct prim_header *prim) { - NOUVEAU_ERR("\n"); + nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_TRIANGLES, 3); } static void -nv40_draw_flush(struct draw_stage *draw, unsigned flags) +nv40_render_flush(struct draw_stage *draw, unsigned flags) { + struct nv40_render_stage *rs = nv40_render_stage(draw); + struct nv40_context *nv40 = rs->nv40; + + if (rs->prim != NV40TCL_BEGIN_END_STOP) { + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (NV40TCL_BEGIN_END_STOP); + rs->prim = NV40TCL_BEGIN_END_STOP; + } } static void -nv40_draw_reset_stipple_counter(struct draw_stage *draw) +nv40_render_reset_stipple_counter(struct draw_stage *draw) { - NOUVEAU_ERR("\n"); } static void -nv40_draw_destroy(struct draw_stage *draw) +nv40_render_destroy(struct draw_stage *draw) { free(draw); } +static INLINE void +emit_mov(struct nv40_vertex_program *vp, + unsigned dst, unsigned src, unsigned vor, unsigned mask) +{ + struct nv40_vertex_program_exec *inst; + + vp->insns = realloc(vp->insns, + sizeof(struct nv40_vertex_program_exec) * + ++vp->nr_insns); + inst = &vp->insns[vp->nr_insns - 1]; + + inst->data[0] = 0x401f9c6c; + inst->data[1] = 0x0040000d | (src << 8); + inst->data[2] = 0x8106c083; + inst->data[3] = 0x6041ff80 | (dst << 2) | (mask << 13); + inst->const_index = -1; + inst->has_branch_offset = FALSE; + + vp->ir |= (1 << src); + if (vor != ~0) + vp->or |= (1 << vor); +} + +static struct nv40_vertex_program * +create_drawvp(struct nv40_context *nv40) +{ + struct nv40_vertex_program *vp = CALLOC_STRUCT(nv40_vertex_program); + unsigned i; + + emit_mov(vp, NV40_VP_INST_DEST_POS, 0, ~0, 0xf); + emit_mov(vp, NV40_VP_INST_DEST_COL0, 3, 0, 0xf); + emit_mov(vp, NV40_VP_INST_DEST_COL1, 4, 1, 0xf); + emit_mov(vp, NV40_VP_INST_DEST_BFC0, 3, 2, 0xf); + emit_mov(vp, NV40_VP_INST_DEST_BFC1, 4, 3, 0xf); + emit_mov(vp, NV40_VP_INST_DEST_FOGC, 5, 4, 0x8); + for (i = 0; i < 8; i++) + emit_mov(vp, NV40_VP_INST_DEST_TC(i), 8 + i, 14 + i, 0xf); + + vp->insns[vp->nr_insns - 1].data[3] |= 1; + vp->translated = TRUE; + return vp; +} + struct draw_stage * nv40_draw_render_stage(struct nv40_context *nv40) { - struct nv40_draw_stage *nv40draw = CALLOC_STRUCT(nv40_draw_stage); + struct nv40_render_stage *render = CALLOC_STRUCT(nv40_render_stage); + + if (!nv40->swtnl.vertprog) + nv40->swtnl.vertprog = create_drawvp(nv40); + + render->nv40 = nv40; + render->stage.draw = nv40->draw; + render->stage.point = nv40_render_point; + render->stage.line = nv40_render_line; + render->stage.tri = nv40_render_tri; + render->stage.flush = nv40_render_flush; + render->stage.reset_stipple_counter = nv40_render_reset_stipple_counter; + render->stage.destroy = nv40_render_destroy; + + return &render->stage; +} + +boolean +nv40_draw_elements_swtnl(struct pipe_context *pipe, + struct pipe_buffer *idxbuf, unsigned idxbuf_size, + unsigned mode, unsigned start, unsigned count) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + unsigned i; + void *map; + + if (!nv40_state_validate_swtnl(nv40)) + return FALSE; + nv40_state_emit(nv40); - nv40draw->nv40 = nv40; - nv40draw->draw.draw = nv40->draw; - nv40draw->draw.point = nv40_draw_point; - nv40draw->draw.line = nv40_draw_line; - nv40draw->draw.tri = nv40_draw_tri; - nv40draw->draw.flush = nv40_draw_flush; - nv40draw->draw.reset_stipple_counter = nv40_draw_reset_stipple_counter; - nv40draw->draw.destroy = nv40_draw_destroy; + for (i = 0; i < PIPE_ATTRIB_MAX; i++) { + if (!nv40->vtxbuf[i].buffer) + continue; + map = ws->buffer_map(ws, nv40->vtxbuf[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_vertex_buffer(nv40->draw, i, map); + } - return &nv40draw->draw; + if (idxbuf) { + map = ws->buffer_map(ws, idxbuf, PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_element_buffer(nv40->draw, idxbuf_size, map); + } else { + draw_set_mapped_element_buffer(nv40->draw, 0, NULL); + } + + if (nv40->constbuf[PIPE_SHADER_VERTEX]) { + map = ws->buffer_map(ws, nv40->constbuf[PIPE_SHADER_VERTEX], + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_constant_buffer(nv40->draw, map); + } + + draw_arrays(nv40->draw, mode, start, count); + + for (i = 0; i < PIPE_ATTRIB_MAX; i++) { + if (!nv40->vtxbuf[i].buffer) + continue; + ws->buffer_unmap(ws, nv40->vtxbuf[i].buffer); + } + + if (idxbuf) + ws->buffer_unmap(ws, idxbuf); + + if (nv40->constbuf[PIPE_SHADER_VERTEX]) + ws->buffer_unmap(ws, nv40->constbuf[PIPE_SHADER_VERTEX]); + + draw_flush(nv40->draw); + pipe->flush(pipe, 0); + + return TRUE; } +static INLINE void +emit_attrib(struct nv40_context *nv40, unsigned hw, unsigned emit, + unsigned semantic, unsigned index) +{ + unsigned draw_out = draw_find_vs_output(nv40->draw, semantic, index); + unsigned a = nv40->swtnl.nr_attribs++; + + nv40->swtnl.hw[a] = hw; + nv40->swtnl.emit[a] = emit; + nv40->swtnl.draw[a] = draw_out; +} + +static boolean +nv40_state_vtxfmt_validate(struct nv40_context *nv40) +{ + struct nv40_fragment_program *fp = nv40->fragprog; + unsigned colour = 0, texcoords = 0, fog = 0, i; + + /* Determine needed fragprog inputs */ + for (i = 0; i < fp->info.num_inputs; i++) { + switch (fp->info.input_semantic_name[i]) { + case TGSI_SEMANTIC_POSITION: + break; + case TGSI_SEMANTIC_COLOR: + colour |= (1 << fp->info.input_semantic_index[i]); + break; + case TGSI_SEMANTIC_GENERIC: + texcoords |= (1 << fp->info.input_semantic_index[i]); + break; + case TGSI_SEMANTIC_FOG: + fog = 1; + break; + default: + assert(0); + } + } + + nv40->swtnl.nr_attribs = 0; + + /* Map draw vtxprog output to hw attribute IDs */ + for (i = 0; i < 2; i++) { + if (!(colour & (1 << i))) + continue; + emit_attrib(nv40, 3 + i, EMIT_4UB, TGSI_SEMANTIC_COLOR, i); + } + + for (i = 0; i < 8; i++) { + if (!(texcoords & (1 << i))) + continue; + emit_attrib(nv40, 8 + i, EMIT_4F, TGSI_SEMANTIC_GENERIC, i); + } + + if (fog) { + emit_attrib(nv40, 5, EMIT_1F, TGSI_SEMANTIC_FOG, 0); + } + + emit_attrib(nv40, 0, EMIT_4F, TGSI_SEMANTIC_POSITION, 0); + + return FALSE; +} + +struct nv40_state_entry nv40_state_vtxfmt = { + .validate = nv40_state_vtxfmt_validate, + .dirty = { + .pipe = NV40_NEW_ARRAYS | NV40_NEW_FRAGPROG, + .hw = 0 + } +}; + diff --git a/src/gallium/drivers/nv40/nv40_fragprog.c b/src/gallium/drivers/nv40/nv40_fragprog.c index 953f9cd908..82dbcd3eef 100644 --- a/src/gallium/drivers/nv40/nv40_fragprog.c +++ b/src/gallium/drivers/nv40/nv40_fragprog.c @@ -797,9 +797,10 @@ nv40_fragprog_validate(struct nv40_context *nv40) if (fp->translated) goto update_constants; + nv40->fallback_swrast &= ~NV40_NEW_FRAGPROG; nv40_fragprog_translate(nv40, fp); if (!fp->translated) { - nv40->fallback |= NV40_FALLBACK_RAST; + nv40->fallback_swrast |= NV40_NEW_FRAGPROG; return FALSE; } diff --git a/src/gallium/drivers/nv40/nv40_shader.h b/src/gallium/drivers/nv40/nv40_shader.h index 5909c70713..854dccf548 100644 --- a/src/gallium/drivers/nv40/nv40_shader.h +++ b/src/gallium/drivers/nv40/nv40_shader.h @@ -476,6 +476,7 @@ # define NV40_FP_SWIZZLE_W 3 #define NV40_FP_REG_NEGATE (1 << 17) +#ifndef NV40_SHADER_NO_FUCKEDNESS #define NV40SR_NONE 0 #define NV40SR_OUTPUT 1 #define NV40SR_INPUT 2 @@ -550,5 +551,6 @@ nv40_sr_scale(struct nv40_sreg src, int scale) src.dst_scale = scale; return src; } +#endif #endif diff --git a/src/gallium/drivers/nv40/nv40_state.c b/src/gallium/drivers/nv40/nv40_state.c index 321d5de041..3eafbece30 100644 --- a/src/gallium/drivers/nv40/nv40_state.c +++ b/src/gallium/drivers/nv40/nv40_state.c @@ -3,6 +3,8 @@ #include "pipe/p_util.h" #include "pipe/p_inlines.h" +#include "draw/draw_context.h" + #include "nv40_context.h" #include "nv40_state.h" @@ -345,7 +347,7 @@ nv40_rasterizer_state_create(struct pipe_context *pipe, so_data(so, NV40TCL_CULL_FACE_FRONT_AND_BACK); break; default: - so_data(so, 0); + so_data(so, NV40TCL_CULL_FACE_BACK); break; } so_data(so, NV40TCL_FRONT_FACE_CCW); @@ -363,13 +365,13 @@ nv40_rasterizer_state_create(struct pipe_context *pipe, so_data(so, NV40TCL_CULL_FACE_FRONT_AND_BACK); break; default: - so_data(so, 0); + so_data(so, NV40TCL_CULL_FACE_BACK); break; } so_data(so, NV40TCL_FRONT_FACE_CW); } so_data(so, cso->poly_smooth ? 1 : 0); - so_data(so, cso->cull_mode != PIPE_WINDING_NONE ? 1 : 0); + so_data(so, (cso->cull_mode != PIPE_WINDING_NONE) ? 1 : 0); so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1); so_data (so, cso->poly_stipple_enable ? 1 : 0); @@ -419,6 +421,9 @@ static void nv40_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso) { struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_rasterizer_state *rsso = hwcso; + + draw_set_rasterizer_state(nv40->draw, &rsso->pipe); nv40->rasterizer = hwcso; nv40->dirty |= NV40_NEW_RAST; @@ -508,10 +513,12 @@ static void * nv40_vp_state_create(struct pipe_context *pipe, const struct pipe_shader_state *cso) { + struct nv40_context *nv40 = nv40_context(pipe); struct nv40_vertex_program *vp; vp = CALLOC(1, sizeof(struct nv40_vertex_program)); vp->pipe = *cso; + vp->draw = draw_create_vertex_shader(nv40->draw, &vp->pipe); return (void *)vp; } @@ -520,6 +527,9 @@ static void nv40_vp_state_bind(struct pipe_context *pipe, void *hwcso) { struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_vertex_program *vp = hwcso; + + draw_bind_vertex_shader(nv40->draw, vp ? vp->draw : NULL); nv40->vertprog = hwcso; nv40->dirty |= NV40_NEW_VERTPROG; @@ -531,6 +541,7 @@ nv40_vp_state_delete(struct pipe_context *pipe, void *hwcso) struct nv40_context *nv40 = nv40_context(pipe); struct nv40_vertex_program *vp = hwcso; + draw_delete_vertex_shader(nv40->draw, vp->draw); nv40_vertprog_destroy(nv40, vp); FREE(vp); } @@ -544,6 +555,8 @@ nv40_fp_state_create(struct pipe_context *pipe, fp = CALLOC(1, sizeof(struct nv40_fragment_program)); fp->pipe = *cso; + tgsi_scan_shader(fp->pipe.tokens, &fp->info); + return (void *)fp; } @@ -582,6 +595,8 @@ nv40_set_clip_state(struct pipe_context *pipe, { struct nv40_context *nv40 = nv40_context(pipe); + draw_set_clip_state(nv40->draw, clip); + nv40->clip = *clip; nv40->dirty |= NV40_NEW_UCP; } @@ -638,6 +653,8 @@ nv40_set_viewport_state(struct pipe_context *pipe, { struct nv40_context *nv40 = nv40_context(pipe); + draw_set_viewport_state(nv40->draw, vpt); + nv40->viewport = *vpt; nv40->dirty |= NV40_NEW_VIEWPORT; } @@ -648,6 +665,8 @@ nv40_set_vertex_buffer(struct pipe_context *pipe, unsigned index, { struct nv40_context *nv40 = nv40_context(pipe); + draw_set_vertex_buffer(nv40->draw, index, vb); + nv40->vtxbuf[index] = *vb; nv40->dirty |= NV40_NEW_ARRAYS; } @@ -658,6 +677,8 @@ nv40_set_vertex_element(struct pipe_context *pipe, unsigned index, { struct nv40_context *nv40 = nv40_context(pipe); + draw_set_vertex_element(nv40->draw, index, ve); + nv40->vtxelt[index] = *ve; nv40->dirty |= NV40_NEW_ARRAYS; } diff --git a/src/gallium/drivers/nv40/nv40_state.h b/src/gallium/drivers/nv40/nv40_state.h index a02ea0c878..ab2866eb7a 100644 --- a/src/gallium/drivers/nv40/nv40_state.h +++ b/src/gallium/drivers/nv40/nv40_state.h @@ -2,6 +2,7 @@ #define __NV40_STATE_H__ #include "pipe/p_state.h" +#include "tgsi/util/tgsi_scan.h" struct nv40_sampler_state { uint32_t fmt; @@ -25,6 +26,8 @@ struct nv40_vertex_program_data { struct nv40_vertex_program { struct pipe_shader_state pipe; + struct draw_vertex_shader *draw; + boolean translated; struct nv40_vertex_program_exec *insns; unsigned nr_insns; @@ -49,6 +52,7 @@ struct nv40_fragment_program_data { struct nv40_fragment_program { struct pipe_shader_state pipe; + struct tgsi_shader_info info; boolean translated; unsigned samplers; diff --git a/src/gallium/drivers/nv40/nv40_state_clip.c b/src/gallium/drivers/nv40/nv40_state_clip.c index 93e690161f..c52390f9ed 100644 --- a/src/gallium/drivers/nv40/nv40_state_clip.c +++ b/src/gallium/drivers/nv40/nv40_state_clip.c @@ -3,8 +3,12 @@ static boolean nv40_state_clip_validate(struct nv40_context *nv40) { - if (nv40->clip.nr) - nv40->fallback |= NV40_FALLBACK_TNL; + + if (nv40->render_mode == HW) { + nv40->fallback_swtnl &= ~NV40_NEW_UCP; + if (nv40->clip.nr) + nv40->fallback_swtnl |= NV40_NEW_UCP; + } return FALSE; } diff --git a/src/gallium/drivers/nv40/nv40_state_emit.c b/src/gallium/drivers/nv40/nv40_state_emit.c index 9f268640e0..056238cc83 100644 --- a/src/gallium/drivers/nv40/nv40_state_emit.c +++ b/src/gallium/drivers/nv40/nv40_state_emit.c @@ -1,5 +1,6 @@ #include "nv40_context.h" #include "nv40_state.h" +#include "draw/draw_context.h" static struct nv40_state_entry *render_states[] = { &nv40_state_framebuffer, @@ -18,15 +19,27 @@ static struct nv40_state_entry *render_states[] = { NULL }; +static struct nv40_state_entry *swtnl_states[] = { + &nv40_state_framebuffer, + &nv40_state_rasterizer, + &nv40_state_clip, + &nv40_state_scissor, + &nv40_state_stipple, + &nv40_state_fragprog, + &nv40_state_fragtex, + &nv40_state_vertprog, + &nv40_state_blend, + &nv40_state_blend_colour, + &nv40_state_zsa, + &nv40_state_viewport, + &nv40_state_vtxfmt, + NULL +}; + static void -nv40_state_validate(struct nv40_context *nv40) +nv40_state_do_validate(struct nv40_context *nv40, + struct nv40_state_entry **states) { - struct nv40_state_entry **states = render_states; - unsigned last_fallback; - - last_fallback = nv40->fallback; - nv40->fallback = 0; - while (*states) { struct nv40_state_entry *e = *states; @@ -38,32 +51,15 @@ nv40_state_validate(struct nv40_context *nv40) states++; } nv40->dirty = 0; - - if (nv40->fallback & NV40_FALLBACK_TNL && - !(last_fallback & NV40_FALLBACK_TNL)) { - NOUVEAU_ERR("XXX: hwtnl->swtnl\n"); - } else - if (last_fallback & NV40_FALLBACK_TNL && - !(nv40->fallback & NV40_FALLBACK_TNL)) { - NOUVEAU_ERR("XXX: swtnl->hwtnl\n"); - } - - if (nv40->fallback & NV40_FALLBACK_RAST && - !(last_fallback & NV40_FALLBACK_RAST)) { - NOUVEAU_ERR("XXX: hwrast->swrast\n"); - } else - if (last_fallback & NV40_FALLBACK_RAST && - !(nv40->fallback & NV40_FALLBACK_RAST)) { - NOUVEAU_ERR("XXX: swrast->hwrast\n"); - } } -static void +void nv40_state_emit(struct nv40_context *nv40) { struct nv40_state *state = &nv40->state; struct nv40_screen *screen = nv40->screen; unsigned i, samplers; + uint64 states; if (nv40->pctx_id != screen->cur_pctx) { for (i = 0; i < NV40_STATE_MAX; i++) { @@ -74,14 +70,24 @@ nv40_state_emit(struct nv40_context *nv40) screen->cur_pctx = nv40->pctx_id; } - while (state->dirty) { - unsigned idx = ffsll(state->dirty) - 1; + for (i = 0, states = state->dirty; states; i++) { + if (!(states & (1ULL << i))) + continue; + so_ref (state->hw[i], &nv40->screen->state[i]); + so_emit(nv40->nvws, nv40->screen->state[i]); + states &= ~(1ULL << i); + } - so_ref (state->hw[idx], &nv40->screen->state[idx]); - so_emit(nv40->nvws, nv40->screen->state[idx]); - state->dirty &= ~(1ULL << idx); + if (state->dirty & ((1ULL << NV40_STATE_FRAGPROG) | + (1ULL << NV40_STATE_FRAGTEX0))) { + BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1); + OUT_RING (2); + BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1); + OUT_RING (1); } + state->dirty = 0; + so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_FB]); for (i = 0, samplers = state->fp_samplers; i < 16 && samplers; i++) { if (!(samplers & (1 << i))) @@ -91,18 +97,62 @@ nv40_state_emit(struct nv40_context *nv40) samplers &= ~(1ULL << i); } so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_FRAGPROG]); - so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_VTXBUF]); + if (state->hw[NV40_STATE_VTXBUF] && nv40->render_mode == HW) + so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_VTXBUF]); } -void -nv40_emit_hw_state(struct nv40_context *nv40) +boolean +nv40_state_validate(struct nv40_context *nv40) { - nv40_state_validate(nv40); - nv40_state_emit(nv40); + boolean was_sw = nv40->fallback_swtnl ? TRUE : FALSE; + + if (nv40->render_mode != HW) { + /* Don't even bother trying to go back to hw if none + * of the states that caused swtnl previously have changed. + */ + if ((nv40->fallback_swtnl & nv40->dirty) + != nv40->fallback_swtnl) + return FALSE; + + /* Attempt to go to hwtnl again */ + nv40->pipe.flush(&nv40->pipe, 0); + nv40->dirty |= (NV40_NEW_VIEWPORT | + NV40_NEW_VERTPROG | + NV40_NEW_ARRAYS | + NV40_NEW_UCP); + nv40->render_mode = HW; + } + + nv40_state_do_validate(nv40, render_states); + if (nv40->fallback_swtnl || nv40->fallback_swrast) + return FALSE; + + if (was_sw) + NOUVEAU_ERR("swtnl->hw\n"); + + return TRUE; +} + +boolean +nv40_state_validate_swtnl(struct nv40_context *nv40) +{ + /* Setup for swtnl */ + if (nv40->render_mode == HW) { + NOUVEAU_ERR("hw->swtnl 0x%08x\n", nv40->fallback_swtnl); + nv40->pipe.flush(&nv40->pipe, 0); + nv40->dirty |= (NV40_NEW_VIEWPORT | + NV40_NEW_VERTPROG | + NV40_NEW_ARRAYS | + NV40_NEW_UCP); + nv40->render_mode = SWTNL; + } + + nv40_state_do_validate(nv40, swtnl_states); + if (nv40->fallback_swrast) { + NOUVEAU_ERR("swtnl->swrast 0x%08x\n", nv40->fallback_swrast); + return FALSE; + } - BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1); - OUT_RING (2); - BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1); - OUT_RING (1); + return TRUE; } diff --git a/src/gallium/drivers/nv40/nv40_state_viewport.c b/src/gallium/drivers/nv40/nv40_state_viewport.c index 3a32533907..9e5c7a72a7 100644 --- a/src/gallium/drivers/nv40/nv40_state_viewport.c +++ b/src/gallium/drivers/nv40/nv40_state_viewport.c @@ -3,18 +3,43 @@ static boolean nv40_state_viewport_validate(struct nv40_context *nv40) { - struct nouveau_stateobj *so = so_new(9, 0); + struct nouveau_stateobj *so = so_new(11, 0); struct pipe_viewport_state *vpt = &nv40->viewport; - so_method(so, nv40->screen->curie, NV40TCL_VIEWPORT_TRANSLATE_X, 8); - so_data (so, fui(vpt->translate[0])); - so_data (so, fui(vpt->translate[1])); - so_data (so, fui(vpt->translate[2])); - so_data (so, fui(vpt->translate[3])); - so_data (so, fui(vpt->scale[0])); - so_data (so, fui(vpt->scale[1])); - so_data (so, fui(vpt->scale[2])); - so_data (so, fui(vpt->scale[3])); + if (nv40->render_mode == HW) { + so_method(so, nv40->screen->curie, + NV40TCL_VIEWPORT_TRANSLATE_X, 8); + so_data (so, fui(vpt->translate[0])); + so_data (so, fui(vpt->translate[1])); + so_data (so, fui(vpt->translate[2])); + so_data (so, fui(vpt->translate[3])); + so_data (so, fui(vpt->scale[0])); + so_data (so, fui(vpt->scale[1])); + so_data (so, fui(vpt->scale[2])); + so_data (so, fui(vpt->scale[3])); + so_method(so, nv40->screen->curie, 0x1d78, 1); + so_data (so, 1); + } else { + so_method(so, nv40->screen->curie, + NV40TCL_VIEWPORT_TRANSLATE_X, 8); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(1.0)); + so_data (so, fui(1.0)); + so_data (so, fui(1.0)); + so_data (so, fui(0.0)); + /* Not entirely certain what this is yet. The DDX uses this + * value also as it fixes rendering when you pass + * pre-transformed vertices to the GPU. My best gusss is that + * this bypasses some culling/clipping stage. Might be worth + * noting that points/lines are uneffected by whatever this + * value fixes, only filled polygons are effected. + */ + so_method(so, nv40->screen->curie, 0x1d78, 1); + so_data (so, 0x110); + } so_ref(so, &nv40->state.hw[NV40_STATE_VIEWPORT]); return TRUE; diff --git a/src/gallium/drivers/nv40/nv40_vbo.c b/src/gallium/drivers/nv40/nv40_vbo.c index f16afc23b8..fad423fdf8 100644 --- a/src/gallium/drivers/nv40/nv40_vbo.c +++ b/src/gallium/drivers/nv40/nv40_vbo.c @@ -8,6 +8,8 @@ #include "nouveau/nouveau_channel.h" #include "nouveau/nouveau_pushbuf.h" +#define FORCE_SWTNL 0 + static INLINE int nv40_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp) { @@ -165,7 +167,11 @@ nv40_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start, unsigned nr; nv40_vbo_set_idxbuf(nv40, NULL, 0); - nv40_emit_hw_state(nv40); + if (FORCE_SWTNL || !nv40_state_validate(nv40)) { + return nv40_draw_elements_swtnl(pipe, NULL, 0, + mode, start, count); + } + nv40_state_emit(nv40); BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); OUT_RING (nvgl_primitive(mode)); @@ -274,7 +280,7 @@ nv40_draw_elements_inline(struct pipe_context *pipe, struct pipe_winsys *ws = pipe->winsys; void *map; - nv40_emit_hw_state(nv40); + nv40_state_emit(nv40); map = ws->buffer_map(ws, ib, PIPE_BUFFER_USAGE_CPU_READ); if (!ib) { @@ -315,7 +321,7 @@ nv40_draw_elements_vbo(struct pipe_context *pipe, struct nv40_context *nv40 = nv40_context(pipe); unsigned nr; - nv40_emit_hw_state(nv40); + nv40_state_emit(nv40); BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); OUT_RING (nvgl_primitive(mode)); @@ -352,8 +358,16 @@ nv40_draw_elements(struct pipe_context *pipe, unsigned mode, unsigned start, unsigned count) { struct nv40_context *nv40 = nv40_context(pipe); + boolean idxbuf; + + idxbuf = nv40_vbo_set_idxbuf(nv40, indexBuffer, indexSize); + if (FORCE_SWTNL || !nv40_state_validate(nv40)) { + return nv40_draw_elements_swtnl(pipe, NULL, 0, + mode, start, count); + } + nv40_state_emit(nv40); - if (nv40_vbo_set_idxbuf(nv40, indexBuffer, indexSize)) { + if (idxbuf) { nv40_draw_elements_vbo(pipe, mode, start, count); } else { nv40_draw_elements_inline(pipe, indexBuffer, indexSize, diff --git a/src/gallium/drivers/nv40/nv40_vertprog.c b/src/gallium/drivers/nv40/nv40_vertprog.c index 3d730c1a32..9f1ee575ce 100644 --- a/src/gallium/drivers/nv40/nv40_vertprog.c +++ b/src/gallium/drivers/nv40/nv40_vertprog.c @@ -634,21 +634,29 @@ out_err: static boolean nv40_vertprog_validate(struct nv40_context *nv40) { - struct nv40_vertex_program *vp = nv40->vertprog; - struct pipe_buffer *constbuf = - nv40->constbuf[PIPE_SHADER_VERTEX]; struct nouveau_winsys *nvws = nv40->nvws; struct pipe_winsys *ws = nv40->pipe.winsys; + struct nv40_vertex_program *vp; + struct pipe_buffer *constbuf; boolean upload_code = FALSE, upload_data = FALSE; int i; + if (nv40->render_mode == HW) { + vp = nv40->vertprog; + constbuf = nv40->constbuf[PIPE_SHADER_VERTEX]; + } else { + vp = nv40->swtnl.vertprog; + constbuf = NULL; + } + /* Translate TGSI shader into hw bytecode */ if (vp->translated) goto check_gpu_resources; + nv40->fallback_swtnl &= ~NV40_NEW_VERTPROG; nv40_vertprog_translate(nv40, vp); if (!vp->translated) { - nv40->fallback |= NV40_FALLBACK_TNL; + nv40->fallback_swtnl |= NV40_NEW_VERTPROG; return FALSE; } -- cgit v1.2.3