summaryrefslogtreecommitdiff
path: root/src/gallium/drivers/nv50/nv50_program.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/nv50/nv50_program.c')
-rw-r--r--src/gallium/drivers/nv50/nv50_program.c2482
1 files changed, 2482 insertions, 0 deletions
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
new file mode 100644
index 0000000000..4a838529de
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -0,0 +1,2482 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_inlines.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nv50_context.h"
+
+#define NV50_SU_MAX_TEMP 64
+//#define NV50_PROGRAM_DUMP
+
+/* ARL - gallium craps itself on progs/vp/arl.txt
+ *
+ * MSB - Like MAD, but MUL+SUB
+ * - Fuck it off, introduce a way to negate args for ops that
+ * support it.
+ *
+ * Look into inlining IMMD for ops other than MOV (make it general?)
+ * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
+ * but can emit to P_TEMP first - then MOV later. NVIDIA does this
+ *
+ * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
+ * case, if the emit_src() causes the inst to suddenly become long.
+ *
+ * Verify half-insns work where expected - and force disable them where they
+ * don't work - MUL has it forcibly disabled atm as it fixes POW..
+ *
+ * FUCK! watch dst==src vectors, can overwrite components that are needed.
+ * ie. SUB R0, R0.yzxw, R0
+ *
+ * Things to check with renouveau:
+ * FP attr/result assignment - how?
+ * attrib
+ * - 0x16bc maps vp output onto fp hpos
+ * - 0x16c0 maps vp output onto fp col0
+ * result
+ * - colr always 0-3
+ * - depr always 4
+ * 0x16bc->0x16e8 --> some binding between vp/fp regs
+ * 0x16b8 --> VP output count
+ *
+ * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
+ * "MOV rcol.x, fcol.y" = 0x00000004
+ * 0x19a8 --> as above but 0x00000100 and 0x00000000
+ * - 0x00100000 used when KIL used
+ * 0x196c --> as above but 0x00000011 and 0x00000000
+ *
+ * 0x1988 --> 0xXXNNNNNN
+ * - XX == FP high something
+ */
+struct nv50_reg {
+ enum {
+ P_TEMP,
+ P_ATTR,
+ P_RESULT,
+ P_CONST,
+ P_IMMD
+ } type;
+ int index;
+
+ int hw;
+ int neg;
+
+ int rhw; /* result hw for FP outputs, or interpolant index */
+ int acc; /* instruction where this reg is last read (first insn == 1) */
+};
+
+struct nv50_pc {
+ struct nv50_program *p;
+
+ /* hw resources */
+ struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
+
+ /* tgsi resources */
+ struct nv50_reg *temp;
+ int temp_nr;
+ struct nv50_reg *attr;
+ int attr_nr;
+ struct nv50_reg *result;
+ int result_nr;
+ struct nv50_reg *param;
+ int param_nr;
+ struct nv50_reg *immd;
+ float *immd_buf;
+ int immd_nr;
+
+ struct nv50_reg *temp_temp[16];
+ unsigned temp_temp_nr;
+
+ unsigned interp_mode[32];
+ /* perspective interpolation registers */
+ struct nv50_reg *iv_p;
+ struct nv50_reg *iv_c;
+
+ /* current instruction and total number of insns */
+ unsigned insn_cur;
+ unsigned insn_nr;
+
+ boolean allow32;
+};
+
+static void
+alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
+{
+ int i = 0;
+
+ if (reg->type == P_RESULT) {
+ if (pc->p->cfg.high_result < (reg->hw + 1))
+ pc->p->cfg.high_result = reg->hw + 1;
+ }
+
+ if (reg->type != P_TEMP)
+ return;
+
+ if (reg->hw >= 0) {
+ /*XXX: do this here too to catch FP temp-as-attr usage..
+ * not clean, but works */
+ if (pc->p->cfg.high_temp < (reg->hw + 1))
+ pc->p->cfg.high_temp = reg->hw + 1;
+ return;
+ }
+
+ if (reg->rhw != -1) {
+ /* try to allocate temporary with index rhw first */
+ if (!(pc->r_temp[reg->rhw])) {
+ pc->r_temp[reg->rhw] = reg;
+ reg->hw = reg->rhw;
+ if (pc->p->cfg.high_temp < (reg->rhw + 1))
+ pc->p->cfg.high_temp = reg->rhw + 1;
+ return;
+ }
+ /* make sure we don't get things like $r0 needs to go
+ * in $r1 and $r1 in $r0
+ */
+ i = pc->result_nr * 4;
+ }
+
+ for (; i < NV50_SU_MAX_TEMP; i++) {
+ if (!(pc->r_temp[i])) {
+ pc->r_temp[i] = reg;
+ reg->hw = i;
+ if (pc->p->cfg.high_temp < (i + 1))
+ pc->p->cfg.high_temp = i + 1;
+ return;
+ }
+ }
+
+ assert(0);
+}
+
+static struct nv50_reg *
+alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
+{
+ struct nv50_reg *r;
+ int i;
+
+ if (dst && dst->type == P_TEMP && dst->hw == -1)
+ return dst;
+
+ for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
+ if (!pc->r_temp[i]) {
+ r = CALLOC_STRUCT(nv50_reg);
+ r->type = P_TEMP;
+ r->index = -1;
+ r->hw = i;
+ r->rhw = -1;
+ pc->r_temp[i] = r;
+ return r;
+ }
+ }
+
+ assert(0);
+ return NULL;
+}
+
+/* Assign the hw of the discarded temporary register src
+ * to the tgsi register dst and free src.
+ */
+static void
+assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+ assert(src->index == -1 && src->hw != -1);
+
+ if (dst->hw != -1)
+ pc->r_temp[dst->hw] = NULL;
+ pc->r_temp[src->hw] = dst;
+ dst->hw = src->hw;
+
+ FREE(src);
+}
+
+/* release the hardware resource held by r */
+static void
+release_hw(struct nv50_pc *pc, struct nv50_reg *r)
+{
+ assert(r->type == P_TEMP);
+ if (r->hw == -1)
+ return;
+
+ assert(pc->r_temp[r->hw] == r);
+ pc->r_temp[r->hw] = NULL;
+
+ r->acc = 0;
+ if (r->index == -1)
+ FREE(r);
+}
+
+static void
+free_temp(struct nv50_pc *pc, struct nv50_reg *r)
+{
+ if (r->index == -1) {
+ unsigned hw = r->hw;
+
+ FREE(pc->r_temp[hw]);
+ pc->r_temp[hw] = NULL;
+ }
+}
+
+static int
+alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
+{
+ int i;
+
+ if ((idx + 4) >= NV50_SU_MAX_TEMP)
+ return 1;
+
+ if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
+ pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
+ return alloc_temp4(pc, dst, idx + 4);
+
+ for (i = 0; i < 4; i++) {
+ dst[i] = CALLOC_STRUCT(nv50_reg);
+ dst[i]->type = P_TEMP;
+ dst[i]->index = -1;
+ dst[i]->hw = idx + i;
+ pc->r_temp[idx + i] = dst[i];
+ }
+
+ return 0;
+}
+
+static void
+free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ free_temp(pc, reg[i]);
+}
+
+static struct nv50_reg *
+temp_temp(struct nv50_pc *pc)
+{
+ if (pc->temp_temp_nr >= 16)
+ assert(0);
+
+ pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
+ return pc->temp_temp[pc->temp_temp_nr++];
+}
+
+static void
+kill_temp_temp(struct nv50_pc *pc)
+{
+ int i;
+
+ for (i = 0; i < pc->temp_temp_nr; i++)
+ free_temp(pc, pc->temp_temp[i]);
+ pc->temp_temp_nr = 0;
+}
+
+static int
+ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
+{
+ pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
+ (pc->immd_nr + 1) * 4 * sizeof(float));
+ pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
+ pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
+ pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
+ pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
+
+ return pc->immd_nr++;
+}
+
+static struct nv50_reg *
+alloc_immd(struct nv50_pc *pc, float f)
+{
+ struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
+ unsigned hw;
+
+ for (hw = 0; hw < pc->immd_nr * 4; hw++)
+ if (pc->immd_buf[hw] == f)
+ break;
+
+ if (hw == pc->immd_nr * 4)
+ hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
+
+ r->type = P_IMMD;
+ r->hw = hw;
+ r->index = -1;
+ return r;
+}
+
+static struct nv50_program_exec *
+exec(struct nv50_pc *pc)
+{
+ struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
+
+ e->param.index = -1;
+ return e;
+}
+
+static void
+emit(struct nv50_pc *pc, struct nv50_program_exec *e)
+{
+ struct nv50_program *p = pc->p;
+
+ if (p->exec_tail)
+ p->exec_tail->next = e;
+ if (!p->exec_head)
+ p->exec_head = e;
+ p->exec_tail = e;
+ p->exec_size += (e->inst[0] & 1) ? 2 : 1;
+}
+
+static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
+
+static boolean
+is_long(struct nv50_program_exec *e)
+{
+ if (e->inst[0] & 1)
+ return TRUE;
+ return FALSE;
+}
+
+static boolean
+is_immd(struct nv50_program_exec *e)
+{
+ if (is_long(e) && (e->inst[1] & 3) == 3)
+ return TRUE;
+ return FALSE;
+}
+
+static INLINE void
+set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
+ struct nv50_program_exec *e)
+{
+ set_long(pc, e);
+ e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
+ e->inst[1] |= (pred << 7) | (idx << 12);
+}
+
+static INLINE void
+set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
+ struct nv50_program_exec *e)
+{
+ set_long(pc, e);
+ e->inst[1] &= ~((0x3 << 4) | (1 << 6));
+ e->inst[1] |= (idx << 4) | (on << 6);
+}
+
+static INLINE void
+set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
+{
+ if (is_long(e))
+ return;
+
+ e->inst[0] |= 1;
+ set_pred(pc, 0xf, 0, e);
+ set_pred_wr(pc, 0, 0, e);
+}
+
+static INLINE void
+set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
+{
+ if (dst->type == P_RESULT) {
+ set_long(pc, e);
+ e->inst[1] |= 0x00000008;
+ }
+
+ alloc_reg(pc, dst);
+ e->inst[0] |= (dst->hw << 2);
+}
+
+static INLINE void
+set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
+{
+ float f = pc->immd_buf[imm->hw];
+ unsigned val = fui(imm->neg ? -f : f);
+
+ set_long(pc, e);
+ /*XXX: can't be predicated - bits overlap.. catch cases where both
+ * are required and avoid them. */
+ set_pred(pc, 0, 0, e);
+ set_pred_wr(pc, 0, 0, e);
+
+ e->inst[1] |= 0x00000002 | 0x00000001;
+ e->inst[0] |= (val & 0x3f) << 16;
+ e->inst[1] |= (val >> 6) << 2;
+}
+
+
+#define INTERP_LINEAR 0
+#define INTERP_FLAT 1
+#define INTERP_PERSPECTIVE 2
+#define INTERP_CENTROID 4
+
+/* interpolant index has been stored in dst->rhw */
+static void
+emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
+ unsigned mode)
+{
+ assert(dst->rhw != -1);
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] |= 0x80000000;
+ set_dst(pc, dst, e);
+ e->inst[0] |= (dst->rhw << 16);
+
+ if (mode & INTERP_FLAT) {
+ e->inst[0] |= (1 << 8);
+ } else {
+ if (mode & INTERP_PERSPECTIVE) {
+ e->inst[0] |= (1 << 25);
+ alloc_reg(pc, iv);
+ e->inst[0] |= (iv->hw << 9);
+ }
+
+ if (mode & INTERP_CENTROID)
+ e->inst[0] |= (1 << 24);
+ }
+
+ emit(pc, e);
+}
+
+static void
+set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
+ struct nv50_program_exec *e)
+{
+ set_long(pc, e);
+
+ e->param.index = src->hw;
+ e->param.shift = s;
+ e->param.mask = m << (s % 32);
+
+ e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
+}
+
+static void
+emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] |= 0x10000000;
+
+ set_dst(pc, dst, e);
+
+ if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
+ set_immd(pc, src, e);
+ /*XXX: 32-bit, but steals part of "half" reg space - need to
+ * catch and handle this case if/when we do half-regs
+ */
+ } else
+ if (src->type == P_IMMD || src->type == P_CONST) {
+ set_long(pc, e);
+ set_data(pc, src, 0x7f, 9, e);
+ e->inst[1] |= 0x20000000; /* src0 const? */
+ } else {
+ if (src->type == P_ATTR) {
+ set_long(pc, e);
+ e->inst[1] |= 0x00200000;
+ }
+
+ alloc_reg(pc, src);
+ e->inst[0] |= (src->hw << 9);
+ }
+
+ if (is_long(e) && !is_immd(e)) {
+ e->inst[1] |= 0x04000000; /* 32-bit */
+ e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
+ if (!(e->inst[1] & 0x20000000))
+ e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
+ } else
+ e->inst[0] |= 0x00008000;
+
+ emit(pc, e);
+}
+
+static INLINE void
+emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
+{
+ struct nv50_reg *imm = alloc_immd(pc, f);
+ emit_mov(pc, dst, imm);
+ FREE(imm);
+}
+
+static boolean
+check_swap_src_0_1(struct nv50_pc *pc,
+ struct nv50_reg **s0, struct nv50_reg **s1)
+{
+ struct nv50_reg *src0 = *s0, *src1 = *s1;
+
+ if (src0->type == P_CONST) {
+ if (src1->type != P_CONST) {
+ *s0 = src1;
+ *s1 = src0;
+ return TRUE;
+ }
+ } else
+ if (src1->type == P_ATTR) {
+ if (src0->type != P_ATTR) {
+ *s0 = src1;
+ *s1 = src0;
+ return TRUE;
+ }
+ }
+
+ return FALSE;
+}
+
+static void
+set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
+{
+ if (src->type == P_ATTR) {
+ set_long(pc, e);
+ e->inst[1] |= 0x00200000;
+ } else
+ if (src->type == P_CONST || src->type == P_IMMD) {
+ struct nv50_reg *temp = temp_temp(pc);
+
+ emit_mov(pc, temp, src);
+ src = temp;
+ }
+
+ alloc_reg(pc, src);
+ e->inst[0] |= (src->hw << 9);
+}
+
+static void
+set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
+{
+ if (src->type == P_ATTR) {
+ struct nv50_reg *temp = temp_temp(pc);
+
+ emit_mov(pc, temp, src);
+ src = temp;
+ } else
+ if (src->type == P_CONST || src->type == P_IMMD) {
+ assert(!(e->inst[0] & 0x00800000));
+ if (e->inst[0] & 0x01000000) {
+ struct nv50_reg *temp = temp_temp(pc);
+
+ emit_mov(pc, temp, src);
+ src = temp;
+ } else {
+ set_data(pc, src, 0x7f, 16, e);
+ e->inst[0] |= 0x00800000;
+ }
+ }
+
+ alloc_reg(pc, src);
+ e->inst[0] |= (src->hw << 16);
+}
+
+static void
+set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
+{
+ set_long(pc, e);
+
+ if (src->type == P_ATTR) {
+ struct nv50_reg *temp = temp_temp(pc);
+
+ emit_mov(pc, temp, src);
+ src = temp;
+ } else
+ if (src->type == P_CONST || src->type == P_IMMD) {
+ assert(!(e->inst[0] & 0x01000000));
+ if (e->inst[0] & 0x00800000) {
+ struct nv50_reg *temp = temp_temp(pc);
+
+ emit_mov(pc, temp, src);
+ src = temp;
+ } else {
+ set_data(pc, src, 0x7f, 32+14, e);
+ e->inst[0] |= 0x01000000;
+ }
+ }
+
+ alloc_reg(pc, src);
+ e->inst[1] |= (src->hw << 14);
+}
+
+static void
+emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
+ struct nv50_reg *src1)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] |= 0xc0000000;
+
+ if (!pc->allow32)
+ set_long(pc, e);
+
+ check_swap_src_0_1(pc, &src0, &src1);
+ set_dst(pc, dst, e);
+ set_src_0(pc, src0, e);
+ if (src1->type == P_IMMD && !is_long(e)) {
+ if (src0->neg)
+ e->inst[0] |= 0x00008000;
+ set_immd(pc, src1, e);
+ } else {
+ set_src_1(pc, src1, e);
+ if (src0->neg ^ src1->neg) {
+ if (is_long(e))
+ e->inst[1] |= 0x08000000;
+ else
+ e->inst[0] |= 0x00008000;
+ }
+ }
+
+ emit(pc, e);
+}
+
+static void
+emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
+ struct nv50_reg *src0, struct nv50_reg *src1)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] |= 0xb0000000;
+
+ check_swap_src_0_1(pc, &src0, &src1);
+
+ if (!pc->allow32 || src0->neg || src1->neg) {
+ set_long(pc, e);
+ e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
+ }
+
+ set_dst(pc, dst, e);
+ set_src_0(pc, src0, e);
+ if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
+ set_src_2(pc, src1, e);
+ else
+ if (src1->type == P_IMMD)
+ set_immd(pc, src1, e);
+ else
+ set_src_1(pc, src1, e);
+
+ emit(pc, e);
+}
+
+static void
+emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
+ struct nv50_reg *src0, struct nv50_reg *src1)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ set_long(pc, e);
+ e->inst[0] |= 0xb0000000;
+ e->inst[1] |= (sub << 29);
+
+ check_swap_src_0_1(pc, &src0, &src1);
+ set_dst(pc, dst, e);
+ set_src_0(pc, src0, e);
+ set_src_1(pc, src1, e);
+
+ emit(pc, e);
+}
+
+static INLINE void
+emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
+ struct nv50_reg *src1)
+{
+ src1->neg ^= 1;
+ emit_add(pc, dst, src0, src1);
+ src1->neg ^= 1;
+}
+
+static void
+emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
+ struct nv50_reg *src1, struct nv50_reg *src2)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] |= 0xe0000000;
+
+ check_swap_src_0_1(pc, &src0, &src1);
+ set_dst(pc, dst, e);
+ set_src_0(pc, src0, e);
+ set_src_1(pc, src1, e);
+ set_src_2(pc, src2, e);
+
+ if (src0->neg ^ src1->neg)
+ e->inst[1] |= 0x04000000;
+ if (src2->neg)
+ e->inst[1] |= 0x08000000;
+
+ emit(pc, e);
+}
+
+static INLINE void
+emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
+ struct nv50_reg *src1, struct nv50_reg *src2)
+{
+ src2->neg ^= 1;
+ emit_mad(pc, dst, src0, src1, src2);
+ src2->neg ^= 1;
+}
+
+static void
+emit_flop(struct nv50_pc *pc, unsigned sub,
+ struct nv50_reg *dst, struct nv50_reg *src)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] |= 0x90000000;
+ if (sub) {
+ set_long(pc, e);
+ e->inst[1] |= (sub << 29);
+ }
+
+ set_dst(pc, dst, e);
+ set_src_0(pc, src, e);
+
+ emit(pc, e);
+}
+
+static void
+emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] |= 0xb0000000;
+
+ set_dst(pc, dst, e);
+ set_src_0(pc, src, e);
+ set_long(pc, e);
+ e->inst[1] |= (6 << 29) | 0x00004000;
+
+ emit(pc, e);
+}
+
+static void
+emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] |= 0xb0000000;
+
+ set_dst(pc, dst, e);
+ set_src_0(pc, src, e);
+ set_long(pc, e);
+ e->inst[1] |= (6 << 29);
+
+ emit(pc, e);
+}
+
+#define CVTOP_RN 0x01
+#define CVTOP_FLOOR 0x03
+#define CVTOP_CEIL 0x05
+#define CVTOP_TRUNC 0x07
+#define CVTOP_SAT 0x08
+#define CVTOP_ABS 0x10
+
+#define CVT_F32_F32 0xc4
+#define CVT_F32_S32 0x44
+#define CVT_F32_U32 0x64
+#define CVT_S32_F32 0x8c
+#define CVT_S32_S32 0x0c
+#define CVT_F32_F32_ROP 0xcc
+
+static void
+emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
+ int wp, unsigned cop, unsigned fmt)
+{
+ struct nv50_program_exec *e;
+
+ e = exec(pc);
+ set_long(pc, e);
+
+ e->inst[0] |= 0xa0000000;
+ e->inst[1] |= 0x00004000;
+ e->inst[1] |= (cop << 16);
+ e->inst[1] |= (fmt << 24);
+ set_src_0(pc, src, e);
+
+ if (wp >= 0)
+ set_pred_wr(pc, 1, wp, e);
+
+ if (dst)
+ set_dst(pc, dst, e);
+ else {
+ e->inst[0] |= 0x000001fc;
+ e->inst[1] |= 0x00000008;
+ }
+
+ emit(pc, e);
+}
+
+static void
+emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
+ struct nv50_reg *src0, struct nv50_reg *src1)
+{
+ struct nv50_program_exec *e = exec(pc);
+ unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
+ struct nv50_reg *rdst;
+
+ assert(c_op <= 7);
+ if (check_swap_src_0_1(pc, &src0, &src1))
+ c_op = inv_cop[c_op];
+
+ rdst = dst;
+ if (dst->type != P_TEMP)
+ dst = alloc_temp(pc, NULL);
+
+ /* set.u32 */
+ set_long(pc, e);
+ e->inst[0] |= 0xb0000000;
+ e->inst[1] |= (3 << 29);
+ e->inst[1] |= (c_op << 14);
+ /*XXX: breaks things, .u32 by default?
+ * decuda will disasm as .u16 and use .lo/.hi regs, but this
+ * doesn't seem to match what the hw actually does.
+ inst[1] |= 0x04000000; << breaks things.. .u32 by default?
+ */
+ set_dst(pc, dst, e);
+ set_src_0(pc, src0, e);
+ set_src_1(pc, src1, e);
+ emit(pc, e);
+
+ /* cvt.f32.u32 */
+ e = exec(pc);
+ e->inst[0] = 0xa0000001;
+ e->inst[1] = 0x64014780;
+ set_dst(pc, rdst, e);
+ set_src_0(pc, dst, e);
+ emit(pc, e);
+
+ if (dst != rdst)
+ free_temp(pc, dst);
+}
+
+static INLINE void
+emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+ emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
+}
+
+static void
+emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
+ struct nv50_reg *v, struct nv50_reg *e)
+{
+ struct nv50_reg *temp = alloc_temp(pc, NULL);
+
+ emit_flop(pc, 3, temp, v);
+ emit_mul(pc, temp, temp, e);
+ emit_preex2(pc, temp, temp);
+ emit_flop(pc, 6, dst, temp);
+
+ free_temp(pc, temp);
+}
+
+static INLINE void
+emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+ emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
+}
+
+static void
+emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
+ struct nv50_reg **src)
+{
+ struct nv50_reg *one = alloc_immd(pc, 1.0);
+ struct nv50_reg *zero = alloc_immd(pc, 0.0);
+ struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
+ struct nv50_reg *pos128 = alloc_immd(pc, 127.999999);
+ struct nv50_reg *tmp[4];
+ boolean allow32 = pc->allow32;
+
+ pc->allow32 = FALSE;
+
+ if (mask & (3 << 1)) {
+ tmp[0] = alloc_temp(pc, NULL);
+ emit_minmax(pc, 4, tmp[0], src[0], zero);
+ }
+
+ if (mask & (1 << 2)) {
+ set_pred_wr(pc, 1, 0, pc->p->exec_tail);
+
+ tmp[1] = temp_temp(pc);
+ emit_minmax(pc, 4, tmp[1], src[1], zero);
+
+ tmp[3] = temp_temp(pc);
+ emit_minmax(pc, 4, tmp[3], src[3], neg128);
+ emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
+
+ emit_pow(pc, dst[2], tmp[1], tmp[3]);
+ emit_mov(pc, dst[2], zero);
+ set_pred(pc, 3, 0, pc->p->exec_tail);
+ }
+
+ if (mask & (1 << 1))
+ assimilate_temp(pc, dst[1], tmp[0]);
+ else
+ if (mask & (1 << 2))
+ free_temp(pc, tmp[0]);
+
+ pc->allow32 = allow32;
+
+ /* do this last, in case src[i,j] == dst[0,3] */
+ if (mask & (1 << 0))
+ emit_mov(pc, dst[0], one);
+
+ if (mask & (1 << 3))
+ emit_mov(pc, dst[3], one);
+
+ FREE(pos128);
+ FREE(neg128);
+ FREE(zero);
+ FREE(one);
+}
+
+static void
+emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ set_long(pc, e);
+ e->inst[0] |= 0xa0000000; /* delta */
+ e->inst[1] |= (7 << 29); /* delta */
+ e->inst[1] |= 0x04000000; /* negate arg0? probably not */
+ e->inst[1] |= (1 << 14); /* src .f32 */
+ set_dst(pc, dst, e);
+ set_src_0(pc, src, e);
+
+ emit(pc, e);
+}
+
+static void
+emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
+{
+ struct nv50_program_exec *e;
+ const int r_pred = 1;
+
+ /* Sets predicate reg ? */
+ e = exec(pc);
+ e->inst[0] = 0xa00001fd;
+ e->inst[1] = 0xc4014788;
+ set_src_0(pc, src, e);
+ set_pred_wr(pc, 1, r_pred, e);
+ if (src->neg)
+ e->inst[1] |= 0x20000000;
+ emit(pc, e);
+
+ /* This is probably KILP */
+ e = exec(pc);
+ e->inst[0] = 0x000001fe;
+ set_long(pc, e);
+ set_pred(pc, 1 /* LT? */, r_pred, e);
+ emit(pc, e);
+}
+
+static void
+emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
+ struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
+{
+ struct nv50_reg *temp, *t[4];
+ struct nv50_program_exec *e;
+
+ unsigned c, mode, dim;
+
+ switch (type) {
+ case TGSI_TEXTURE_1D:
+ dim = 1;
+ break;
+ case TGSI_TEXTURE_UNKNOWN:
+ case TGSI_TEXTURE_2D:
+ case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
+ case TGSI_TEXTURE_RECT:
+ dim = 2;
+ break;
+ case TGSI_TEXTURE_3D:
+ case TGSI_TEXTURE_CUBE:
+ case TGSI_TEXTURE_SHADOW2D:
+ case TGSI_TEXTURE_SHADOWRECT: /* XXX */
+ dim = 3;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ /* some cards need t[0]'s hw index to be a multiple of 4 */
+ alloc_temp4(pc, t, 0);
+
+ if (proj) {
+ if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
+ mode = pc->interp_mode[src[0]->index];
+
+ t[3]->rhw = src[3]->rhw;
+ emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
+ emit_flop(pc, 0, t[3], t[3]);
+
+ for (c = 0; c < dim; c++) {
+ t[c]->rhw = src[c]->rhw;
+ emit_interp(pc, t[c], t[3],
+ (mode | INTERP_PERSPECTIVE));
+ }
+ } else {
+ emit_flop(pc, 0, t[3], src[3]);
+ for (c = 0; c < dim; c++)
+ emit_mul(pc, t[c], src[c], t[3]);
+
+ /* XXX: for some reason the blob sometimes uses MAD:
+ * emit_mad(pc, t[c], src[0][c], t[3], t[3])
+ * pc->p->exec_tail->inst[1] |= 0x080fc000;
+ */
+ }
+ } else {
+ if (type == TGSI_TEXTURE_CUBE) {
+ temp = temp_temp(pc);
+ emit_minmax(pc, 4, temp, src[0], src[1]);
+ emit_minmax(pc, 4, temp, temp, src[2]);
+ emit_flop(pc, 0, temp, temp);
+ for (c = 0; c < 3; c++)
+ emit_mul(pc, t[c], src[c], temp);
+ } else {
+ for (c = 0; c < dim; c++)
+ emit_mov(pc, t[c], src[c]);
+ }
+ }
+
+ e = exec(pc);
+ set_long(pc, e);
+ e->inst[0] |= 0xf0000000;
+ e->inst[1] |= 0x00000004;
+ set_dst(pc, t[0], e);
+ e->inst[0] |= (unit << 9);
+
+ if (dim == 2)
+ e->inst[0] |= 0x00400000;
+ else
+ if (dim == 3)
+ e->inst[0] |= 0x00800000;
+
+ e->inst[0] |= (mask & 0x3) << 25;
+ e->inst[1] |= (mask & 0xc) << 12;
+
+ emit(pc, e);
+
+#if 1
+ if (mask & 1) emit_mov(pc, dst[0], t[0]);
+ if (mask & 2) emit_mov(pc, dst[1], t[1]);
+ if (mask & 4) emit_mov(pc, dst[2], t[2]);
+ if (mask & 8) emit_mov(pc, dst[3], t[3]);
+
+ free_temp4(pc, t);
+#else
+ /* XXX: if p.e. MUL is used directly after TEX, it would still use
+ * the texture coordinates, not the fetched values: latency ? */
+
+ for (c = 0; c < 4; c++) {
+ if (mask & (1 << c))
+ assimilate_temp(pc, dst[c], t[c]);
+ else
+ free_temp(pc, t[c]);
+ }
+#endif
+}
+
+static void
+convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
+{
+ unsigned q = 0, m = ~0;
+
+ assert(!is_long(e));
+
+ switch (e->inst[0] >> 28) {
+ case 0x1:
+ /* MOV */
+ q = 0x0403c000;
+ m = 0xffff7fff;
+ break;
+ case 0x8:
+ /* INTERP (move centroid, perspective and flat bits) */
+ m = ~0x03000100;
+ q = (e->inst[0] & (3 << 24)) >> (24 - 16);
+ q |= (e->inst[0] & (1 << 8)) << (18 - 8);
+ break;
+ case 0x9:
+ /* RCP */
+ break;
+ case 0xB:
+ /* ADD */
+ m = ~(127 << 16);
+ q = ((e->inst[0] & (~m)) >> 2);
+ break;
+ case 0xC:
+ /* MUL */
+ m = ~0x00008000;
+ q = ((e->inst[0] & (~m)) << 12);
+ break;
+ case 0xE:
+ /* MAD (if src2 == dst) */
+ q = ((e->inst[0] & 0x1fc) << 12);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ set_long(pc, e);
+ pc->p->exec_size++;
+
+ e->inst[0] &= m;
+ e->inst[1] |= q;
+}
+
+static boolean
+negate_supported(const struct tgsi_full_instruction *insn, int i)
+{
+ switch (insn->Instruction.Opcode) {
+ case TGSI_OPCODE_DP3:
+ case TGSI_OPCODE_DP4:
+ case TGSI_OPCODE_MUL:
+ case TGSI_OPCODE_KIL:
+ case TGSI_OPCODE_ADD:
+ case TGSI_OPCODE_SUB:
+ case TGSI_OPCODE_MAD:
+ return TRUE;
+ case TGSI_OPCODE_POW:
+ return (i == 1) ? TRUE : FALSE;
+ default:
+ return FALSE;
+ }
+}
+
+static struct nv50_reg *
+tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
+{
+ switch (dst->DstRegister.File) {
+ case TGSI_FILE_TEMPORARY:
+ return &pc->temp[dst->DstRegister.Index * 4 + c];
+ case TGSI_FILE_OUTPUT:
+ return &pc->result[dst->DstRegister.Index * 4 + c];
+ case TGSI_FILE_NULL:
+ return NULL;
+ default:
+ break;
+ }
+
+ return NULL;
+}
+
+static struct nv50_reg *
+tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
+ boolean neg)
+{
+ struct nv50_reg *r = NULL;
+ struct nv50_reg *temp;
+ unsigned sgn, c;
+
+ sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
+
+ c = tgsi_util_get_full_src_register_extswizzle(src, chan);
+ switch (c) {
+ case TGSI_EXTSWIZZLE_X:
+ case TGSI_EXTSWIZZLE_Y:
+ case TGSI_EXTSWIZZLE_Z:
+ case TGSI_EXTSWIZZLE_W:
+ switch (src->SrcRegister.File) {
+ case TGSI_FILE_INPUT:
+ r = &pc->attr[src->SrcRegister.Index * 4 + c];
+ break;
+ case TGSI_FILE_TEMPORARY:
+ r = &pc->temp[src->SrcRegister.Index * 4 + c];
+ break;
+ case TGSI_FILE_CONSTANT:
+ r = &pc->param[src->SrcRegister.Index * 4 + c];
+ break;
+ case TGSI_FILE_IMMEDIATE:
+ r = &pc->immd[src->SrcRegister.Index * 4 + c];
+ break;
+ case TGSI_FILE_SAMPLER:
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ break;
+ case TGSI_EXTSWIZZLE_ZERO:
+ r = alloc_immd(pc, 0.0);
+ return r;
+ case TGSI_EXTSWIZZLE_ONE:
+ if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
+ return alloc_immd(pc, -1.0);
+ return alloc_immd(pc, 1.0);
+ default:
+ assert(0);
+ break;
+ }
+
+ switch (sgn) {
+ case TGSI_UTIL_SIGN_KEEP:
+ break;
+ case TGSI_UTIL_SIGN_CLEAR:
+ temp = temp_temp(pc);
+ emit_abs(pc, temp, r);
+ r = temp;
+ break;
+ case TGSI_UTIL_SIGN_TOGGLE:
+ if (neg)
+ r->neg = 1;
+ else {
+ temp = temp_temp(pc);
+ emit_neg(pc, temp, r);
+ r = temp;
+ }
+ break;
+ case TGSI_UTIL_SIGN_SET:
+ temp = temp_temp(pc);
+ emit_abs(pc, temp, r);
+ if (neg)
+ temp->neg = 1;
+ else
+ emit_neg(pc, temp, temp);
+ r = temp;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ return r;
+}
+
+/* returns TRUE if instruction can overwrite sources before they're read */
+static boolean
+direct2dest_op(const struct tgsi_full_instruction *insn)
+{
+ if (insn->Instruction.Saturate)
+ return FALSE;
+
+ switch (insn->Instruction.Opcode) {
+ case TGSI_OPCODE_COS:
+ case TGSI_OPCODE_DP3:
+ case TGSI_OPCODE_DP4:
+ case TGSI_OPCODE_DPH:
+ case TGSI_OPCODE_KIL:
+ case TGSI_OPCODE_LIT:
+ case TGSI_OPCODE_POW:
+ case TGSI_OPCODE_RCP:
+ case TGSI_OPCODE_RSQ:
+ case TGSI_OPCODE_SCS:
+ case TGSI_OPCODE_SIN:
+ case TGSI_OPCODE_TEX:
+ case TGSI_OPCODE_TXP:
+ return FALSE;
+ default:
+ return TRUE;
+ }
+}
+
+static boolean
+nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
+{
+ const struct tgsi_full_instruction *inst = &tok->FullInstruction;
+ struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
+ unsigned mask, sat, unit;
+ boolean assimilate = FALSE;
+ int i, c;
+
+ mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+ sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
+
+ for (c = 0; c < 4; c++) {
+ if (mask & (1 << c))
+ dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
+ else
+ dst[c] = NULL;
+ rdst[c] = NULL;
+ src[0][c] = NULL;
+ src[1][c] = NULL;
+ src[2][c] = NULL;
+ }
+
+ for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+ const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
+
+ if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
+ unit = fs->SrcRegister.Index;
+
+ for (c = 0; c < 4; c++)
+ src[i][c] = tgsi_src(pc, c, fs,
+ negate_supported(inst, i));
+ }
+
+ if (sat) {
+ for (c = 0; c < 4; c++) {
+ rdst[c] = dst[c];
+ dst[c] = temp_temp(pc);
+ }
+ } else
+ if (direct2dest_op(inst)) {
+ for (c = 0; c < 4; c++) {
+ if (!dst[c] || dst[c]->type != P_TEMP)
+ continue;
+
+ for (i = c + 1; i < 4; i++) {
+ if (dst[c] == src[0][i] ||
+ dst[c] == src[1][i] ||
+ dst[c] == src[2][i])
+ break;
+ }
+ if (i == 4)
+ continue;
+
+ assimilate = TRUE;
+ rdst[c] = dst[c];
+ dst[c] = alloc_temp(pc, NULL);
+ }
+ }
+
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_ABS:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_abs(pc, dst[c], src[0][c]);
+ }
+ break;
+ case TGSI_OPCODE_ADD:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_add(pc, dst[c], src[0][c], src[1][c]);
+ }
+ break;
+ case TGSI_OPCODE_COS:
+ temp = temp_temp(pc);
+ emit_precossin(pc, temp, src[0][0]);
+ emit_flop(pc, 5, temp, temp);
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_mov(pc, dst[c], temp);
+ }
+ break;
+ case TGSI_OPCODE_DP3:
+ temp = temp_temp(pc);
+ emit_mul(pc, temp, src[0][0], src[1][0]);
+ emit_mad(pc, temp, src[0][1], src[1][1], temp);
+ emit_mad(pc, temp, src[0][2], src[1][2], temp);
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_mov(pc, dst[c], temp);
+ }
+ break;
+ case TGSI_OPCODE_DP4:
+ temp = temp_temp(pc);
+ emit_mul(pc, temp, src[0][0], src[1][0]);
+ emit_mad(pc, temp, src[0][1], src[1][1], temp);
+ emit_mad(pc, temp, src[0][2], src[1][2], temp);
+ emit_mad(pc, temp, src[0][3], src[1][3], temp);
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_mov(pc, dst[c], temp);
+ }
+ break;
+ case TGSI_OPCODE_DPH:
+ temp = temp_temp(pc);
+ emit_mul(pc, temp, src[0][0], src[1][0]);
+ emit_mad(pc, temp, src[0][1], src[1][1], temp);
+ emit_mad(pc, temp, src[0][2], src[1][2], temp);
+ emit_add(pc, temp, src[1][3], temp);
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_mov(pc, dst[c], temp);
+ }
+ break;
+ case TGSI_OPCODE_DST:
+ {
+ struct nv50_reg *one = alloc_immd(pc, 1.0);
+ if (mask & (1 << 0))
+ emit_mov(pc, dst[0], one);
+ if (mask & (1 << 1))
+ emit_mul(pc, dst[1], src[0][1], src[1][1]);
+ if (mask & (1 << 2))
+ emit_mov(pc, dst[2], src[0][2]);
+ if (mask & (1 << 3))
+ emit_mov(pc, dst[3], src[1][3]);
+ FREE(one);
+ }
+ break;
+ case TGSI_OPCODE_EX2:
+ temp = temp_temp(pc);
+ emit_preex2(pc, temp, src[0][0]);
+ emit_flop(pc, 6, temp, temp);
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_mov(pc, dst[c], temp);
+ }
+ break;
+ case TGSI_OPCODE_FLR:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_flr(pc, dst[c], src[0][c]);
+ }
+ break;
+ case TGSI_OPCODE_FRC:
+ temp = temp_temp(pc);
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_flr(pc, temp, src[0][c]);
+ emit_sub(pc, dst[c], src[0][c], temp);
+ }
+ break;
+ case TGSI_OPCODE_KIL:
+ emit_kil(pc, src[0][0]);
+ emit_kil(pc, src[0][1]);
+ emit_kil(pc, src[0][2]);
+ emit_kil(pc, src[0][3]);
+ pc->p->cfg.fp.regs[2] |= 0x00100000;
+ break;
+ case TGSI_OPCODE_LIT:
+ emit_lit(pc, &dst[0], mask, &src[0][0]);
+ break;
+ case TGSI_OPCODE_LG2:
+ temp = temp_temp(pc);
+ emit_flop(pc, 3, temp, src[0][0]);
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_mov(pc, dst[c], temp);
+ }
+ break;
+ case TGSI_OPCODE_LRP:
+ temp = temp_temp(pc);
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_sub(pc, temp, src[1][c], src[2][c]);
+ emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
+ }
+ break;
+ case TGSI_OPCODE_MAD:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
+ }
+ break;
+ case TGSI_OPCODE_MAX:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
+ }
+ break;
+ case TGSI_OPCODE_MIN:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
+ }
+ break;
+ case TGSI_OPCODE_MOV:
+ case TGSI_OPCODE_SWZ:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_mov(pc, dst[c], src[0][c]);
+ }
+ break;
+ case TGSI_OPCODE_MUL:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_mul(pc, dst[c], src[0][c], src[1][c]);
+ }
+ break;
+ case TGSI_OPCODE_POW:
+ temp = temp_temp(pc);
+ emit_pow(pc, temp, src[0][0], src[1][0]);
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_mov(pc, dst[c], temp);
+ }
+ break;
+ case TGSI_OPCODE_RCP:
+ for (c = 3; c >= 0; c--) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_flop(pc, 0, dst[c], src[0][0]);
+ }
+ break;
+ case TGSI_OPCODE_RSQ:
+ for (c = 3; c >= 0; c--) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_flop(pc, 2, dst[c], src[0][0]);
+ }
+ break;
+ case TGSI_OPCODE_SCS:
+ temp = temp_temp(pc);
+ emit_precossin(pc, temp, src[0][0]);
+ if (mask & (1 << 0))
+ emit_flop(pc, 5, dst[0], temp);
+ if (mask & (1 << 1))
+ emit_flop(pc, 4, dst[1], temp);
+ if (mask & (1 << 2))
+ emit_mov_immdval(pc, dst[2], 0.0);
+ if (mask & (1 << 3))
+ emit_mov_immdval(pc, dst[3], 1.0);
+ break;
+ case TGSI_OPCODE_SGE:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
+ }
+ break;
+ case TGSI_OPCODE_SIN:
+ temp = temp_temp(pc);
+ emit_precossin(pc, temp, src[0][0]);
+ emit_flop(pc, 4, temp, temp);
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_mov(pc, dst[c], temp);
+ }
+ break;
+ case TGSI_OPCODE_SLT:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
+ }
+ break;
+ case TGSI_OPCODE_SUB:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_sub(pc, dst[c], src[0][c], src[1][c]);
+ }
+ break;
+ case TGSI_OPCODE_TEX:
+ emit_tex(pc, dst, mask, src[0], unit,
+ inst->InstructionExtTexture.Texture, FALSE);
+ break;
+ case TGSI_OPCODE_TXP:
+ emit_tex(pc, dst, mask, src[0], unit,
+ inst->InstructionExtTexture.Texture, TRUE);
+ break;
+ case TGSI_OPCODE_XPD:
+ temp = temp_temp(pc);
+ if (mask & (1 << 0)) {
+ emit_mul(pc, temp, src[0][2], src[1][1]);
+ emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
+ }
+ if (mask & (1 << 1)) {
+ emit_mul(pc, temp, src[0][0], src[1][2]);
+ emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
+ }
+ if (mask & (1 << 2)) {
+ emit_mul(pc, temp, src[0][1], src[1][0]);
+ emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
+ }
+ if (mask & (1 << 3))
+ emit_mov_immdval(pc, dst[3], 1.0);
+ break;
+ case TGSI_OPCODE_END:
+ break;
+ default:
+ NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
+ return FALSE;
+ }
+
+ if (sat) {
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT,
+ CVT_F32_F32);
+ }
+ } else if (assimilate) {
+ for (c = 0; c < 4; c++)
+ if (rdst[c])
+ assimilate_temp(pc, rdst[c], dst[c]);
+ }
+
+ for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+ for (c = 0; c < 4; c++) {
+ if (!src[i][c])
+ continue;
+ if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
+ FREE(src[i][c]);
+ else
+ if (src[i][c]->acc == pc->insn_cur)
+ release_hw(pc, src[i][c]);
+ }
+ }
+
+ kill_temp_temp(pc);
+ return TRUE;
+}
+
+/* Adjust a bitmask that indicates what components of a source are used,
+ * we use this in tx_prep so we only load interpolants that are needed.
+ */
+static void
+insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)
+{
+ const struct tgsi_instruction_ext_texture *tex;
+
+ switch (insn->Instruction.Opcode) {
+ case TGSI_OPCODE_DP3:
+ *mask = 0x7;
+ break;
+ case TGSI_OPCODE_DP4:
+ case TGSI_OPCODE_DPH:
+ *mask = 0xF;
+ break;
+ case TGSI_OPCODE_LIT:
+ *mask = 0xB;
+ break;
+ case TGSI_OPCODE_RCP:
+ case TGSI_OPCODE_RSQ:
+ *mask = 0x1;
+ break;
+ case TGSI_OPCODE_TEX:
+ case TGSI_OPCODE_TXP:
+ assert(insn->Instruction.Extended);
+ tex = &insn->InstructionExtTexture;
+
+ *mask = 0x7;
+ if (tex->Texture == TGSI_TEXTURE_1D)
+ *mask = 0x1;
+ else
+ if (tex->Texture == TGSI_TEXTURE_2D)
+ *mask = 0x3;
+
+ if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
+ *mask |= 0x8;
+ break;
+ default:
+ break;
+ }
+}
+
+static void
+prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
+ unsigned *r_usage[2])
+{
+ const struct tgsi_full_instruction *insn;
+ const struct tgsi_full_src_register *src;
+ const struct tgsi_dst_register *dst;
+
+ unsigned i, c, k, n, mask, *acc_p;
+
+ insn = &tok->FullInstruction;
+ dst = &insn->FullDstRegisters[0].DstRegister;
+ mask = dst->WriteMask;
+
+ if (!r_usage[0])
+ r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
+ if (!r_usage[1])
+ r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
+
+ if (dst->File == TGSI_FILE_TEMPORARY) {
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ r_usage[0][dst->Index * 4 + c] = pc->insn_nr;
+ }
+ }
+
+ for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
+ src = &insn->FullSrcRegisters[i];
+
+ switch (src->SrcRegister.File) {
+ case TGSI_FILE_TEMPORARY:
+ acc_p = r_usage[0];
+ break;
+ case TGSI_FILE_INPUT:
+ acc_p = r_usage[1];
+ break;
+ default:
+ continue;
+ }
+
+ insn_adjust_mask(insn, &mask);
+
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+
+ k = tgsi_util_get_full_src_register_extswizzle(src, c);
+ switch (k) {
+ case TGSI_EXTSWIZZLE_X:
+ case TGSI_EXTSWIZZLE_Y:
+ case TGSI_EXTSWIZZLE_Z:
+ case TGSI_EXTSWIZZLE_W:
+ n = src->SrcRegister.Index * 4 + k;
+ acc_p[n] = pc->insn_nr;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+}
+
+static unsigned
+load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
+ int *aid, int *p_oid)
+{
+ struct nv50_reg *iv;
+ int oid, c, n;
+ unsigned mask = 0;
+
+ iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
+
+ for (c = 0, n = i * 4; c < 4; c++, n++) {
+ oid = (*p_oid)++;
+ pc->attr[n].type = P_TEMP;
+ pc->attr[n].index = i;
+
+ if (pc->attr[n].acc == acc[n])
+ continue;
+ mask |= (1 << c);
+
+ pc->attr[n].acc = acc[n];
+ pc->attr[n].rhw = pc->attr[n].hw = -1;
+ alloc_reg(pc, &pc->attr[n]);
+
+ pc->attr[n].rhw = (*aid)++;
+ emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
+
+ pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
+ (*mid)++;
+ pc->p->cfg.fp.regs[1] += 0x00010001;
+ }
+
+ return mask;
+}
+
+static boolean
+nv50_program_tx_prep(struct nv50_pc *pc)
+{
+ struct tgsi_parse_context p;
+ boolean ret = FALSE;
+ unsigned i, c;
+ unsigned fcol, bcol, fcrd, depr;
+
+ /* count (centroid) perspective interpolations */
+ unsigned centroid_loads = 0;
+ unsigned perspect_loads = 0;
+
+ /* track register access for temps and attrs */
+ unsigned *r_usage[2];
+ r_usage[0] = NULL;
+ r_usage[1] = NULL;
+
+ depr = fcol = bcol = fcrd = 0xffff;
+
+ if (pc->p->type == PIPE_SHADER_FRAGMENT) {
+ pc->p->cfg.fp.regs[0] = 0x01000404;
+ pc->p->cfg.fp.regs[1] = 0x00000400;
+ }
+
+ tgsi_parse_init(&p, pc->p->pipe.tokens);
+ while (!tgsi_parse_end_of_tokens(&p)) {
+ const union tgsi_full_token *tok = &p.FullToken;
+
+ tgsi_parse_token(&p);
+ switch (tok->Token.Type) {
+ case TGSI_TOKEN_TYPE_IMMEDIATE:
+ {
+ const struct tgsi_full_immediate *imm =
+ &p.FullToken.FullImmediate;
+
+ ctor_immd(pc, imm->u[0].Float,
+ imm->u[1].Float,
+ imm->u[2].Float,
+ imm->u[3].Float);
+ }
+ break;
+ case TGSI_TOKEN_TYPE_DECLARATION:
+ {
+ const struct tgsi_full_declaration *d;
+ unsigned last, first, mode;
+
+ d = &p.FullToken.FullDeclaration;
+ first = d->DeclarationRange.First;
+ last = d->DeclarationRange.Last;
+
+ switch (d->Declaration.File) {
+ case TGSI_FILE_TEMPORARY:
+ if (pc->temp_nr < (last + 1))
+ pc->temp_nr = last + 1;
+ break;
+ case TGSI_FILE_OUTPUT:
+ if (pc->result_nr < (last + 1))
+ pc->result_nr = last + 1;
+
+ if (!d->Declaration.Semantic)
+ break;
+
+ switch (d->Semantic.SemanticName) {
+ case TGSI_SEMANTIC_POSITION:
+ depr = first;
+ pc->p->cfg.fp.regs[2] |= 0x00000100;
+ pc->p->cfg.fp.regs[3] |= 0x00000011;
+ break;
+ default:
+ break;
+ }
+
+ break;
+ case TGSI_FILE_INPUT:
+ {
+ if (pc->attr_nr < (last + 1))
+ pc->attr_nr = last + 1;
+
+ if (pc->p->type != PIPE_SHADER_FRAGMENT)
+ break;
+
+ switch (d->Declaration.Interpolate) {
+ case TGSI_INTERPOLATE_CONSTANT:
+ mode = INTERP_FLAT;
+ break;
+ case TGSI_INTERPOLATE_PERSPECTIVE:
+ mode = INTERP_PERSPECTIVE;
+ break;
+ default:
+ mode = INTERP_LINEAR;
+ break;
+ }
+
+ if (d->Declaration.Semantic) {
+ switch (d->Semantic.SemanticName) {
+ case TGSI_SEMANTIC_POSITION:
+ fcrd = first;
+ break;
+ case TGSI_SEMANTIC_COLOR:
+ fcol = first;
+ mode = INTERP_PERSPECTIVE;
+ break;
+ case TGSI_SEMANTIC_BCOLOR:
+ bcol = first;
+ mode = INTERP_PERSPECTIVE;
+ break;
+ }
+ }
+
+ if (d->Declaration.Centroid) {
+ mode |= INTERP_CENTROID;
+ if (mode & INTERP_PERSPECTIVE)
+ centroid_loads++;
+ } else
+ if (mode & INTERP_PERSPECTIVE)
+ perspect_loads++;
+
+ assert(last < 32);
+ for (i = first; i <= last; i++)
+ pc->interp_mode[i] = mode;
+ }
+ break;
+ case TGSI_FILE_CONSTANT:
+ if (pc->param_nr < (last + 1))
+ pc->param_nr = last + 1;
+ break;
+ case TGSI_FILE_SAMPLER:
+ break;
+ default:
+ NOUVEAU_ERR("bad decl file %d\n",
+ d->Declaration.File);
+ goto out_err;
+ }
+ }
+ break;
+ case TGSI_TOKEN_TYPE_INSTRUCTION:
+ pc->insn_nr++;
+ prep_inspect_insn(pc, tok, r_usage);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (pc->temp_nr) {
+ pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
+ if (!pc->temp)
+ goto out_err;
+
+ for (i = 0; i < pc->temp_nr; i++) {
+ for (c = 0; c < 4; c++) {
+ pc->temp[i*4+c].type = P_TEMP;
+ pc->temp[i*4+c].hw = -1;
+ pc->temp[i*4+c].rhw = -1;
+ pc->temp[i*4+c].index = i;
+ pc->temp[i*4+c].acc = r_usage[0][i*4+c];
+ }
+ }
+ }
+
+ if (pc->attr_nr) {
+ int oid = 4, mid = 4, aid = 0;
+ /* oid = VP output id
+ * aid = FP attribute/interpolant id
+ * mid = VP output mapping field ID
+ */
+
+ pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
+ if (!pc->attr)
+ goto out_err;
+
+ if (pc->p->type == PIPE_SHADER_FRAGMENT) {
+ /* position should be loaded first */
+ if (fcrd != 0xffff) {
+ unsigned mask;
+ mid = 0;
+ mask = load_fp_attrib(pc, fcrd, r_usage[1],
+ &mid, &aid, &oid);
+ oid = 0;
+ pc->p->cfg.fp.regs[1] |= (mask << 24);
+ pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
+ }
+ pc->p->cfg.fp.map[0] += 0x03020100;
+
+ /* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
+
+ if (perspect_loads) {
+ pc->iv_p = alloc_temp(pc, NULL);
+
+ if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
+ pc->p->cfg.fp.regs[1] |= 0x08000000;
+ pc->iv_p->rhw = aid++;
+ emit_interp(pc, pc->iv_p, NULL,
+ INTERP_LINEAR);
+ emit_flop(pc, 0, pc->iv_p, pc->iv_p);
+ } else {
+ pc->iv_p->rhw = aid - 1;
+ emit_flop(pc, 0, pc->iv_p,
+ &pc->attr[fcrd * 4 + 3]);
+ }
+ }
+
+ if (centroid_loads) {
+ pc->iv_c = alloc_temp(pc, NULL);
+ pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
+ emit_interp(pc, pc->iv_c, NULL,
+ INTERP_CENTROID);
+ emit_flop(pc, 0, pc->iv_c, pc->iv_c);
+ pc->p->cfg.fp.regs[1] |= 0x08000000;
+ }
+
+ for (c = 0; c < 4; c++) {
+ /* I don't know what these values do, but
+ * let's set them like the blob does:
+ */
+ if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
+ pc->p->cfg.fp.regs[0] += 0x00010000;
+ if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
+ pc->p->cfg.fp.regs[0] += 0x00010000;
+ }
+
+ for (i = 0; i < pc->attr_nr; i++)
+ load_fp_attrib(pc, i, r_usage[1],
+ &mid, &aid, &oid);
+
+ if (pc->iv_p)
+ free_temp(pc, pc->iv_p);
+ if (pc->iv_c)
+ free_temp(pc, pc->iv_c);
+
+ pc->p->cfg.fp.high_map = (mid / 4);
+ pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
+ } else {
+ /* vertex program */
+ for (i = 0; i < pc->attr_nr * 4; i++) {
+ pc->p->cfg.vp.attr[aid / 32] |=
+ (1 << (aid % 32));
+ pc->attr[i].type = P_ATTR;
+ pc->attr[i].hw = aid++;
+ pc->attr[i].index = i / 4;
+ }
+ }
+ }
+
+ if (pc->result_nr) {
+ int rid = 0;
+
+ pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
+ if (!pc->result)
+ goto out_err;
+
+ for (i = 0; i < pc->result_nr; i++) {
+ for (c = 0; c < 4; c++) {
+ if (pc->p->type == PIPE_SHADER_FRAGMENT) {
+ pc->result[i*4+c].type = P_TEMP;
+ pc->result[i*4+c].hw = -1;
+ pc->result[i*4+c].rhw = (i == depr) ?
+ -1 : rid++;
+ } else {
+ pc->result[i*4+c].type = P_RESULT;
+ pc->result[i*4+c].hw = rid++;
+ }
+ pc->result[i*4+c].index = i;
+ }
+
+ if (pc->p->type == PIPE_SHADER_FRAGMENT &&
+ depr != 0xffff) {
+ pc->result[depr * 4 + 2].rhw =
+ (pc->result_nr - 1) * 4;
+ }
+ }
+ }
+
+ if (pc->param_nr) {
+ int rid = 0;
+
+ pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
+ if (!pc->param)
+ goto out_err;
+
+ for (i = 0; i < pc->param_nr; i++) {
+ for (c = 0; c < 4; c++) {
+ pc->param[i*4+c].type = P_CONST;
+ pc->param[i*4+c].hw = rid++;
+ pc->param[i*4+c].index = i;
+ }
+ }
+ }
+
+ if (pc->immd_nr) {
+ int rid = 0;
+
+ pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
+ if (!pc->immd)
+ goto out_err;
+
+ for (i = 0; i < pc->immd_nr; i++) {
+ for (c = 0; c < 4; c++) {
+ pc->immd[i*4+c].type = P_IMMD;
+ pc->immd[i*4+c].hw = rid++;
+ pc->immd[i*4+c].index = i;
+ }
+ }
+ }
+
+ ret = TRUE;
+out_err:
+ if (r_usage[0])
+ FREE(r_usage[0]);
+ if (r_usage[1])
+ FREE(r_usage[1]);
+
+ tgsi_parse_free(&p);
+ return ret;
+}
+
+static void
+free_nv50_pc(struct nv50_pc *pc)
+{
+ if (pc->immd)
+ FREE(pc->immd);
+ if (pc->param)
+ FREE(pc->param);
+ if (pc->result)
+ FREE(pc->result);
+ if (pc->attr)
+ FREE(pc->attr);
+ if (pc->temp)
+ FREE(pc->temp);
+
+ FREE(pc);
+}
+
+static boolean
+nv50_program_tx(struct nv50_program *p)
+{
+ struct tgsi_parse_context parse;
+ struct nv50_pc *pc;
+ unsigned k;
+ boolean ret;
+
+ pc = CALLOC_STRUCT(nv50_pc);
+ if (!pc)
+ return FALSE;
+ pc->p = p;
+ pc->p->cfg.high_temp = 4;
+
+ ret = nv50_program_tx_prep(pc);
+ if (ret == FALSE)
+ goto out_cleanup;
+
+ tgsi_parse_init(&parse, pc->p->pipe.tokens);
+ while (!tgsi_parse_end_of_tokens(&parse)) {
+ const union tgsi_full_token *tok = &parse.FullToken;
+
+ /* don't allow half insn/immd on first and last instruction */
+ pc->allow32 = TRUE;
+ if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
+ pc->allow32 = FALSE;
+
+ tgsi_parse_token(&parse);
+
+ switch (tok->Token.Type) {
+ case TGSI_TOKEN_TYPE_INSTRUCTION:
+ ++pc->insn_cur;
+ ret = nv50_program_tx_insn(pc, tok);
+ if (ret == FALSE)
+ goto out_err;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (p->type == PIPE_SHADER_FRAGMENT) {
+ struct nv50_reg out;
+
+ out.type = P_TEMP;
+ for (k = 0; k < pc->result_nr * 4; k++) {
+ if (pc->result[k].rhw == -1)
+ continue;
+ if (pc->result[k].hw != pc->result[k].rhw) {
+ out.hw = pc->result[k].rhw;
+ emit_mov(pc, &out, &pc->result[k]);
+ }
+ if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
+ pc->p->cfg.high_result = pc->result[k].rhw + 1;
+ }
+ }
+
+ /* look for single half instructions and make them long */
+ struct nv50_program_exec *e, *e_prev;
+
+ for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
+ if (!is_long(e))
+ k++;
+
+ if (!e->next || is_long(e->next)) {
+ if (k & 1)
+ convert_to_long(pc, e);
+ k = 0;
+ }
+
+ if (e->next)
+ e_prev = e;
+ }
+
+ if (!is_long(pc->p->exec_tail)) {
+ /* this may occur if moving FP results */
+ assert(e_prev && !is_long(e_prev));
+ convert_to_long(pc, e_prev);
+ convert_to_long(pc, pc->p->exec_tail);
+ }
+
+ assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
+ pc->p->exec_tail->inst[1] |= 0x00000001;
+
+ p->param_nr = pc->param_nr * 4;
+ p->immd_nr = pc->immd_nr * 4;
+ p->immd = pc->immd_buf;
+
+out_err:
+ tgsi_parse_free(&parse);
+
+out_cleanup:
+ free_nv50_pc(pc);
+ return ret;
+}
+
+static void
+nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
+{
+ if (nv50_program_tx(p) == FALSE)
+ assert(0);
+ p->translated = TRUE;
+}
+
+static void
+nv50_program_upload_data(struct nv50_context *nv50, float *map,
+ unsigned start, unsigned count, unsigned cbuf)
+{
+ struct nouveau_channel *chan = nv50->screen->base.channel;
+ struct nouveau_grobj *tesla = nv50->screen->tesla;
+
+ while (count) {
+ unsigned nr = count > 2047 ? 2047 : count;
+
+ BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
+ OUT_RING (chan, (cbuf << 0) | (start << 8));
+ BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
+ OUT_RINGp (chan, map, nr);
+
+ map += nr;
+ start += nr;
+ count -= nr;
+ }
+}
+
+static void
+nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
+{
+ struct pipe_screen *pscreen = nv50->pipe.screen;
+
+ if (!p->data[0] && p->immd_nr) {
+ struct nouveau_resource *heap = nv50->screen->immd_heap[0];
+
+ if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
+ while (heap->next && heap->size < p->immd_nr) {
+ struct nv50_program *evict = heap->next->priv;
+ nouveau_resource_free(&evict->data[0]);
+ }
+
+ if (nouveau_resource_alloc(heap, p->immd_nr, p,
+ &p->data[0]))
+ assert(0);
+ }
+
+ /* immediates only need to be uploaded again when freed */
+ nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
+ p->immd_nr, NV50_CB_PMISC);
+ }
+
+ if (!p->data[1] && p->param_nr) {
+ struct nouveau_resource *heap =
+ nv50->screen->parm_heap[p->type];
+
+ if (nouveau_resource_alloc(heap, p->param_nr, p, &p->data[1])) {
+ while (heap->next && heap->size < p->param_nr) {
+ struct nv50_program *evict = heap->next->priv;
+ nouveau_resource_free(&evict->data[1]);
+ }
+
+ if (nouveau_resource_alloc(heap, p->param_nr, p,
+ &p->data[1]))
+ assert(0);
+ }
+ }
+
+ if (p->param_nr) {
+ unsigned cbuf = NV50_CB_PVP;
+ float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
+ PIPE_BUFFER_USAGE_CPU_READ);
+ if (p->type == PIPE_SHADER_FRAGMENT)
+ cbuf = NV50_CB_PFP;
+ nv50_program_upload_data(nv50, map, p->data[1]->start,
+ p->param_nr, cbuf);
+ pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
+ }
+}
+
+static void
+nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
+{
+ struct nouveau_channel *chan = nv50->screen->base.channel;
+ struct nouveau_grobj *tesla = nv50->screen->tesla;
+ struct nv50_program_exec *e;
+ struct nouveau_stateobj *so;
+ const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
+ unsigned start, count, *up, *ptr;
+ boolean upload = FALSE;
+
+ if (!p->bo) {
+ nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
+ p->exec_size * 4, &p->bo);
+ upload = TRUE;
+ }
+
+ if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
+ (p->data[1] && p->data[1]->start != p->data_start[1])) {
+ for (e = p->exec_head; e; e = e->next) {
+ unsigned ei, ci, bs;
+
+ if (e->param.index < 0)
+ continue;
+ bs = (e->inst[1] >> 22) & 0x07;
+ assert(bs < 2);
+ ei = e->param.shift >> 5;
+ ci = e->param.index + p->data[bs]->start;
+
+ e->inst[ei] &= ~e->param.mask;
+ e->inst[ei] |= (ci << e->param.shift);
+ }
+
+ if (p->data[0])
+ p->data_start[0] = p->data[0]->start;
+ if (p->data[1])
+ p->data_start[1] = p->data[1]->start;
+
+ upload = TRUE;
+ }
+
+ if (!upload)
+ return;
+
+#ifdef NV50_PROGRAM_DUMP
+ NOUVEAU_ERR("-------\n");
+ for (e = p->exec_head; e; e = e->next) {
+ NOUVEAU_ERR("0x%08x\n", e->inst[0]);
+ if (is_long(e))
+ NOUVEAU_ERR("0x%08x\n", e->inst[1]);
+ }
+#endif
+
+ up = ptr = MALLOC(p->exec_size * 4);
+ for (e = p->exec_head; e; e = e->next) {
+ *(ptr++) = e->inst[0];
+ if (is_long(e))
+ *(ptr++) = e->inst[1];
+ }
+
+ so = so_new(4,2);
+ so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
+ so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
+ so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
+ so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
+
+ start = 0; count = p->exec_size;
+ while (count) {
+ struct nouveau_channel *chan = nv50->screen->base.channel;
+ unsigned nr;
+
+ so_emit(chan, so);
+
+ nr = MIN2(count, 2047);
+ nr = MIN2(chan->pushbuf->remaining, nr);
+ if (chan->pushbuf->remaining < (nr + 3)) {
+ FIRE_RING(chan);
+ continue;
+ }
+
+ BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
+ OUT_RING (chan, (start << 8) | NV50_CB_PUPLOAD);
+ BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
+ OUT_RINGp (chan, up + start, nr);
+
+ start += nr;
+ count -= nr;
+ }
+
+ FREE(up);
+ so_ref(NULL, &so);
+}
+
+void
+nv50_vertprog_validate(struct nv50_context *nv50)
+{
+ struct nouveau_grobj *tesla = nv50->screen->tesla;
+ struct nv50_program *p = nv50->vertprog;
+ struct nouveau_stateobj *so;
+
+ if (!p->translated) {
+ nv50_program_validate(nv50, p);
+ if (!p->translated)
+ assert(0);
+ }
+
+ nv50_program_validate_data(nv50, p);
+ nv50_program_validate_code(nv50, p);
+
+ so = so_new(13, 2);
+ so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
+ so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+ NOUVEAU_BO_HIGH, 0, 0);
+ so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+ NOUVEAU_BO_LOW, 0, 0);
+ so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
+ so_data (so, p->cfg.vp.attr[0]);
+ so_data (so, p->cfg.vp.attr[1]);
+ so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
+ so_data (so, p->cfg.high_result);
+ so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
+ so_data (so, p->cfg.high_result); //8);
+ so_data (so, p->cfg.high_temp);
+ so_method(so, tesla, NV50TCL_VP_START_ID, 1);
+ so_data (so, 0); /* program start offset */
+ so_ref(so, &nv50->state.vertprog);
+ so_ref(NULL, &so);
+}
+
+void
+nv50_fragprog_validate(struct nv50_context *nv50)
+{
+ struct nouveau_grobj *tesla = nv50->screen->tesla;
+ struct nv50_program *p = nv50->fragprog;
+ struct nouveau_stateobj *so;
+ unsigned i;
+
+ if (!p->translated) {
+ nv50_program_validate(nv50, p);
+ if (!p->translated)
+ assert(0);
+ }
+
+ nv50_program_validate_data(nv50, p);
+ nv50_program_validate_code(nv50, p);
+
+ so = so_new(64, 2);
+ so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
+ so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+ NOUVEAU_BO_HIGH, 0, 0);
+ so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+ NOUVEAU_BO_LOW, 0, 0);
+ so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
+ so_data (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
+ so_data (so, 0x00000004);
+ so_data (so, 0x00000000);
+ so_data (so, 0x00000000);
+ so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), p->cfg.fp.high_map);
+ for (i = 0; i < p->cfg.fp.high_map; i++)
+ so_data(so, p->cfg.fp.map[i]);
+ so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 2);
+ so_data (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
+ so_data (so, p->cfg.high_temp);
+ so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
+ so_data (so, p->cfg.high_result);
+ so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
+ so_data (so, p->cfg.fp.regs[2]);
+ so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
+ so_data (so, p->cfg.fp.regs[3]);
+ so_method(so, tesla, NV50TCL_FP_START_ID, 1);
+ so_data (so, 0); /* program start offset */
+ so_ref(so, &nv50->state.fragprog);
+ so_ref(NULL, &so);
+}
+
+void
+nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
+{
+ while (p->exec_head) {
+ struct nv50_program_exec *e = p->exec_head;
+
+ p->exec_head = e->next;
+ FREE(e);
+ }
+ p->exec_tail = NULL;
+ p->exec_size = 0;
+
+ nouveau_bo_ref(NULL, &p->bo);
+
+ nouveau_resource_free(&p->data[0]);
+ nouveau_resource_free(&p->data[1]);
+
+ p->translated = 0;
+}