diff options
author | Michal Krol <michal@vmware.com> | 2009-12-12 16:48:32 +0100 |
---|---|---|
committer | Michal Krol <michal@vmware.com> | 2009-12-12 16:48:32 +0100 |
commit | a3eb0f718e19653a2ad8e49396c904183be456f3 (patch) | |
tree | 0092574c469ea586a6cab8b8ebb7ac62b8221a2a /src/gallium/drivers/nv50 | |
parent | 491f384c3958067e6c4c994041f5d8d413b806bc (diff) | |
parent | 784cca9fa527de771754d76545970f78094b9adf (diff) |
Merge branch 'master' into glsl-pp-rework-2
Conflicts:
progs/perf/drawoverhead.c
progs/perf/teximage.c
progs/perf/vbo.c
progs/perf/vertexrate.c
src/mesa/shader/slang/library/slang_common_builtin_gc.h
Diffstat (limited to 'src/gallium/drivers/nv50')
-rw-r--r-- | src/gallium/drivers/nv50/nv50_context.c | 34 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_context.h | 34 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_miptree.c | 99 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_program.c | 1087 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_program.h | 3 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_screen.c | 74 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_state.c | 30 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_state_validate.c | 127 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_tex.c | 220 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_texture.h | 15 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_transfer.c | 180 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_vbo.c | 413 |
12 files changed, 1730 insertions, 586 deletions
diff --git a/src/gallium/drivers/nv50/nv50_context.c b/src/gallium/drivers/nv50/nv50_context.c index fca078b174..219e7a7862 100644 --- a/src/gallium/drivers/nv50/nv50_context.c +++ b/src/gallium/drivers/nv50/nv50_context.c @@ -33,13 +33,6 @@ nv50_flush(struct pipe_context *pipe, unsigned flags, { struct nv50_context *nv50 = nv50_context(pipe); struct nouveau_channel *chan = nv50->screen->base.channel; - struct nouveau_grobj *eng2d = nv50->screen->eng2d; - - /* We need this in the ddx for reliable composite, not sure what we're - * actually flushing. We generate all our own flushes with flags = 0. */ - WAIT_RING(chan, 2); - BEGIN_RING(chan, eng2d, 0x0110, 1); - OUT_RING (chan, 0); if (flags & PIPE_FLUSH_FRAME) FIRE_RING(chan); @@ -60,29 +53,6 @@ nv50_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) { } -static unsigned int -nv50_is_texture_referenced( struct pipe_context *pipe, - struct pipe_texture *texture, - unsigned face, unsigned level) -{ - /** - * FIXME: Optimize. - */ - - return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE; -} - -static unsigned int -nv50_is_buffer_referenced( struct pipe_context *pipe, - struct pipe_buffer *buf) -{ - /** - * FIXME: Optimize. - */ - - return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE; -} - struct pipe_context * nv50_create(struct pipe_screen *pscreen, unsigned pctx_id) { @@ -108,8 +78,8 @@ nv50_create(struct pipe_screen *pscreen, unsigned pctx_id) nv50->pipe.flush = nv50_flush; - nv50->pipe.is_texture_referenced = nv50_is_texture_referenced; - nv50->pipe.is_buffer_referenced = nv50_is_buffer_referenced; + nv50->pipe.is_texture_referenced = nouveau_is_texture_referenced; + nv50->pipe.is_buffer_referenced = nouveau_is_buffer_referenced; screen->base.channel->user_private = nv50; screen->base.channel->flush_notify = nv50_state_flush_notify; diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h index 4608854d71..79135f2f36 100644 --- a/src/gallium/drivers/nv50/nv50_context.h +++ b/src/gallium/drivers/nv50/nv50_context.h @@ -14,6 +14,7 @@ #include "nouveau/nouveau_winsys.h" #include "nouveau/nouveau_gldefs.h" #include "nouveau/nouveau_stateobj.h" +#include "nouveau/nouveau_context.h" #include "nv50_screen.h" #include "nv50_program.h" @@ -64,10 +65,22 @@ struct nv50_rasterizer_stateobj { }; struct nv50_sampler_stateobj { - bool normalized; + boolean normalized; unsigned tsc[8]; }; +static INLINE unsigned +get_tile_height(uint32_t tile_mode) +{ + return 1 << ((tile_mode & 0xf) + 2); +} + +static INLINE unsigned +get_tile_depth(uint32_t tile_mode) +{ + return 1 << (tile_mode >> 4); +} + struct nv50_miptree_level { int *image_offset; unsigned pitch; @@ -120,6 +133,7 @@ struct nv50_state { struct nouveau_stateobj *vtxfmt; struct nouveau_stateobj *vtxbuf; struct nouveau_stateobj *vtxattr; + unsigned vtxelt_nr; }; struct nv50_context { @@ -152,6 +166,8 @@ struct nv50_context { unsigned sampler_nr; struct nv50_miptree *miptree[PIPE_MAX_SAMPLERS]; unsigned miptree_nr; + + uint16_t vbo_fifo; }; static INLINE struct nv50_context * @@ -192,13 +208,27 @@ extern void nv50_clear(struct pipe_context *pipe, unsigned buffers, extern void nv50_vertprog_validate(struct nv50_context *nv50); extern void nv50_fragprog_validate(struct nv50_context *nv50); extern void nv50_linkage_validate(struct nv50_context *nv50); -extern void nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p); +extern void nv50_program_destroy(struct nv50_context *nv50, + struct nv50_program *p); /* nv50_state_validate.c */ extern boolean nv50_state_validate(struct nv50_context *nv50); extern void nv50_state_flush_notify(struct nouveau_channel *chan); +extern void nv50_so_init_sifc(struct nv50_context *nv50, + struct nouveau_stateobj *so, + struct nouveau_bo *bo, unsigned reloc, + unsigned size); + /* nv50_tex.c */ extern void nv50_tex_validate(struct nv50_context *); +/* nv50_transfer.c */ +extern void +nv50_upload_sifc(struct nv50_context *nv50, + struct nouveau_bo *bo, unsigned dst_offset, unsigned reloc, + unsigned dst_format, int dst_w, int dst_h, int dst_pitch, + void *src, unsigned src_format, int src_pitch, + int x, int y, int w, int h, int cpp); + #endif diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c index 93479a0314..40ee665999 100644 --- a/src/gallium/drivers/nv50/nv50_miptree.c +++ b/src/gallium/drivers/nv50/nv50_miptree.c @@ -26,15 +26,44 @@ #include "nv50_context.h" +/* The restrictions in tile mode selection probably aren't necessary. */ +static INLINE uint32_t +get_tile_mode(unsigned ny, unsigned d) +{ + uint32_t tile_mode = 0x00; + + if (ny > 32) tile_mode = 0x04; /* height 64 tiles */ + else + if (ny > 16) tile_mode = 0x03; /* height 32 tiles */ + else + if (ny > 8) tile_mode = 0x02; /* height 16 tiles */ + else + if (ny > 4) tile_mode = 0x01; /* height 8 tiles */ + + if (d == 1) + return tile_mode; + else + if (tile_mode > 0x02) + tile_mode = 0x02; + + if (d > 16 && tile_mode < 0x02) + return tile_mode | 0x50; /* depth 32 tiles */ + if (d > 8) return tile_mode | 0x40; /* depth 16 tiles */ + if (d > 4) return tile_mode | 0x30; /* depth 8 tiles */ + if (d > 2) return tile_mode | 0x20; /* depth 4 tiles */ + + return tile_mode | 0x10; +} + static struct pipe_texture * nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp) { struct nouveau_device *dev = nouveau_screen(pscreen)->device; struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree); struct pipe_texture *pt = &mt->base.base; - unsigned width = tmp->width[0], height = tmp->height[0]; - unsigned depth = tmp->depth[0]; - uint32_t tile_mode, tile_flags, tile_h; + unsigned width = tmp->width0, height = tmp->height0; + unsigned depth = tmp->depth0, image_alignment; + uint32_t tile_flags; int ret, i, l; *pt = *tmp; @@ -57,62 +86,44 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp) break; } - if (pt->height[0] > 32) tile_mode = 4; - else if (pt->height[0] > 16) tile_mode = 3; - else if (pt->height[0] > 8) tile_mode = 2; - else if (pt->height[0] > 4) tile_mode = 1; - else tile_mode = 0; - tile_h = 1 << (tile_mode + 2); - - switch (pt->target) { - case PIPE_TEXTURE_3D: - mt->image_nr = pt->depth[0]; - break; - case PIPE_TEXTURE_CUBE: - mt->image_nr = 6; - break; - default: - mt->image_nr = 1; - break; - } + /* XXX: texture arrays */ + mt->image_nr = (pt->target == PIPE_TEXTURE_CUBE) ? 6 : 1; for (l = 0; l <= pt->last_level; l++) { struct nv50_miptree_level *lvl = &mt->level[l]; - - pt->width[l] = width; - pt->height[l] = height; - pt->depth[l] = depth; - pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); - pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + unsigned nblocksy = pf_get_nblocksy(pt->format, height); lvl->image_offset = CALLOC(mt->image_nr, sizeof(int)); - lvl->pitch = align(pt->width[l] * pt->block.size, 64); - lvl->tile_mode = tile_mode; - - width = MAX2(1, width >> 1); - height = MAX2(1, height >> 1); - depth = MAX2(1, depth >> 1); + lvl->pitch = align(pf_get_stride(pt->format, width), 64); + lvl->tile_mode = get_tile_mode(nblocksy, depth); - if (tile_mode && height <= (tile_h >> 1)) { - tile_mode--; - tile_h >>= 1; - } + width = u_minify(width, 1); + height = u_minify(height, 1); + depth = u_minify(depth, 1); } + image_alignment = get_tile_height(mt->level[0].tile_mode) * 64; + image_alignment *= get_tile_depth(mt->level[0].tile_mode); + + /* NOTE the distinction between arrays of mip-mapped 2D textures and + * mip-mapped 3D textures. We can't use image_nr == depth for 3D mip. + */ for (i = 0; i < mt->image_nr; i++) { for (l = 0; l <= pt->last_level; l++) { struct nv50_miptree_level *lvl = &mt->level[l]; int size; - tile_h = 1 << (lvl->tile_mode + 2); + unsigned tile_h = get_tile_height(lvl->tile_mode); + unsigned tile_d = get_tile_depth(lvl->tile_mode); - size = align(pt->width[l], 8) * pt->block.size; - size = align(size, 64); - size *= align(pt->height[l], tile_h); + size = lvl->pitch; + size *= align(pf_get_nblocksy(pt->format, u_minify(pt->height0, l)), tile_h); + size *= align(u_minify(pt->depth0, l), tile_d); lvl->image_offset[i] = mt->total_size; mt->total_size += size; } + mt->total_size = align(mt->total_size, image_alignment); } ret = nouveau_bo_new_tile(dev, NOUVEAU_BO_VRAM, 256, mt->total_size, @@ -135,7 +146,7 @@ nv50_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt, /* Only supports 2D, non-mipmapped textures for the moment */ if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 || - pt->depth[0] != 1) + pt->depth0 != 1) return NULL; mt = CALLOC_STRUCT(nv50_miptree); @@ -186,8 +197,8 @@ nv50_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt, return NULL; pipe_texture_reference(&ps->texture, pt); ps->format = pt->format; - ps->width = pt->width[level]; - ps->height = pt->height[level]; + ps->width = u_minify(pt->width0, level); + ps->height = u_minify(pt->height0, level); ps->usage = flags; pipe_reference_init(&ps->reference, 1); ps->face = face; diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index eb90d5e66f..f0fe7e6168 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -31,9 +31,12 @@ #include "nv50_context.h" -#define NV50_SU_MAX_TEMP 64 +#define NV50_SU_MAX_TEMP 127 +#define NV50_SU_MAX_ADDR 4 //#define NV50_PROGRAM_DUMP +/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */ + /* ARL - gallium craps itself on progs/vp/arl.txt * * MSB - Like MAD, but MUL+SUB @@ -79,22 +82,32 @@ struct nv50_reg { P_ATTR, P_RESULT, P_CONST, - P_IMMD + P_IMMD, + P_ADDR } type; int index; int hw; - int neg; + int mod; int rhw; /* result hw for FP outputs, or interpolant index */ int acc; /* instruction where this reg is last read (first insn == 1) */ }; +#define NV50_MOD_NEG 1 +#define NV50_MOD_ABS 2 +#define NV50_MOD_SAT 4 + +/* arbitrary limits */ +#define MAX_IF_DEPTH 4 +#define MAX_LOOP_DEPTH 4 + struct nv50_pc { struct nv50_program *p; /* hw resources */ struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; + struct nv50_reg r_addr[NV50_SU_MAX_ADDR]; /* tgsi resources */ struct nv50_reg *temp; @@ -108,6 +121,8 @@ struct nv50_pc { struct nv50_reg *immd; float *immd_buf; int immd_nr; + struct nv50_reg **addr; + int addr_nr; struct nv50_reg *temp_temp[16]; unsigned temp_temp_nr; @@ -121,6 +136,13 @@ struct nv50_pc { struct nv50_reg *iv_p; struct nv50_reg *iv_c; + struct nv50_program_exec *if_cond; + struct nv50_program_exec *if_insn[MAX_IF_DEPTH]; + struct nv50_program_exec *br_join[MAX_IF_DEPTH]; + struct nv50_program_exec *br_loop[MAX_LOOP_DEPTH]; /* for BRK branch */ + int if_lvl, loop_lvl; + unsigned loop_pos[MAX_LOOP_DEPTH]; + /* current instruction and total number of insns */ unsigned insn_cur; unsigned insn_nr; @@ -134,7 +156,7 @@ ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw) reg->type = type; reg->index = index; reg->hw = hw; - reg->neg = 0; + reg->mod = 0; reg->rhw = -1; reg->acc = 0; } @@ -148,6 +170,17 @@ popcnt4(uint32_t val) } static void +terminate_mbb(struct nv50_pc *pc) +{ + int i; + + /* remove records of temporary address register values */ + for (i = 0; i < NV50_SU_MAX_ADDR; ++i) + if (pc->r_addr[i].index < 0) + pc->r_addr[i].rhw = -1; +} + +static void alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) { int i = 0; @@ -196,6 +229,10 @@ alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) assert(0); } +/* XXX: For shaders that aren't executed linearly (e.g. shaders that + * contain loops), we need to assign all hw regs to TGSI TEMPs early, + * lest we risk temp_temps overwriting regs alloc'd "later". + */ static struct nv50_reg * alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) { @@ -419,14 +456,20 @@ set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) } alloc_reg(pc, dst); + if (dst->hw > 63) + set_long(pc, e); e->inst[0] |= (dst->hw << 2); } static INLINE void set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) { + unsigned val; float f = pc->immd_buf[imm->hw]; - unsigned val = fui(imm->neg ? -f : f); + + if (imm->mod & NV50_MOD_ABS) + f = fabsf(f); + val = fui((imm->mod & NV50_MOD_NEG) ? -f : f); set_long(pc, e); /*XXX: can't be predicated - bits overlap.. catch cases where both @@ -439,9 +482,96 @@ set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) e->inst[1] |= (val >> 6) << 2; } +static INLINE void +set_addr(struct nv50_program_exec *e, struct nv50_reg *a) +{ + assert(!(e->inst[0] & 0x0c000000)); + assert(!(e->inst[1] & 0x00000004)); + + e->inst[0] |= (a->hw & 3) << 26; + e->inst[1] |= (a->hw >> 2) << 2; +} + +static void +emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst, + struct nv50_reg *src0, uint16_t src1_val) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] = 0xd0000000 | (src1_val << 9); + e->inst[1] = 0x20000000; + set_long(pc, e); + e->inst[0] |= dst->hw << 2; + if (src0) /* otherwise will add to $a0, which is always 0 */ + set_addr(e, src0); + + emit(pc, e); +} + +static struct nv50_reg * +alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref) +{ + int i; + struct nv50_reg *a_tgsi = NULL, *a = NULL; + + if (!ref) { + /* allocate for TGSI address reg */ + for (i = 0; i < NV50_SU_MAX_ADDR; ++i) { + if (pc->r_addr[i].index >= 0) + continue; + if (pc->r_addr[i].rhw >= 0 && + pc->r_addr[i].acc == pc->insn_cur) + continue; + + pc->r_addr[i].rhw = -1; + pc->r_addr[i].index = i; + return &pc->r_addr[i]; + } + assert(0); + return NULL; + } + + /* Allocate and set an address reg so we can access 'ref'. + * + * If and r_addr has index < 0, it is not reserved for TGSI, + * and index will be the negative of the TGSI addr index the + * value in rhw is relative to, or -256 if rhw is an offset + * from 0. If rhw < 0, the reg has not been initialized. + */ + for (i = NV50_SU_MAX_ADDR - 1; i >= 0; --i) { + if (pc->r_addr[i].index >= 0) /* occupied for TGSI */ + continue; + if (pc->r_addr[i].rhw < 0) { /* unused */ + a = &pc->r_addr[i]; + continue; + } + if (!a && pc->r_addr[i].acc != pc->insn_cur) + a = &pc->r_addr[i]; + + if (ref->hw - pc->r_addr[i].rhw >= 128) + continue; + + if ((ref->acc >= 0 && pc->r_addr[i].index == -256) || + (ref->acc < 0 && -pc->r_addr[i].index == ref->index)) { + pc->r_addr[i].acc = pc->insn_cur; + return &pc->r_addr[i]; + } + } + assert(a); + + if (ref->acc < 0) + a_tgsi = pc->addr[ref->index]; + + emit_add_addr_imm(pc, a, a_tgsi, (ref->hw & ~0x7f) * 4); + + a->rhw = ref->hw & ~0x7f; + a->acc = pc->insn_cur; + a->index = a_tgsi ? -ref->index : -256; + return a; +} #define INTERP_LINEAR 0 -#define INTERP_FLAT 1 +#define INTERP_FLAT 1 #define INTERP_PERSPECTIVE 2 #define INTERP_CENTROID 4 @@ -479,10 +609,18 @@ set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, { set_long(pc, e); - e->param.index = src->hw; + e->param.index = src->hw & 127; e->param.shift = s; e->param.mask = m << (s % 32); + if (src->hw > 127) + set_addr(e, alloc_addr(pc, src)); + else + if (src->acc < 0) { + assert(src->type == P_CONST); + set_addr(e, pc->addr[src->index]); + } + e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22); } @@ -491,11 +629,13 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) { struct nv50_program_exec *e = exec(pc); - e->inst[0] |= 0x10000000; + e->inst[0] = 0x10000000; + if (!pc->allow32) + set_long(pc, e); set_dst(pc, dst, e); - if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) { + if (!is_long(e) && src->type == P_IMMD) { set_immd(pc, src, e); /*XXX: 32-bit, but steals part of "half" reg space - need to * catch and handle this case if/when we do half-regs @@ -512,6 +652,8 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) } alloc_reg(pc, src); + if (src->hw > 63) + set_long(pc, e); e->inst[0] |= (src->hw << 9); } @@ -559,6 +701,24 @@ check_swap_src_0_1(struct nv50_pc *pc, } static void +set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src, + struct nv50_program_exec *e) +{ + struct nv50_reg *temp; + + if (src->type != P_TEMP) { + temp = temp_temp(pc); + emit_mov(pc, temp, src); + src = temp; + } + + alloc_reg(pc, src); + if (src->hw > 63) + set_long(pc, e); + e->inst[0] |= (src->hw << 9); +} + +static void set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) { if (src->type == P_ATTR) { @@ -573,6 +733,8 @@ set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) } alloc_reg(pc, src); + if (src->hw > 63) + set_long(pc, e); e->inst[0] |= (src->hw << 9); } @@ -599,7 +761,9 @@ set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) } alloc_reg(pc, src); - e->inst[0] |= (src->hw << 16); + if (src->hw > 63) + set_long(pc, e); + e->inst[0] |= ((src->hw & 127) << 16); } static void @@ -627,7 +791,7 @@ set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) } alloc_reg(pc, src); - e->inst[1] |= (src->hw << 14); + e->inst[1] |= ((src->hw & 127) << 14); } static void @@ -645,12 +809,12 @@ emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, set_dst(pc, dst, e); set_src_0(pc, src0, e); if (src1->type == P_IMMD && !is_long(e)) { - if (src0->neg) + if (src0->mod & NV50_MOD_NEG) e->inst[0] |= 0x00008000; set_immd(pc, src1, e); } else { set_src_1(pc, src1, e); - if (src0->neg ^ src1->neg) { + if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) { if (is_long(e)) e->inst[1] |= 0x08000000; else @@ -667,13 +831,15 @@ emit_add(struct nv50_pc *pc, struct nv50_reg *dst, { struct nv50_program_exec *e = exec(pc); - e->inst[0] |= 0xb0000000; + e->inst[0] = 0xb0000000; + alloc_reg(pc, src1); check_swap_src_0_1(pc, &src0, &src1); - if (!pc->allow32 || src0->neg || src1->neg) { + if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) { set_long(pc, e); - e->inst[1] |= (src0->neg << 26) | (src1->neg << 27); + e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) | + ((src1->mod & NV50_MOD_NEG) << 27); } set_dst(pc, dst, e); @@ -690,6 +856,22 @@ emit_add(struct nv50_pc *pc, struct nv50_reg *dst, } static void +emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, + uint8_t s) +{ + struct nv50_program_exec *e = exec(pc); + + set_long(pc, e); + e->inst[1] |= 0xc0000000; + + e->inst[0] |= dst->hw << 2; + e->inst[0] |= s << 16; /* shift left */ + set_src_0_restricted(pc, src, e); + + emit(pc, e); +} + +static void emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, struct nv50_reg *src0, struct nv50_reg *src1) { @@ -704,6 +886,11 @@ emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, set_src_0(pc, src0, e); set_src_1(pc, src1, e); + if (src0->mod & NV50_MOD_ABS) + e->inst[1] |= 0x00100000; + if (src1->mod & NV50_MOD_ABS) + e->inst[1] |= 0x00080000; + emit(pc, e); } @@ -711,9 +898,47 @@ static INLINE void emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, struct nv50_reg *src1) { - src1->neg ^= 1; + assert(src0 != src1); + src1->mod ^= NV50_MOD_NEG; emit_add(pc, dst, src0, src1); - src1->neg ^= 1; + src1->mod ^= NV50_MOD_NEG; +} + +static void +emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, + struct nv50_reg *src1, unsigned op) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] = 0xd0000000; + set_long(pc, e); + + check_swap_src_0_1(pc, &src0, &src1); + set_dst(pc, dst, e); + set_src_0(pc, src0, e); + + if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR && + op != TGSI_OPCODE_XOR) + assert(!"invalid bit op"); + + if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) { + set_immd(pc, src1, e); + if (op == TGSI_OPCODE_OR) + e->inst[0] |= 0x0100; + else + if (op == TGSI_OPCODE_XOR) + e->inst[0] |= 0x8000; + } else { + set_src_1(pc, src1, e); + e->inst[1] |= 0x04000000; /* 32 bit */ + if (op == TGSI_OPCODE_OR) + e->inst[1] |= 0x4000; + else + if (op == TGSI_OPCODE_XOR) + e->inst[1] |= 0x8000; + } + + emit(pc, e); } static void @@ -730,9 +955,9 @@ emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, set_src_1(pc, src1, e); set_src_2(pc, src2, e); - if (src0->neg ^ src1->neg) + if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) e->inst[1] |= 0x04000000; - if (src2->neg) + if (src2->mod & NV50_MOD_NEG) e->inst[1] |= 0x08000000; emit(pc, e); @@ -742,9 +967,10 @@ static INLINE void emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, struct nv50_reg *src1, struct nv50_reg *src2) { - src2->neg ^= 1; + assert(src2 != src0 && src2 != src1); + src2->mod ^= NV50_MOD_NEG; emit_mad(pc, dst, src0, src1, src2); - src2->neg ^= 1; + src2->mod ^= NV50_MOD_NEG; } static void @@ -760,7 +986,11 @@ emit_flop(struct nv50_pc *pc, unsigned sub, } set_dst(pc, dst, e); - set_src_0(pc, src, e); + + if (sub == 0 || sub == 2) + set_src_0_restricted(pc, src, e); + else + set_src_0(pc, src, e); emit(pc, e); } @@ -802,15 +1032,15 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) #define CVTOP_SAT 0x08 #define CVTOP_ABS 0x10 -/* 0x04 == 32 bit */ +/* 0x04 == 32 bit dst */ /* 0x40 == dst is float */ /* 0x80 == src is float */ #define CVT_F32_F32 0xc4 #define CVT_F32_S32 0x44 -#define CVT_F32_U32 0x64 #define CVT_S32_F32 0x8c #define CVT_S32_S32 0x0c -#define CVT_F32_F32_ROP 0xcc +#define CVT_NEG 0x20 +#define CVT_RI 0x08 static void emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, @@ -822,7 +1052,7 @@ emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, set_long(pc, e); e->inst[0] |= 0xa0000000; - e->inst[1] |= 0x00004000; + e->inst[1] |= 0x00004000; /* 32 bit src */ e->inst[1] |= (cvn << 16); e->inst[1] |= (fmt << 24); set_src_0(pc, src, e); @@ -890,6 +1120,7 @@ emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp, set_src_1(pc, src1, e); emit(pc, e); + pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */ /* cvt.f32.u32/s32 (?) if we didn't only write the predicate */ if (rdst) @@ -917,7 +1148,7 @@ map_tgsi_setop_cc(unsigned op) static INLINE void emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) { - emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP); + emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI); } static void @@ -1000,20 +1231,10 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, FREE(one); } -static void +static INLINE void emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) { - struct nv50_program_exec *e = exec(pc); - - set_long(pc, e); - e->inst[0] |= 0xa0000000; /* delta */ - e->inst[1] |= (7 << 29); /* delta */ - e->inst[1] |= 0x04000000; /* negate arg0? probably not */ - e->inst[1] |= (1 << 14); /* src .f32 */ - set_dst(pc, dst, e); - set_src_0(pc, src, e); - - emit(pc, e); + emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG); } static void @@ -1021,30 +1242,52 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src) { struct nv50_program_exec *e; const int r_pred = 1; + unsigned cvn = CVT_F32_F32; - /* Sets predicate reg ? */ - e = exec(pc); - e->inst[0] = 0xa00001fd; - e->inst[1] = 0xc4014788; - set_src_0(pc, src, e); - set_pred_wr(pc, 1, r_pred, e); - if (src->neg) - e->inst[1] |= 0x20000000; - emit(pc, e); + if (src->mod & NV50_MOD_NEG) + cvn |= CVT_NEG; + /* write predicate reg */ + emit_cvt(pc, NULL, src, r_pred, CVTOP_RN, cvn); - /* This is probably KILP */ + /* conditional discard */ e = exec(pc); - e->inst[0] = 0x000001fe; + e->inst[0] = 0x00000002; set_long(pc, e); - set_pred(pc, 1 /* LT? */, r_pred, e); + set_pred(pc, 0x1 /* LT */, r_pred, e); emit(pc, e); } static void +load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4], + struct nv50_reg **src, boolean proj) +{ + int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod }; + + src[0]->mod |= NV50_MOD_ABS; + src[1]->mod |= NV50_MOD_ABS; + src[2]->mod |= NV50_MOD_ABS; + + emit_minmax(pc, 4, t[2], src[0], src[1]); + emit_minmax(pc, 4, t[2], src[2], t[2]); + + src[0]->mod = mod[0]; + src[1]->mod = mod[1]; + src[2]->mod = mod[2]; + + if (proj && 0 /* looks more correct without this */) + emit_mul(pc, t[2], t[2], src[3]); + emit_flop(pc, 0, t[2], t[2]); + + emit_mul(pc, t[0], src[0], t[2]); + emit_mul(pc, t[1], src[1], t[2]); + emit_mul(pc, t[2], src[2], t[2]); +} + +static void emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, struct nv50_reg **src, unsigned unit, unsigned type, boolean proj) { - struct nv50_reg *temp, *t[4]; + struct nv50_reg *t[4]; struct nv50_program_exec *e; unsigned c, mode, dim; @@ -1073,6 +1316,9 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, /* some cards need t[0]'s hw index to be a multiple of 4 */ alloc_temp4(pc, t, 0); + if (type == TGSI_TEXTURE_CUBE) { + load_cube_tex_coords(pc, t, src, proj); + } else if (proj) { if (src[0]->type == P_TEMP && src[0]->rhw != -1) { mode = pc->interp_mode[src[0]->index]; @@ -1097,17 +1343,8 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, */ } } else { - if (type == TGSI_TEXTURE_CUBE) { - temp = temp_temp(pc); - emit_minmax(pc, 4, temp, src[0], src[1]); - emit_minmax(pc, 4, temp, temp, src[2]); - emit_flop(pc, 0, temp, temp); - for (c = 0; c < 3; c++) - emit_mul(pc, t[c], src[c], temp); - } else { - for (c = 0; c < dim; c++) - emit_mov(pc, t[c], src[c]); - } + for (c = 0; c < dim; c++) + emit_mov(pc, t[c], src[c]); } e = exec(pc); @@ -1120,19 +1357,22 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, if (dim == 2) e->inst[0] |= 0x00400000; else - if (dim == 3) + if (dim == 3) { e->inst[0] |= 0x00800000; + if (type == TGSI_TEXTURE_CUBE) + e->inst[0] |= 0x08000000; + } e->inst[0] |= (mask & 0x3) << 25; e->inst[1] |= (mask & 0xc) << 12; emit(pc, e); - #if 1 - if (mask & 1) emit_mov(pc, dst[0], t[0]); - if (mask & 2) emit_mov(pc, dst[1], t[1]); - if (mask & 4) emit_mov(pc, dst[2], t[2]); - if (mask & 8) emit_mov(pc, dst[3], t[3]); + c = 0; + if (mask & 1) emit_mov(pc, dst[0], t[c++]); + if (mask & 2) emit_mov(pc, dst[1], t[c++]); + if (mask & 4) emit_mov(pc, dst[2], t[c++]); + if (mask & 8) emit_mov(pc, dst[3], t[c]); free_temp4(pc, t); #else @@ -1149,6 +1389,81 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, } static void +emit_branch(struct nv50_pc *pc, int pred, unsigned cc, + struct nv50_program_exec **join) +{ + struct nv50_program_exec *e = exec(pc); + + if (join) { + set_long(pc, e); + e->inst[0] |= 0xa0000002; + emit(pc, e); + *join = e; + e = exec(pc); + } + + set_long(pc, e); + e->inst[0] |= 0x10000002; + if (pred >= 0) + set_pred(pc, cc, pred, e); + emit(pc, e); +} + +static void +emit_nop(struct nv50_pc *pc) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] = 0xf0000000; + set_long(pc, e); + e->inst[1] = 0xe0000000; + emit(pc, e); +} + +static void +emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + assert(src->type == P_TEMP); + + e->inst[0] = 0xc0140000; + e->inst[1] = 0x89800000; + set_long(pc, e); + set_dst(pc, dst, e); + set_src_0(pc, src, e); + set_src_2(pc, src, e); + + emit(pc, e); +} + +static void +emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_reg *r = src; + struct nv50_program_exec *e = exec(pc); + + assert(src->type == P_TEMP); + + if (!(src->mod & NV50_MOD_NEG)) { /* ! double negation */ + r = alloc_temp(pc, NULL); + emit_neg(pc, r, src); + } + + e->inst[0] = 0xc0150000; + e->inst[1] = 0x8a400000; + set_long(pc, e); + set_dst(pc, dst, e); + set_src_0(pc, r, e); + set_src_2(pc, r, e); + + if (r != src) + free_temp(pc, r); + + emit(pc, e); +} + +static void convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e) { unsigned q = 0, m = ~0; @@ -1196,10 +1511,14 @@ convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e) e->inst[1] |= q; } +/* Some operations support an optional negation flag. */ static boolean negate_supported(const struct tgsi_full_instruction *insn, int i) { + int s; + switch (insn->Instruction.Opcode) { + case TGSI_OPCODE_DDY: case TGSI_OPCODE_DP3: case TGSI_OPCODE_DP4: case TGSI_OPCODE_MUL: @@ -1207,19 +1526,36 @@ negate_supported(const struct tgsi_full_instruction *insn, int i) case TGSI_OPCODE_ADD: case TGSI_OPCODE_SUB: case TGSI_OPCODE_MAD: - return TRUE; + break; case TGSI_OPCODE_POW: - return (i == 1) ? TRUE : FALSE; + if (i == 1) + break; + return FALSE; default: return FALSE; } + + /* Watch out for possible multiple uses of an nv50_reg, we + * can't use nv50_reg::neg in these cases. + */ + for (s = 0; s < insn->Instruction.NumSrcRegs; ++s) { + if (s == i) + continue; + if ((insn->Src[s].Register.Index == + insn->Src[i].Register.Index) && + (insn->Src[s].Register.File == + insn->Src[i].Register.File)) + return FALSE; + } + + return TRUE; } /* Return a read mask for source registers deduced from opcode & write mask. */ static unsigned nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c) { - unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask; + unsigned x, mask = insn->Dst[0].Register.WriteMask; switch (insn->Instruction.Opcode) { case TGSI_OPCODE_COS: @@ -1245,10 +1581,10 @@ nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c) case TGSI_OPCODE_TEX: case TGSI_OPCODE_TXP: { - const struct tgsi_instruction_ext_texture *tex; + const struct tgsi_instruction_texture *tex; - assert(insn->Instruction.Extended); - tex = &insn->InstructionExtTexture; + assert(insn->Instruction.Texture); + tex = &insn->Texture; mask = 0x7; if (insn->Instruction.Opcode == TGSI_OPCODE_TXP) @@ -1282,11 +1618,21 @@ nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c) static struct nv50_reg * tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst) { - switch (dst->DstRegister.File) { + switch (dst->Register.File) { case TGSI_FILE_TEMPORARY: - return &pc->temp[dst->DstRegister.Index * 4 + c]; + return &pc->temp[dst->Register.Index * 4 + c]; case TGSI_FILE_OUTPUT: - return &pc->result[dst->DstRegister.Index * 4 + c]; + return &pc->result[dst->Register.Index * 4 + c]; + case TGSI_FILE_ADDRESS: + { + struct nv50_reg *r = pc->addr[dst->Register.Index * 4 + c]; + if (!r) { + r = alloc_addr(pc, NULL); + pc->addr[dst->Register.Index * 4 + c] = r; + } + assert(r); + return r; + } case TGSI_FILE_NULL: return NULL; default: @@ -1302,43 +1648,56 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, { struct nv50_reg *r = NULL; struct nv50_reg *temp; - unsigned sgn, c; + unsigned sgn, c, swz; + + if (src->Register.File != TGSI_FILE_CONSTANT) + assert(!src->Register.Indirect); sgn = tgsi_util_get_full_src_register_sign_mode(src, chan); - c = tgsi_util_get_full_src_register_extswizzle(src, chan); + c = tgsi_util_get_full_src_register_swizzle(src, chan); switch (c) { - case TGSI_EXTSWIZZLE_X: - case TGSI_EXTSWIZZLE_Y: - case TGSI_EXTSWIZZLE_Z: - case TGSI_EXTSWIZZLE_W: - switch (src->SrcRegister.File) { + case TGSI_SWIZZLE_X: + case TGSI_SWIZZLE_Y: + case TGSI_SWIZZLE_Z: + case TGSI_SWIZZLE_W: + switch (src->Register.File) { case TGSI_FILE_INPUT: - r = &pc->attr[src->SrcRegister.Index * 4 + c]; + r = &pc->attr[src->Register.Index * 4 + c]; break; case TGSI_FILE_TEMPORARY: - r = &pc->temp[src->SrcRegister.Index * 4 + c]; + r = &pc->temp[src->Register.Index * 4 + c]; break; case TGSI_FILE_CONSTANT: - r = &pc->param[src->SrcRegister.Index * 4 + c]; + if (!src->Register.Indirect) { + r = &pc->param[src->Register.Index * 4 + c]; + break; + } + /* Indicate indirection by setting r->acc < 0 and + * use the index field to select the address reg. + */ + r = MALLOC_STRUCT(nv50_reg); + swz = tgsi_util_get_src_register_swizzle( + &src->Indirect, 0); + ctor_reg(r, P_CONST, + src->Indirect.Index * 4 + swz, + src->Register.Index * 4 + c); + r->acc = -1; break; case TGSI_FILE_IMMEDIATE: - r = &pc->immd[src->SrcRegister.Index * 4 + c]; + r = &pc->immd[src->Register.Index * 4 + c]; break; case TGSI_FILE_SAMPLER: break; + case TGSI_FILE_ADDRESS: + r = pc->addr[src->Register.Index * 4 + c]; + assert(r); + break; default: assert(0); break; } break; - case TGSI_EXTSWIZZLE_ZERO: - r = alloc_immd(pc, 0.0); - return r; - case TGSI_EXTSWIZZLE_ONE: - if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET) - return alloc_immd(pc, -1.0); - return alloc_immd(pc, 1.0); default: assert(0); break; @@ -1354,7 +1713,7 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, break; case TGSI_UTIL_SIGN_TOGGLE: if (neg) - r->neg = 1; + r->mod = NV50_MOD_NEG; else { temp = temp_temp(pc); emit_neg(pc, temp, r); @@ -1363,11 +1722,7 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, break; case TGSI_UTIL_SIGN_SET: temp = temp_temp(pc); - emit_abs(pc, temp, r); - if (neg) - temp->neg = 1; - else - emit_neg(pc, temp, temp); + emit_cvt(pc, temp, r, -1, CVTOP_ABS, CVT_F32_F32 | CVT_NEG); r = temp; break; default: @@ -1444,6 +1799,55 @@ nv50_tgsi_dst_revdep(unsigned op, int s, int c) } } +static INLINE boolean +has_pred(struct nv50_program_exec *e, unsigned cc) +{ + if (!is_long(e) || is_immd(e)) + return FALSE; + return ((e->inst[1] & 0x780) == (cc << 7)); +} + +/* on ENDIF see if we can do "@p0.neu single_op" instead of: + * join_at ENDIF + * @p0.eq bra ENDIF + * single_op + * ENDIF: nop.join + */ +static boolean +nv50_kill_branch(struct nv50_pc *pc) +{ + int lvl = pc->if_lvl; + + if (pc->if_insn[lvl]->next != pc->p->exec_tail) + return FALSE; + + /* if ccode == 'true', the BRA is from an ELSE and the predicate + * reg may no longer be valid, since we currently always use $p0 + */ + if (has_pred(pc->if_insn[lvl], 0xf)) + return FALSE; + assert(pc->if_insn[lvl] && pc->br_join[lvl]); + + /* We'll use the exec allocated for JOIN_AT (as we can't easily + * update prev's next); if exec_tail is BRK, update the pointer. + */ + if (pc->loop_lvl && pc->br_loop[pc->loop_lvl - 1] == pc->p->exec_tail) + pc->br_loop[pc->loop_lvl - 1] = pc->br_join[lvl]; + + pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */ + + *pc->br_join[lvl] = *pc->p->exec_tail; + + FREE(pc->if_insn[lvl]); + FREE(pc->p->exec_tail); + + pc->p->exec_tail = pc->br_join[lvl]; + pc->p->exec_tail->next = NULL; + set_pred(pc, 0xd, 0, pc->p->exec_tail); + + return TRUE; +} + static boolean nv50_program_tx_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *inst) @@ -1452,29 +1856,29 @@ nv50_program_tx_insn(struct nv50_pc *pc, unsigned mask, sat, unit; int i, c; - mask = inst->FullDstRegisters[0].DstRegister.WriteMask; + mask = inst->Dst[0].Register.WriteMask; sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; memset(src, 0, sizeof(src)); for (c = 0; c < 4; c++) { if ((mask & (1 << c)) && !pc->r_dst[c]) - dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]); + dst[c] = tgsi_dst(pc, c, &inst->Dst[0]); else dst[c] = pc->r_dst[c]; rdst[c] = dst[c]; } for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { - const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i]; + const struct tgsi_full_src_register *fs = &inst->Src[i]; unsigned src_mask; boolean neg_supp; src_mask = nv50_tgsi_src_mask(inst, i); neg_supp = negate_supported(inst, i); - if (fs->SrcRegister.File == TGSI_FILE_SAMPLER) - unit = fs->SrcRegister.Index; + if (fs->Register.File == TGSI_FILE_SAMPLER) + unit = fs->Register.Index; for (c = 0; c < 4; c++) if (src_mask & (1 << c)) @@ -1491,7 +1895,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, for (c = 0; c < 4; c++) { if (!(mask & (1 << c)) || dst[c]->type == P_TEMP) continue; - rdst[c] = dst[c]; + /* rdst[c] = dst[c]; */ /* done above */ dst[c] = temp_temp(pc); } } @@ -1513,12 +1917,49 @@ nv50_program_tx_insn(struct nv50_pc *pc, emit_add(pc, dst[c], src[0][c], src[1][c]); } break; + case TGSI_OPCODE_AND: + case TGSI_OPCODE_XOR: + case TGSI_OPCODE_OR: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_bitop2(pc, dst[c], src[0][c], src[1][c], + inst->Instruction.Opcode); + } + break; + case TGSI_OPCODE_ARL: + assert(src[0][0]); + temp = temp_temp(pc); + emit_cvt(pc, temp, src[0][0], -1, CVTOP_FLOOR, CVT_S32_F32); + emit_arl(pc, dst[0], temp, 4); + break; + case TGSI_OPCODE_BGNLOOP: + pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size; + terminate_mbb(pc); + break; + case TGSI_OPCODE_BRK: + emit_branch(pc, -1, 0, NULL); + assert(pc->loop_lvl > 0); + pc->br_loop[pc->loop_lvl - 1] = pc->p->exec_tail; + break; case TGSI_OPCODE_CEIL: for (c = 0; c < 4; c++) { if (!(mask & (1 << c))) continue; emit_cvt(pc, dst[c], src[0][c], -1, - CVTOP_CEIL, CVT_F32_F32); + CVTOP_CEIL, CVT_F32_F32 | CVT_RI); + } + break; + case TGSI_OPCODE_CMP: + pc->allow32 = FALSE; + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_cvt(pc, NULL, src[0][c], 1, CVTOP_RN, CVT_F32_F32); + emit_mov(pc, dst[c], src[1][c]); + set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */ + emit_mov(pc, dst[c], src[2][c]); + set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */ } break; case TGSI_OPCODE_COS: @@ -1533,6 +1974,20 @@ nv50_program_tx_insn(struct nv50_pc *pc, emit_precossin(pc, temp, src[0][0]); emit_flop(pc, 5, brdc, temp); break; + case TGSI_OPCODE_DDX: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_ddx(pc, dst[c], src[0][c]); + } + break; + case TGSI_OPCODE_DDY: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_ddy(pc, dst[c], src[0][c]); + } + break; case TGSI_OPCODE_DP3: emit_mul(pc, temp, src[0][0], src[1][0]); emit_mad(pc, temp, src[0][1], src[1][1], temp); @@ -1560,6 +2015,36 @@ nv50_program_tx_insn(struct nv50_pc *pc, if (mask & (1 << 0)) emit_mov_immdval(pc, dst[0], 1.0f); break; + case TGSI_OPCODE_ELSE: + emit_branch(pc, -1, 0, NULL); + pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; + pc->if_insn[pc->if_lvl++] = pc->p->exec_tail; + terminate_mbb(pc); + break; + case TGSI_OPCODE_ENDIF: + pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; + + /* try to replace branch over 1 insn with a predicated insn */ + if (nv50_kill_branch(pc) == TRUE) + break; + + if (pc->br_join[pc->if_lvl]) { + pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size; + pc->br_join[pc->if_lvl] = NULL; + } + terminate_mbb(pc); + /* emit a NOP as join point, we could set it on the next + * one, but would have to make sure it is long and !immd + */ + emit_nop(pc); + pc->p->exec_tail->inst[1] |= 2; + break; + case TGSI_OPCODE_ENDLOOP: + emit_branch(pc, -1, 0, NULL); + pc->p->exec_tail->param.index = pc->loop_pos[--pc->loop_lvl]; + pc->br_loop[pc->loop_lvl]->param.index = pc->p->exec_size; + terminate_mbb(pc); + break; case TGSI_OPCODE_EX2: emit_preex2(pc, temp, src[0][0]); emit_flop(pc, 6, brdc, temp); @@ -1580,6 +2065,16 @@ nv50_program_tx_insn(struct nv50_pc *pc, emit_sub(pc, dst[c], src[0][c], temp); } break; + case TGSI_OPCODE_IF: + /* emitting a join_at may not be necessary */ + assert(pc->if_lvl < MAX_IF_DEPTH); + /* set_pred_wr(pc, 1, 0, pc->if_cond); */ + emit_cvt(pc, NULL, src[0][0], 0, CVTOP_ABS | CVTOP_RN, + CVT_F32_F32); + emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]); + pc->if_insn[pc->if_lvl++] = pc->p->exec_tail; + terminate_mbb(pc); + break; case TGSI_OPCODE_KIL: emit_kil(pc, src[0][0]); emit_kil(pc, src[0][1]); @@ -1623,7 +2118,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, } break; case TGSI_OPCODE_MOV: - case TGSI_OPCODE_SWZ: for (c = 0; c < 4; c++) { if (!(mask & (1 << c))) continue; @@ -1693,18 +2187,18 @@ nv50_program_tx_insn(struct nv50_pc *pc, break; case TGSI_OPCODE_TEX: emit_tex(pc, dst, mask, src[0], unit, - inst->InstructionExtTexture.Texture, FALSE); + inst->Texture.Texture, FALSE); break; case TGSI_OPCODE_TXP: emit_tex(pc, dst, mask, src[0], unit, - inst->InstructionExtTexture.Texture, TRUE); + inst->Texture.Texture, TRUE); break; case TGSI_OPCODE_TRUNC: for (c = 0; c < 4; c++) { if (!(mask & (1 << c))) continue; emit_cvt(pc, dst[c], src[0][c], -1, - CVTOP_TRUNC, CVT_F32_F32); + CVTOP_TRUNC, CVT_F32_F32 | CVT_RI); } break; case TGSI_OPCODE_XPD: @@ -1742,8 +2236,10 @@ nv50_program_tx_insn(struct nv50_pc *pc, for (c = 0; c < 4; c++) { if (!(mask & (1 << c))) continue; - /* in this case we saturate later */ - if (dst[c]->type == P_TEMP && dst[c]->index < 0) + /* In this case we saturate later, and dst[c] won't + * be another temp_temp (and thus lost), since rdst + * already is TEMP (see above). */ + if (rdst[c]->type == P_TEMP && rdst[c]->index < 0) continue; emit_sat(pc, rdst[c], dst[c]); } @@ -1753,8 +2249,12 @@ nv50_program_tx_insn(struct nv50_pc *pc, for (c = 0; c < 4; c++) { if (!src[i][c]) continue; + src[i][c]->mod = 0; if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD) FREE(src[i][c]); + else + if (src[i][c]->acc < 0 && src[i][c]->type == P_CONST) + FREE(src[i][c]); /* indirect constant */ } } @@ -1770,7 +2270,7 @@ prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn) const struct tgsi_dst_register *dst; unsigned i, c, k, mask; - dst = &insn->FullDstRegisters[0].DstRegister; + dst = &insn->Dst[0].Register; mask = dst->WriteMask; if (dst->File == TGSI_FILE_TEMPORARY) @@ -1788,12 +2288,12 @@ prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn) } for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { - src = &insn->FullSrcRegisters[i]; + src = &insn->Src[i]; - if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) + if (src->Register.File == TGSI_FILE_TEMPORARY) reg = pc->temp; else - if (src->SrcRegister.File == TGSI_FILE_INPUT) + if (src->Register.File == TGSI_FILE_INPUT) reg = pc->attr; else continue; @@ -1803,12 +2303,9 @@ prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn) for (c = 0; c < 4; c++) { if (!(mask & (1 << c))) continue; - k = tgsi_util_get_full_src_register_extswizzle(src, c); + k = tgsi_util_get_full_src_register_swizzle(src, c); - if (k > TGSI_EXTSWIZZLE_W) - continue; - - reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr; + reg[src->Register.Index * 4 + k].acc = pc->insn_nr; } } } @@ -1868,13 +2365,13 @@ static struct nv50_reg * tgsi_broadcast_dst(struct nv50_pc *pc, const struct tgsi_full_dst_register *fd, unsigned mask) { - if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) { - int c = ffs(~mask & fd->DstRegister.WriteMask); + if (fd->Register.File == TGSI_FILE_TEMPORARY) { + int c = ffs(~mask & fd->Register.WriteMask); if (c) return tgsi_dst(pc, c - 1, fd); } else { - int c = ffs(fd->DstRegister.WriteMask) - 1; - if ((1 << c) == fd->DstRegister.WriteMask) + int c = ffs(fd->Register.WriteMask) - 1; + if ((1 << c) == fd->Register.WriteMask) return tgsi_dst(pc, c, fd); } @@ -1888,7 +2385,7 @@ static unsigned nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn, unsigned rdep[4]) { - const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0]; + const struct tgsi_full_dst_register *fd = &insn->Dst[0]; const struct tgsi_full_src_register *fs; unsigned i, deqs = 0; @@ -1899,9 +2396,9 @@ nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn, unsigned chn, mask = nv50_tgsi_src_mask(insn, i); boolean neg_supp = negate_supported(insn, i); - fs = &insn->FullSrcRegisters[i]; - if (fs->SrcRegister.File != fd->DstRegister.File || - fs->SrcRegister.Index != fd->DstRegister.Index) + fs = &insn->Src[i]; + if (fs->Register.File != fd->Register.File || + fs->Register.Index != fd->Register.Index) continue; for (chn = 0; chn < 4; ++chn) { @@ -1909,11 +2406,10 @@ nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn, if (!(mask & (1 << chn))) /* src is not read */ continue; - c = tgsi_util_get_full_src_register_extswizzle(fs, chn); + c = tgsi_util_get_full_src_register_swizzle(fs, chn); s = tgsi_util_get_full_src_register_sign_mode(fs, chn); - if (c > TGSI_EXTSWIZZLE_W || - !(fd->DstRegister.WriteMask & (1 << c))) + if (!(fd->Register.WriteMask & (1 << c))) continue; /* no danger if src is copied to TEMP first */ @@ -1937,7 +2433,7 @@ nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) const struct tgsi_full_dst_register *fd; unsigned i, deqs, rdep[4], m[4]; - fd = &tok->FullInstruction.FullDstRegisters[0]; + fd = &tok->FullInstruction.Dst[0]; deqs = nv50_tgsi_scan_swizzle(&insn, rdep); if (is_scalar_op(insn.Instruction.Opcode)) { @@ -1956,10 +2452,10 @@ nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) for (i = 0; i < 4; ++i) { assert(pc->r_dst[m[i]] == NULL); - insn.FullDstRegisters[0].DstRegister.WriteMask = - fd->DstRegister.WriteMask & (1 << m[i]); + insn.Dst[0].Register.WriteMask = + fd->Register.WriteMask & (1 << m[i]); - if (!insn.FullDstRegisters[0].DstRegister.WriteMask) + if (!insn.Dst[0].Register.WriteMask) continue; if (deqs & (1 << i)) @@ -2009,6 +2505,23 @@ load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg) emit_interp(pc, reg, iv, mode); } +/* The face input is always at v[255] (varying space), with a + * value of 0 for back-facing, and 0xffffffff for front-facing. + */ +static void +load_frontfacing(struct nv50_pc *pc, struct nv50_reg *a) +{ + struct nv50_reg *one = alloc_immd(pc, 1.0f); + + assert(a->rhw == -1); + alloc_reg(pc, a); /* do this before rhw is set */ + a->rhw = 255; + load_interpolant(pc, a); + emit_bitop2(pc, a, a, one, TGSI_OPCODE_AND); + + FREE(one); +} + static boolean nv50_program_tx_prep(struct nv50_pc *pc) { @@ -2040,8 +2553,8 @@ nv50_program_tx_prep(struct nv50_pc *pc) unsigned si, last, first, mode; d = &tp.FullToken.FullDeclaration; - first = d->DeclarationRange.First; - last = d->DeclarationRange.Last; + first = d->Range.First; + last = d->Range.Last; switch (d->Declaration.File) { case TGSI_FILE_TEMPORARY: @@ -2051,8 +2564,8 @@ nv50_program_tx_prep(struct nv50_pc *pc) p->type == PIPE_SHADER_FRAGMENT) break; - si = d->Semantic.SemanticIndex; - switch (d->Semantic.SemanticName) { + si = d->Semantic.Index; + switch (d->Semantic.Name) { case TGSI_SEMANTIC_BCOLOR: p->cfg.two_side[si].hw = first; if (p->cfg.io_nr > first) @@ -2098,8 +2611,8 @@ nv50_program_tx_prep(struct nv50_pc *pc) pc->interp_mode[i] = mode; } break; + case TGSI_FILE_ADDRESS: case TGSI_FILE_CONSTANT: - break; case TGSI_FILE_SAMPLER: break; default: @@ -2130,7 +2643,7 @@ nv50_program_tx_prep(struct nv50_pc *pc) for (i = 0, rid = 0; i < pc->result_nr; ++i) { p->cfg.io[i].hw = rid; - p->cfg.io[i].id_vp = i; + p->cfg.io[i].id = i; for (c = 0; c < 4; ++c) { int n = i * 4 + c; @@ -2153,6 +2666,8 @@ nv50_program_tx_prep(struct nv50_pc *pc) int rid, aid; unsigned n = 0, m = pc->attr_nr - flat_nr; + pc->allow32 = TRUE; + int base = (TGSI_SEMANTIC_POSITION == p->info.input_semantic_name[0]) ? 0 : 1; @@ -2160,14 +2675,12 @@ nv50_program_tx_prep(struct nv50_pc *pc) * the lower hardware IDs, so sort them: */ for (i = 0; i < pc->attr_nr; i++) { - if (pc->interp_mode[i] == INTERP_FLAT) { - p->cfg.io[m].id_vp = i + base; - p->cfg.io[m++].id_fp = i; - } else { + if (pc->interp_mode[i] == INTERP_FLAT) + p->cfg.io[m++].id = i; + else { if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE)) p->cfg.io[n].linear = TRUE; - p->cfg.io[n].id_vp = i + base; - p->cfg.io[n++].id_fp = i; + p->cfg.io[n++].id = i; } } @@ -2179,7 +2692,13 @@ nv50_program_tx_prep(struct nv50_pc *pc) for (n = 0; n < pc->attr_nr; ++n) { p->cfg.io[n].hw = rid = aid; - i = p->cfg.io[n].id_fp; + i = p->cfg.io[n].id; + + if (p->info.input_semantic_name[n] == + TGSI_SEMANTIC_FACE) { + load_frontfacing(pc, &pc->attr[i * 4]); + continue; + } for (c = 0; c < 4; ++c) { if (!pc->attr[i * 4 + c].acc) @@ -2213,8 +2732,8 @@ nv50_program_tx_prep(struct nv50_pc *pc) for (i = 0; i < pc->attr_nr; i++) { ubyte si, sn; - sn = p->info.input_semantic_name[p->cfg.io[i].id_fp]; - si = p->info.input_semantic_index[p->cfg.io[i].id_fp]; + sn = p->info.input_semantic_name[p->cfg.io[i].id]; + si = p->info.input_semantic_index[p->cfg.io[i].id]; if (sn == TGSI_SEMANTIC_COLOR) { p->cfg.two_side[si] = p->cfg.io[i]; @@ -2237,6 +2756,12 @@ nv50_program_tx_prep(struct nv50_pc *pc) pc->result[i].rhw = rid++; if (p->info.writes_z) pc->result[2].rhw = rid; + + p->cfg.high_result = rid; + + /* separate/different colour results for MRTs ? */ + if (pc->result_nr - (p->info.writes_z ? 1 : 0) > 1) + p->cfg.regs[2] |= 1; } if (pc->immd_nr) { @@ -2291,6 +2816,8 @@ ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p) pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1; pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1; pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1; + pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1; + assert(pc->addr_nr <= 2); p->cfg.high_temp = 4; @@ -2359,15 +2886,81 @@ ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p) ctor_reg(&pc->param[rid], P_CONST, i, rid); } + if (pc->addr_nr) { + pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *)); + if (!pc->addr) + return FALSE; + } + for (i = 0; i < NV50_SU_MAX_ADDR; ++i) + ctor_reg(&pc->r_addr[i], P_ADDR, -256, i + 1); + return TRUE; } +static void +nv50_fp_move_results(struct nv50_pc *pc) +{ + struct nv50_reg reg; + unsigned i; + + ctor_reg(®, P_TEMP, -1, -1); + + for (i = 0; i < pc->result_nr * 4; ++i) { + if (pc->result[i].rhw < 0 || pc->result[i].hw < 0) + continue; + if (pc->result[i].rhw != pc->result[i].hw) { + reg.hw = pc->result[i].rhw; + emit_mov(pc, ®, &pc->result[i]); + } + } +} + +static void +nv50_program_fixup_insns(struct nv50_pc *pc) +{ + struct nv50_program_exec *e, **bra_list; + unsigned i, n, pos; + + bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *)); + + /* Collect branch instructions, we need to adjust their offsets + * when converting 32 bit instructions to 64 bit ones + */ + for (n = 0, e = pc->p->exec_head; e; e = e->next) + if (e->param.index >= 0 && !e->param.mask) + bra_list[n++] = e; + + /* last instruction must be long so it can have the exit bit set */ + if (!is_long(pc->p->exec_tail)) + convert_to_long(pc, pc->p->exec_tail); + /* set exit bit */ + pc->p->exec_tail->inst[1] |= 1; + + /* !immd on exit insn simultaneously means !join */ + assert(!is_immd(pc->p->exec_head)); + assert(!is_immd(pc->p->exec_tail)); + + /* Make sure we don't have any single 32 bit instructions. */ + for (e = pc->p->exec_head, pos = 0; e; e = e->next) { + pos += is_long(e) ? 2 : 1; + + if ((pos & 1) && (!e->next || is_long(e->next))) { + for (i = 0; i < n; ++i) + if (bra_list[i]->param.index >= pos) + bra_list[i]->param.index += 1; + convert_to_long(pc, e); + ++pos; + } + } + + FREE(bra_list); +} + static boolean nv50_program_tx(struct nv50_program *p) { struct tgsi_parse_context parse; struct nv50_pc *pc; - unsigned k; boolean ret; pc = CALLOC_STRUCT(nv50_pc); @@ -2405,48 +2998,10 @@ nv50_program_tx(struct nv50_program *p) } } - if (p->type == PIPE_SHADER_FRAGMENT) { - struct nv50_reg out; - ctor_reg(&out, P_TEMP, -1, -1); - - for (k = 0; k < pc->result_nr * 4; k++) { - if (pc->result[k].rhw == -1) - continue; - if (pc->result[k].hw != pc->result[k].rhw) { - out.hw = pc->result[k].rhw; - emit_mov(pc, &out, &pc->result[k]); - } - if (pc->p->cfg.high_result < (pc->result[k].rhw + 1)) - pc->p->cfg.high_result = pc->result[k].rhw + 1; - } - } - - /* look for single half instructions and make them long */ - struct nv50_program_exec *e, *e_prev; - - for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) { - if (!is_long(e)) - k++; - - if (!e->next || is_long(e->next)) { - if (k & 1) - convert_to_long(pc, e); - k = 0; - } - - if (e->next) - e_prev = e; - } - - if (!is_long(pc->p->exec_tail)) { - /* this may occur if moving FP results */ - assert(e_prev && !is_long(e_prev)); - convert_to_long(pc, e_prev); - convert_to_long(pc, pc->p->exec_tail); - } + if (pc->p->type == PIPE_SHADER_FRAGMENT) + nv50_fp_move_results(pc); - assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head)); - pc->p->exec_tail->inst[1] |= 0x00000001; + nv50_program_fixup_insns(pc); p->param_nr = pc->param_nr * 4; p->immd_nr = pc->immd_nr * 4; @@ -2513,7 +3068,7 @@ nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) p->immd_nr, NV50_CB_PMISC); } - assert(p->param_nr <= 128); + assert(p->param_nr <= 512); if (p->param_nr) { unsigned cb; @@ -2534,11 +3089,8 @@ static void nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) { struct nouveau_channel *chan = nv50->screen->base.channel; - struct nouveau_grobj *tesla = nv50->screen->tesla; struct nv50_program_exec *e; - struct nouveau_stateobj *so; - const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR; - unsigned start, count, *up, *ptr; + uint32_t *up, i; boolean upload = FALSE; if (!p->bo) { @@ -2553,21 +3105,37 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) if (!upload) return; - for (e = p->exec_head; e; e = e->next) { + up = MALLOC(p->exec_size * 4); + + for (i = 0, e = p->exec_head; e; e = e->next) { unsigned ei, ci, bs; - if (e->param.index < 0) - continue; - bs = (e->inst[1] >> 22) & 0x07; - assert(bs < 2); - ei = e->param.shift >> 5; - ci = e->param.index; - if (bs == 0) - ci += p->data[bs]->start; + if (e->param.index >= 0 && e->param.mask) { + bs = (e->inst[1] >> 22) & 0x07; + assert(bs < 2); + ei = e->param.shift >> 5; + ci = e->param.index; + if (bs == 0) + ci += p->data[bs]->start; - e->inst[ei] &= ~e->param.mask; - e->inst[ei] |= (ci << e->param.shift); + e->inst[ei] &= ~e->param.mask; + e->inst[ei] |= (ci << e->param.shift); + } else + if (e->param.index >= 0) { + /* zero mask means param is a jump/branch offset */ + assert(!(e->param.index & 1)); + /* seem to be 8 byte steps */ + ei = (e->param.index >> 1) + 0 /* START_ID */; + + e->inst[0] &= 0xf0000fff; + e->inst[0] |= ei << 12; + } + + up[i++] = e->inst[0]; + if (is_long(e)) + up[i++] = e->inst[1]; } + assert(i == p->exec_size); if (p->data[0]) p->data_start[0] = p->data[0]->start; @@ -2580,45 +3148,12 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) NOUVEAU_ERR("0x%08x\n", e->inst[1]); } #endif - - up = ptr = MALLOC(p->exec_size * 4); - for (e = p->exec_head; e; e = e->next) { - *(ptr++) = e->inst[0]; - if (is_long(e)) - *(ptr++) = e->inst[1]; - } - - so = so_new(4,2); - so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3); - so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0); - so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0); - so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4)); - - start = 0; count = p->exec_size; - while (count) { - struct nouveau_channel *chan = nv50->screen->base.channel; - unsigned nr; - - so_emit(chan, so); - - nr = MIN2(count, 2047); - nr = MIN2(chan->pushbuf->remaining, nr); - if (chan->pushbuf->remaining < (nr + 3)) { - FIRE_RING(chan); - continue; - } - - BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); - OUT_RING (chan, (start << 8) | NV50_CB_PUPLOAD); - BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); - OUT_RINGp (chan, up + start, nr); - - start += nr; - count -= nr; - } + nv50_upload_sifc(nv50, p->bo, 0, NOUVEAU_BO_VRAM, + NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144, + up, NV50_2D_SIFC_FORMAT_R8_UNORM, 0, + 0, 0, p->exec_size * 4, 1, 1); FREE(up); - so_ref(NULL, &so); } void @@ -2700,15 +3235,15 @@ nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base) struct nv50_program *vp = nv50->vertprog; unsigned i, c, m = base; - /* XXX: This can't work correctly in all cases yet, we either - * have to create TGSI_SEMANTIC_PNTC or sprite_coord_mode has - * to be per FP input instead of per VP output + /* XXX: this might not work correctly in all cases yet - we'll + * just assume that an FP generic input that is not written in + * the VP is PointCoord. */ memset(pntc, 0, 8 * sizeof(uint32_t)); for (i = 0; i < fp->cfg.io_nr; i++) { uint8_t sn, si; - uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp; + uint8_t j, k = fp->cfg.io[i].id; unsigned n = popcnt4(fp->cfg.io[i].mask); if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) { @@ -2716,10 +3251,16 @@ nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base) continue; } - sn = vp->info.input_semantic_name[j]; - si = vp->info.input_semantic_index[j]; + for (j = 0; j < vp->info.num_outputs; ++j) { + sn = vp->info.output_semantic_name[j]; + si = vp->info.output_semantic_index[j]; - if (j < fp->cfg.io_nr && sn == TGSI_SEMANTIC_GENERIC) { + if (sn == fp->info.input_semantic_name[k] && + si == fp->info.input_semantic_index[k]) + break; + } + + if (j < vp->info.num_outputs) { ubyte mode = nv50->rasterizer->pipe.sprite_coord_mode[si]; @@ -2807,20 +3348,24 @@ nv50_linkage_validate(struct nv50_context *nv50) reg[0] += m - 4; /* adjust FFC0 id */ reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */ - i = 0; - if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION) - i = 1; - for (; i < fp->cfg.io_nr; i++) { - ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp]; - ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp]; - - n = fp->cfg.io[i].id_vp; - if (n >= vp->cfg.io_nr || - vp->info.output_semantic_name[n] != sn || - vp->info.output_semantic_index[n] != si) - vpo = &dummy; - else - vpo = &vp->cfg.io[n]; + for (i = 0; i < fp->cfg.io_nr; i++) { + ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id]; + ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id]; + + /* position must be mapped first */ + assert(i == 0 || sn != TGSI_SEMANTIC_POSITION); + + /* maybe even remove these from cfg.io */ + if (sn == TGSI_SEMANTIC_POSITION || sn == TGSI_SEMANTIC_FACE) + continue; + + /* VP outputs and vp->cfg.io are in the same order */ + for (n = 0; n < vp->info.num_outputs; ++n) { + if (vp->info.output_semantic_name[n] == sn && + vp->info.output_semantic_index[n] == si) + break; + } + vpo = (n < vp->info.num_outputs) ? &vp->cfg.io[n] : &dummy; m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo); } diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h index d78dee083f..255c7c737e 100644 --- a/src/gallium/drivers/nv50/nv50_program.h +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -17,8 +17,7 @@ struct nv50_program_exec { struct nv50_sreg4 { uint8_t hw; - uint8_t id_vp; - uint8_t id_fp; + uint8_t id; /* tgsi index, nv50 needs them sorted: flat ones last */ uint8_t mask; boolean linear; diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c index 3b08e1b89f..e1b2f11239 100644 --- a/src/gallium/drivers/nv50/nv50_screen.c +++ b/src/gallium/drivers/nv50/nv50_screen.c @@ -35,8 +35,14 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen, { if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { switch (format) { + case PIPE_FORMAT_X8R8G8B8_UNORM: case PIPE_FORMAT_A8R8G8B8_UNORM: case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_R16G16B16A16_SNORM: + case PIPE_FORMAT_R16G16B16A16_UNORM: + case PIPE_FORMAT_R32G32B32A32_FLOAT: + case PIPE_FORMAT_R16G16_SNORM: + case PIPE_FORMAT_R16G16_UNORM: return TRUE; default: break; @@ -55,6 +61,9 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen, } else { switch (format) { case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_X8R8G8B8_UNORM: + case PIPE_FORMAT_A8R8G8B8_SRGB: + case PIPE_FORMAT_X8R8G8B8_SRGB: case PIPE_FORMAT_A1R5G5B5_UNORM: case PIPE_FORMAT_A4R4G4B4_UNORM: case PIPE_FORMAT_R5G6B5_UNORM: @@ -66,6 +75,13 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen, case PIPE_FORMAT_DXT1_RGBA: case PIPE_FORMAT_DXT3_RGBA: case PIPE_FORMAT_DXT5_RGBA: + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_R16G16B16A16_SNORM: + case PIPE_FORMAT_R16G16B16A16_UNORM: + case PIPE_FORMAT_R32G32B32A32_FLOAT: + case PIPE_FORMAT_R16G16_SNORM: + case PIPE_FORMAT_R16G16_UNORM: return TRUE; default: break; @@ -216,7 +232,16 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) tesla_class = NV54TCL; break; case 0xa0: - tesla_class = NVA0TCL; + switch (chipset) { + case 0xa0: + case 0xaa: + case 0xac: + tesla_class = NVA0TCL; + break; + default: + tesla_class = 0x8597; + break; + } break; default: NOUVEAU_ERR("Not a known NV50 chipset: NV%02x\n", chipset); @@ -224,12 +249,6 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) return NULL; } - if (tesla_class == 0) { - NOUVEAU_ERR("Unknown G8x chipset: NV%02x\n", chipset); - nv50_screen_destroy(pscreen); - return NULL; - } - ret = nouveau_grobj_alloc(chan, 0xbeef5097, tesla_class, &screen->tesla); if (ret) { @@ -290,6 +309,12 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) so_method(so, screen->tesla, 0x121c, 1); so_data (so, 1); + /* activate all 32 lanes (threads) in a warp */ + so_method(so, screen->tesla, 0x19a0, 1); + so_data (so, 0x2); + so_method(so, screen->tesla, 0x1400, 1); + so_data (so, 0xf); + so_method(so, screen->tesla, 0x13bc, 1); so_data (so, 0x54); /* origin is top left (set to 1 for bottom left) */ @@ -299,7 +324,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) so_data (so, 8); /* constant buffers for immediates and VP/FP parameters */ - ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 128*4*4, + ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, (32 * 4) * 4, &screen->constbuf_misc[0]); if (ret) { nv50_screen_destroy(pscreen); @@ -307,7 +332,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) } for (i = 0; i < 2; i++) { - ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 128*4*4, + ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, (128 * 4) * 4, &screen->constbuf_parm[i]); if (ret) { nv50_screen_destroy(pscreen); @@ -316,8 +341,8 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) } if (nouveau_resource_init(&screen->immd_heap[0], 0, 128) || - nouveau_resource_init(&screen->parm_heap[0], 0, 128) || - nouveau_resource_init(&screen->parm_heap[1], 0, 128)) + nouveau_resource_init(&screen->parm_heap[0], 0, 512) || + nouveau_resource_init(&screen->parm_heap[1], 0, 512)) { NOUVEAU_ERR("Error initialising constant buffers.\n"); nv50_screen_destroy(pscreen); @@ -338,7 +363,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); so_reloc (so, screen->constbuf_misc[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); - so_data (so, (NV50_CB_PMISC << 16) | 0x00000800); + so_data (so, (NV50_CB_PMISC << 16) | 0x00000200); so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1); so_data (so, 0x00000001 | (NV50_CB_PMISC << 12)); so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1); @@ -362,48 +387,31 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1); so_data (so, 0x00000131 | (NV50_CB_PFP << 12)); - /* Texture sampler/image unit setup - we abuse the constant buffer - * upload mechanism for the moment to upload data to the tex config - * blocks. At some point we *may* want to go the NVIDIA way of doing - * things? - */ - ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 32*8*4, &screen->tic); + ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 64*8*4, &screen->tic); if (ret) { nv50_screen_destroy(pscreen); return NULL; } - so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3); - so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | - NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); - so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | - NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); - so_data (so, (NV50_CB_TIC << 16) | 0x0800); so_method(so, screen->tesla, NV50TCL_TIC_ADDRESS_HIGH, 3); so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); - so_data (so, 0x00000800); + so_data (so, 0x000007ff); - ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 32*8*4, &screen->tsc); + ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 64*8*4, &screen->tsc); if (ret) { nv50_screen_destroy(pscreen); return NULL; } - so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3); - so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | - NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); - so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | - NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); - so_data (so, (NV50_CB_TSC << 16) | 0x0800); so_method(so, screen->tesla, NV50TCL_TSC_ADDRESS_HIGH, 3); so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); - so_data (so, 0x00000800); + so_data (so, 0x00000000); /* Vertex array limits - max them out */ diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c index 81fa3e34c5..07318f2394 100644 --- a/src/gallium/drivers/nv50/nv50_state.c +++ b/src/gallium/drivers/nv50/nv50_state.c @@ -146,6 +146,7 @@ nv50_sampler_state_create(struct pipe_context *pipe, (wrap_mode(cso->wrap_r) << 6)); switch (cso->mag_img_filter) { + case PIPE_TEX_FILTER_ANISO: case PIPE_TEX_FILTER_LINEAR: tsc[1] |= NV50TSC_1_1_MAGF_LINEAR; break; @@ -156,6 +157,7 @@ nv50_sampler_state_create(struct pipe_context *pipe, } switch (cso->min_img_filter) { + case PIPE_TEX_FILTER_ANISO: case PIPE_TEX_FILTER_LINEAR: tsc[1] |= NV50TSC_1_1_MINF_LINEAR; break; @@ -183,21 +185,15 @@ nv50_sampler_state_create(struct pipe_context *pipe, else if (cso->max_anisotropy >= 12.0) tsc[0] |= (6 << 20); - else - if (cso->max_anisotropy >= 10.0) - tsc[0] |= (5 << 20); - else - if (cso->max_anisotropy >= 8.0) - tsc[0] |= (4 << 20); - else - if (cso->max_anisotropy >= 6.0) - tsc[0] |= (3 << 20); - else - if (cso->max_anisotropy >= 4.0) - tsc[0] |= (2 << 20); - else - if (cso->max_anisotropy >= 2.0) - tsc[0] |= (1 << 20); + else { + tsc[0] |= (int)(cso->max_anisotropy * 0.5f) << 20; + + if (cso->max_anisotropy >= 4.0) + tsc[1] |= NV50TSC_1_1_UNKN_ANISO_35; + else + if (cso->max_anisotropy >= 2.0) + tsc[1] |= NV50TSC_1_1_UNKN_ANISO_15; + } if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { tsc[0] |= (1 << 8); @@ -652,9 +648,9 @@ nv50_init_state_functions(struct nv50_context *nv50) nv50->pipe.delete_blend_state = nv50_blend_state_delete; nv50->pipe.create_sampler_state = nv50_sampler_state_create; - nv50->pipe.bind_sampler_states = nv50_sampler_state_bind; + nv50->pipe.bind_fragment_sampler_states = nv50_sampler_state_bind; nv50->pipe.delete_sampler_state = nv50_sampler_state_delete; - nv50->pipe.set_sampler_textures = nv50_set_sampler_texture; + nv50->pipe.set_fragment_sampler_textures = nv50_set_sampler_texture; nv50->pipe.create_rasterizer_state = nv50_rasterizer_state_create; nv50->pipe.bind_rasterizer_state = nv50_rasterizer_state_bind; diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c index 5a3559ed18..c871acaab8 100644 --- a/src/gallium/drivers/nv50/nv50_state_validate.c +++ b/src/gallium/drivers/nv50/nv50_state_validate.c @@ -23,6 +23,12 @@ #include "nv50_context.h" #include "nouveau/nouveau_stateobj.h" +#define NV50_CBUF_FORMAT_CASE(n) \ + case PIPE_FORMAT_##n: so_data(so, NV50TCL_RT_FORMAT_##n); break + +#define NV50_ZETA_FORMAT_CASE(n) \ + case PIPE_FORMAT_##n: so_data(so, NV50TCL_ZETA_FORMAT_##n); break + static void nv50_state_validate_fb(struct nv50_context *nv50) { @@ -31,6 +37,15 @@ nv50_state_validate_fb(struct nv50_context *nv50) struct pipe_framebuffer_state *fb = &nv50->framebuffer; unsigned i, w, h, gw = 0; + /* Set nr of active RTs and select RT for each colour output. + * FP result 0 always goes to RT[0], bits 4 - 6 are ignored. + * Ambiguous assignment results in no rendering (no DATA_ERROR). + */ + so_method(so, tesla, 0x121c, 1); + so_data (so, fb->nr_cbufs | + (0 << 4) | (1 << 7) | (2 << 10) | (3 << 13) | + (4 << 16) | (5 << 19) | (6 << 22) | (7 << 25)); + for (i = 0; i < fb->nr_cbufs; i++) { struct pipe_texture *pt = fb->cbufs[i]->texture; struct nouveau_bo *bo = nv50_miptree(pt)->base.bo; @@ -54,12 +69,14 @@ nv50_state_validate_fb(struct nv50_context *nv50) so_reloc (so, bo, fb->cbufs[i]->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0); switch (fb->cbufs[i]->format) { - case PIPE_FORMAT_A8R8G8B8_UNORM: - so_data(so, NV50TCL_RT_FORMAT_A8R8G8B8_UNORM); - break; - case PIPE_FORMAT_R5G6B5_UNORM: - so_data(so, NV50TCL_RT_FORMAT_R5G6B5_UNORM); - break; + NV50_CBUF_FORMAT_CASE(A8R8G8B8_UNORM); + NV50_CBUF_FORMAT_CASE(X8R8G8B8_UNORM); + NV50_CBUF_FORMAT_CASE(R5G6B5_UNORM); + NV50_CBUF_FORMAT_CASE(R16G16B16A16_SNORM); + NV50_CBUF_FORMAT_CASE(R16G16B16A16_UNORM); + NV50_CBUF_FORMAT_CASE(R32G32B32A32_FLOAT); + NV50_CBUF_FORMAT_CASE(R16G16_SNORM); + NV50_CBUF_FORMAT_CASE(R16G16_UNORM); default: NOUVEAU_ERR("AIIII unknown format %s\n", pf_name(fb->cbufs[i]->format)); @@ -93,18 +110,10 @@ nv50_state_validate_fb(struct nv50_context *nv50) so_reloc (so, bo, fb->zsbuf->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0); switch (fb->zsbuf->format) { - case PIPE_FORMAT_Z32_FLOAT: - so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT); - break; - case PIPE_FORMAT_Z24S8_UNORM: - so_data(so, NV50TCL_ZETA_FORMAT_Z24S8_UNORM); - break; - case PIPE_FORMAT_X8Z24_UNORM: - so_data(so, NV50TCL_ZETA_FORMAT_X8Z24_UNORM); - break; - case PIPE_FORMAT_S8Z24_UNORM: - so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM); - break; + NV50_ZETA_FORMAT_CASE(S8Z24_UNORM); + NV50_ZETA_FORMAT_CASE(X8Z24_UNORM); + NV50_ZETA_FORMAT_CASE(Z24S8_UNORM); + NV50_ZETA_FORMAT_CASE(Z32_FLOAT); default: NOUVEAU_ERR("AIIII unknown format %s\n", pf_name(fb->zsbuf->format)); @@ -121,6 +130,9 @@ nv50_state_validate_fb(struct nv50_context *nv50) so_data (so, fb->zsbuf->width); so_data (so, fb->zsbuf->height); so_data (so, 0x00010001); + } else { + so_method(so, tesla, 0x1538, 1); + so_data (so, 0); } so_method(so, tesla, NV50TCL_VIEWPORT_HORIZ, 2); @@ -189,7 +201,8 @@ nv50_state_emit(struct nv50_context *nv50) so_emit(chan, nv50->state.vertprog); if (nv50->state.dirty & NV50_NEW_FRAGPROG) so_emit(chan, nv50->state.fragprog); - if (nv50->state.dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG)) + if (nv50->state.dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG | + NV50_NEW_RASTERIZER)) so_emit(chan, nv50->state.programs); if (nv50->state.dirty & NV50_NEW_RASTERIZER) so_emit(chan, nv50->state.rast); @@ -219,6 +232,9 @@ nv50_state_flush_notify(struct nouveau_channel *chan) { struct nv50_context *nv50 = chan->user_private; + if (nv50->state.tic_upload && !(nv50->dirty & NV50_NEW_TEXTURE)) + so_emit(chan, nv50->state.tic_upload); + so_emit_reloc_markers(chan, nv50->state.fb); so_emit_reloc_markers(chan, nv50->state.vertprog); so_emit_reloc_markers(chan, nv50->state.fragprog); @@ -230,6 +246,7 @@ boolean nv50_state_validate(struct nv50_context *nv50) { struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_grobj *eng2d = nv50->screen->eng2d; struct nouveau_stateobj *so; unsigned i; @@ -248,7 +265,8 @@ nv50_state_validate(struct nv50_context *nv50) if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_FRAGPROG_CB)) nv50_fragprog_validate(nv50); - if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG)) + if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG | + NV50_NEW_RASTERIZER)) nv50_linkage_validate(nv50); if (nv50->dirty & NV50_NEW_RASTERIZER) @@ -269,7 +287,7 @@ nv50_state_validate(struct nv50_context *nv50) so = so_new(33, 0); so_method(so, tesla, NV50TCL_POLYGON_STIPPLE_PATTERN(0), 32); for (i = 0; i < 32; i++) - so_data(so, nv50->stipple.stipple[i]); + so_data(so, util_bswap32(nv50->stipple.stipple[i])); so_ref(so, &nv50->state.stipple); so_ref(NULL, &so); } @@ -312,7 +330,7 @@ scissor_uptodate: goto viewport_uptodate; nv50->state.viewport_bypass = bypass; - so = so_new(12, 0); + so = so_new(14, 0); if (!bypass) { so_method(so, tesla, NV50TCL_VIEWPORT_TRANSLATE(0), 3); so_data (so, fui(nv50->viewport.translate[0])); @@ -325,12 +343,21 @@ scissor_uptodate: so_method(so, tesla, NV50TCL_VIEWPORT_TRANSFORM_EN, 1); so_data (so, 1); + /* 0x0000 = remove whole primitive only (xyz) + * 0x1018 = remove whole primitive only (xy), clamp z + * 0x1080 = clip primitive (xyz) + * 0x1098 = clip primitive (xy), clamp z + */ + so_method(so, tesla, NV50TCL_VIEW_VOLUME_CLIP_CTRL, 1); + so_data (so, 0x1080); /* no idea what 0f90 does */ so_method(so, tesla, 0x0f90, 1); so_data (so, 0); } else { so_method(so, tesla, NV50TCL_VIEWPORT_TRANSFORM_EN, 1); so_data (so, 0); + so_method(so, tesla, NV50TCL_VIEW_VOLUME_CLIP_CTRL, 1); + so_data (so, 0x0000); so_method(so, tesla, 0x0f90, 1); so_data (so, 1); } @@ -342,15 +369,25 @@ scissor_uptodate: viewport_uptodate: if (nv50->dirty & NV50_NEW_SAMPLER) { - int i; - - so = so_new(nv50->sampler_nr * 8 + 3, 0); - so_method(so, tesla, NV50TCL_CB_ADDR, 1); - so_data (so, NV50_CB_TSC); - so_method(so, tesla, NV50TCL_CB_DATA(0) | 0x40000000, - nv50->sampler_nr * 8); - for (i = 0; i < nv50->sampler_nr; i++) + unsigned i; + + so = so_new(nv50->sampler_nr * 9 + 23 + 4, 2); + + nv50_so_init_sifc(nv50, so, nv50->screen->tsc, NOUVEAU_BO_VRAM, + nv50->sampler_nr * 8 * 4); + + for (i = 0; i < nv50->sampler_nr; i++) { + if (!nv50->sampler[i]) + continue; + so_method(so, eng2d, NV50_2D_SIFC_DATA | (2 << 29), 8); so_datap (so, nv50->sampler[i]->tsc, 8); + } + + so_method(so, tesla, 0x1440, 1); /* sync SIFC */ + so_data (so, 0); + so_method(so, tesla, 0x1334, 1); /* flush TSC */ + so_data (so, 0); + so_ref(so, &nv50->state.tsc_upload); so_ref(NULL, &so); } @@ -368,3 +405,33 @@ viewport_uptodate: return TRUE; } +void nv50_so_init_sifc(struct nv50_context *nv50, + struct nouveau_stateobj *so, + struct nouveau_bo *bo, unsigned reloc, unsigned size) +{ + struct nouveau_grobj *eng2d = nv50->screen->eng2d; + + so_method(so, eng2d, NV50_2D_DST_FORMAT, 2); + so_data (so, NV50_2D_DST_FORMAT_R8_UNORM); + so_data (so, 1); + so_method(so, eng2d, NV50_2D_DST_PITCH, 5); + so_data (so, 262144); + so_data (so, 65536); + so_data (so, 1); + so_reloc (so, bo, 0, reloc | NOUVEAU_BO_WR | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, bo, 0, reloc | NOUVEAU_BO_WR | NOUVEAU_BO_LOW, 0, 0); + so_method(so, eng2d, NV50_2D_SIFC_UNK0800, 2); + so_data (so, 0); + so_data (so, NV50_2D_SIFC_FORMAT_R8_UNORM); + so_method(so, eng2d, NV50_2D_SIFC_WIDTH, 10); + so_data (so, size); + so_data (so, 1); + so_data (so, 0); + so_data (so, 1); + so_data (so, 0); + so_data (so, 1); + so_data (so, 0); + so_data (so, 0); + so_data (so, 0); + so_data (so, 0); +} diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c index 033cb50c11..417d367942 100644 --- a/src/gallium/drivers/nv50/nv50_tex.c +++ b/src/gallium/drivers/nv50/nv50_tex.c @@ -25,109 +25,115 @@ #include "nouveau/nouveau_stateobj.h" +#define _MIXED(pf, t0, t1, t2, t3, cr, cg, cb, ca, f) \ +{ \ + PIPE_FORMAT_##pf, \ + NV50TIC_0_0_MAPR_##cr | NV50TIC_0_0_TYPER_##t0 | \ + NV50TIC_0_0_MAPG_##cg | NV50TIC_0_0_TYPEG_##t1 | \ + NV50TIC_0_0_MAPB_##cb | NV50TIC_0_0_TYPEB_##t2 | \ + NV50TIC_0_0_MAPA_##ca | NV50TIC_0_0_TYPEA_##t3 | \ + NV50TIC_0_0_FMT_##f \ +} + +#define _(pf, t, cr, cg, cb, ca, f) _MIXED(pf, t, t, t, t, cr, cg, cb, ca, f) + +struct nv50_texture_format { + enum pipe_format pf; + uint32_t hw; +}; + +#define NV50_TEX_FORMAT_LIST_SIZE \ + (sizeof(nv50_tex_format_list) / sizeof(struct nv50_texture_format)) + +static const struct nv50_texture_format nv50_tex_format_list[] = +{ + _(A8R8G8B8_UNORM, UNORM, C2, C1, C0, C3, 8_8_8_8), + _(A8R8G8B8_SRGB, UNORM, C2, C1, C0, C3, 8_8_8_8), + _(X8R8G8B8_UNORM, UNORM, C2, C1, C0, ONE, 8_8_8_8), + _(X8R8G8B8_SRGB, UNORM, C2, C1, C0, ONE, 8_8_8_8), + _(A1R5G5B5_UNORM, UNORM, C2, C1, C0, C3, 1_5_5_5), + _(A4R4G4B4_UNORM, UNORM, C2, C1, C0, C3, 4_4_4_4), + + _(R5G6B5_UNORM, UNORM, C2, C1, C0, ONE, 5_6_5), + + _(L8_UNORM, UNORM, C0, C0, C0, ONE, 8), + _(A8_UNORM, UNORM, ZERO, ZERO, ZERO, C0, 8), + _(I8_UNORM, UNORM, C0, C0, C0, C0, 8), + + _(A8L8_UNORM, UNORM, C0, C0, C0, C1, 8_8), + + _(DXT1_RGB, UNORM, C0, C1, C2, ONE, DXT1), + _(DXT1_RGBA, UNORM, C0, C1, C2, C3, DXT1), + _(DXT3_RGBA, UNORM, C0, C1, C2, C3, DXT3), + _(DXT5_RGBA, UNORM, C0, C1, C2, C3, DXT5), + + _MIXED(Z24S8_UNORM, UINT, UNORM, UINT, UINT, C1, C1, C1, ONE, 24_8), + + _(R16G16B16A16_SNORM, UNORM, C0, C1, C2, C3, 16_16_16_16), + _(R16G16B16A16_UNORM, SNORM, C0, C1, C2, C3, 16_16_16_16), + _(R32G32B32A32_FLOAT, FLOAT, C0, C1, C2, C3, 32_32_32_32), + + _(R16G16_SNORM, SNORM, C0, C1, ZERO, ONE, 16_16), + _(R16G16_UNORM, UNORM, C0, C1, ZERO, ONE, 16_16), + + _MIXED(Z32_FLOAT, FLOAT, UINT, UINT, UINT, C0, C0, C0, ONE, 32_DEPTH) + +}; + +#undef _ +#undef _MIXED + static int nv50_tex_construct(struct nv50_context *nv50, struct nouveau_stateobj *so, struct nv50_miptree *mt, int unit) { - switch (mt->base.base.format) { - case PIPE_FORMAT_A8R8G8B8_UNORM: - so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C2 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_8_8_8_8); - break; - case PIPE_FORMAT_A1R5G5B5_UNORM: - so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C2 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_1_5_5_5); - break; - case PIPE_FORMAT_A4R4G4B4_UNORM: - so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C2 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_4_4_4_4); - break; - case PIPE_FORMAT_R5G6B5_UNORM: - so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C2 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_5_6_5); - break; - case PIPE_FORMAT_L8_UNORM: - so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_8); - break; - case PIPE_FORMAT_A8_UNORM: - so_data(so, NV50TIC_0_0_MAPA_C0 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_ZERO | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_ZERO | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_ZERO | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_8); - break; - case PIPE_FORMAT_I8_UNORM: - so_data(so, NV50TIC_0_0_MAPA_C0 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_8); - break; - case PIPE_FORMAT_A8L8_UNORM: - so_data(so, NV50TIC_0_0_MAPA_C1 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_8_8); - break; - case PIPE_FORMAT_DXT1_RGB: - so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_DXT1); + unsigned i; + uint32_t mode; + + for (i = 0; i < NV50_TEX_FORMAT_LIST_SIZE; i++) + if (nv50_tex_format_list[i].pf == mt->base.base.format) + break; + if (i == NV50_TEX_FORMAT_LIST_SIZE) + return 1; + + if (nv50->sampler[unit]->normalized) + mode = 0x50001000 | (1 << 31); + else { + mode = 0x50001000 | (7 << 14); + assert(mt->base.base.target == PIPE_TEXTURE_2D); + } + + mode |= ((mt->base.bo->tile_mode & 0x0f) << 22) | + ((mt->base.bo->tile_mode & 0xf0) << 21); + + if (pf_type(mt->base.base.format) == PIPE_FORMAT_TYPE_SRGB) + mode |= 0x0400; + + switch (mt->base.base.target) { + case PIPE_TEXTURE_1D: break; - case PIPE_FORMAT_DXT1_RGBA: - so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_DXT1); + case PIPE_TEXTURE_2D: + mode |= (1 << 14); break; - case PIPE_FORMAT_DXT3_RGBA: - so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_DXT3); + case PIPE_TEXTURE_3D: + mode |= (2 << 14); break; - case PIPE_FORMAT_DXT5_RGBA: - so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_DXT5); + case PIPE_TEXTURE_CUBE: + mode |= (3 << 14); break; default: - return 1; + assert(!"unsupported texture target"); + break; } + so_data (so, nv50_tex_format_list[i].hw); so_reloc(so, mt->base.bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW | - NOUVEAU_BO_RD, 0, 0); - if (nv50->sampler[unit]->normalized) - so_data (so, 0xd0005000 | mt->base.bo->tile_mode << 22); - else - so_data (so, 0x5001d000 | mt->base.bo->tile_mode << 22); + NOUVEAU_BO_RD, 0, 0); + so_data (so, mode); so_data (so, 0x00300000); - so_data (so, mt->base.base.width[0]); + so_data (so, mt->base.base.width0 | (1 << 31)); so_data (so, (mt->base.base.last_level << 28) | - (mt->base.base.depth[0] << 16) | mt->base.base.height[0]); + (mt->base.base.depth0 << 16) | mt->base.base.height0); so_data (so, 0x03000000); so_data (so, mt->base.base.last_level << 4); @@ -137,20 +143,24 @@ nv50_tex_construct(struct nv50_context *nv50, struct nouveau_stateobj *so, void nv50_tex_validate(struct nv50_context *nv50) { + struct nouveau_grobj *eng2d = nv50->screen->eng2d; struct nouveau_grobj *tesla = nv50->screen->tesla; struct nouveau_stateobj *so; - int unit, push; + unsigned i, unit, push; + + push = MAX2(nv50->miptree_nr, nv50->state.miptree_nr) * 2 + 23 + 6; + so = so_new(nv50->miptree_nr * 9 + push, nv50->miptree_nr * 2 + 2); - push = nv50->miptree_nr * 9 + 2; - push += MAX2(nv50->miptree_nr, nv50->state.miptree_nr) * 2; + nv50_so_init_sifc(nv50, so, nv50->screen->tic, NOUVEAU_BO_VRAM, + nv50->miptree_nr * 8 * 4); - so = so_new(push, nv50->miptree_nr * 2); - so_method(so, tesla, NV50TCL_CB_ADDR, 1); - so_data (so, NV50_CB_TIC); - for (unit = 0; unit < nv50->miptree_nr; unit++) { + for (i = 0, unit = 0; unit < nv50->miptree_nr; ++unit) { struct nv50_miptree *mt = nv50->miptree[unit]; - so_method(so, tesla, NV50TCL_CB_DATA(0) | 0x40000000, 8); + if (!mt) + continue; + + so_method(so, eng2d, NV50_2D_SIFC_DATA | (2 << 29), 8); if (nv50_tex_construct(nv50, so, mt, unit)) { NOUVEAU_ERR("failed tex validate\n"); so_ref(NULL, &so); @@ -158,17 +168,25 @@ nv50_tex_validate(struct nv50_context *nv50) } so_method(so, tesla, NV50TCL_SET_SAMPLER_TEX, 1); - so_data (so, (unit << NV50TCL_SET_SAMPLER_TEX_TIC_SHIFT) | - (unit << NV50TCL_SET_SAMPLER_TEX_SAMPLER_SHIFT) | - NV50TCL_SET_SAMPLER_TEX_VALID); + so_data (so, (i++ << NV50TCL_SET_SAMPLER_TEX_TIC_SHIFT) | + (unit << NV50TCL_SET_SAMPLER_TEX_SAMPLER_SHIFT) | + NV50TCL_SET_SAMPLER_TEX_VALID); } for (; unit < nv50->state.miptree_nr; unit++) { so_method(so, tesla, NV50TCL_SET_SAMPLER_TEX, 1); so_data (so, - (unit << NV50TCL_SET_SAMPLER_TEX_SAMPLER_SHIFT) | 0); + (unit << NV50TCL_SET_SAMPLER_TEX_SAMPLER_SHIFT) | 0); } + /* not sure if the following really do what I think: */ + so_method(so, tesla, 0x1440, 1); /* sync SIFC */ + so_data (so, 0); + so_method(so, tesla, 0x1330, 1); /* flush TIC */ + so_data (so, 0); + so_method(so, tesla, 0x1338, 1); /* flush texture caches */ + so_data (so, 0x20); + so_ref(so, &nv50->state.tic_upload); so_ref(NULL, &so); nv50->state.miptree_nr = nv50->miptree_nr; diff --git a/src/gallium/drivers/nv50/nv50_texture.h b/src/gallium/drivers/nv50/nv50_texture.h index 207fb039f7..d531e61132 100644 --- a/src/gallium/drivers/nv50/nv50_texture.h +++ b/src/gallium/drivers/nv50/nv50_texture.h @@ -38,18 +38,26 @@ #define NV50TIC_0_0_TYPEA_MASK 0x00038000 #define NV50TIC_0_0_TYPEA_UNORM 0x00010000 #define NV50TIC_0_0_TYPEA_SNORM 0x00008000 +#define NV50TIC_0_0_TYPEA_SINT 0x00018000 +#define NV50TIC_0_0_TYPEA_UINT 0x00020000 #define NV50TIC_0_0_TYPEA_FLOAT 0x00038000 #define NV50TIC_0_0_TYPEB_MASK 0x00007000 #define NV50TIC_0_0_TYPEB_UNORM 0x00002000 #define NV50TIC_0_0_TYPEB_SNORM 0x00001000 +#define NV50TIC_0_0_TYPEB_SINT 0x00003000 +#define NV50TIC_0_0_TYPEB_UINT 0x00004000 #define NV50TIC_0_0_TYPEB_FLOAT 0x00007000 #define NV50TIC_0_0_TYPEG_MASK 0x00000e00 #define NV50TIC_0_0_TYPEG_UNORM 0x00000400 #define NV50TIC_0_0_TYPEG_SNORM 0x00000200 +#define NV50TIC_0_0_TYPEG_SINT 0x00000600 +#define NV50TIC_0_0_TYPEG_UINT 0x00000800 #define NV50TIC_0_0_TYPEG_FLOAT 0x00000e00 #define NV50TIC_0_0_TYPER_MASK 0x000001c0 #define NV50TIC_0_0_TYPER_UNORM 0x00000080 #define NV50TIC_0_0_TYPER_SNORM 0x00000040 +#define NV50TIC_0_0_TYPER_SINT 0x000000c0 +#define NV50TIC_0_0_TYPER_UINT 0x00000100 #define NV50TIC_0_0_TYPER_FLOAT 0x000001c0 #define NV50TIC_0_0_FMT_MASK 0x0000003f #define NV50TIC_0_0_FMT_32_32_32_32 0x00000001 @@ -57,6 +65,7 @@ #define NV50TIC_0_0_FMT_32_32 0x00000004 #define NV50TIC_0_0_FMT_8_8_8_8 0x00000008 #define NV50TIC_0_0_FMT_2_10_10_10 0x00000009 +#define NV50TIC_0_0_FMT_16_16 0x0000000c #define NV50TIC_0_0_FMT_32 0x0000000f #define NV50TIC_0_0_FMT_4_4_4_4 0x00000012 /* #define NV50TIC_0_0_FMT_1_5_5_5 0x00000013 */ @@ -65,12 +74,16 @@ #define NV50TIC_0_0_FMT_8_8 0x00000018 #define NV50TIC_0_0_FMT_16 0x0000001b #define NV50TIC_0_0_FMT_8 0x0000001d +#define NV50TIC_0_0_FMT_5_9_9_9 0x00000020 #define NV50TIC_0_0_FMT_10_11_11 0x00000021 #define NV50TIC_0_0_FMT_DXT1 0x00000024 #define NV50TIC_0_0_FMT_DXT3 0x00000025 #define NV50TIC_0_0_FMT_DXT5 0x00000026 #define NV50TIC_0_0_FMT_RGTC1 0x00000027 #define NV50TIC_0_0_FMT_RGTC2 0x00000028 +#define NV50TIC_0_0_FMT_24_8 0x00000029 +#define NV50TIC_0_0_FMT_32_DEPTH 0x0000002f +#define NV50TIC_0_0_FMT_32_8 0x00000030 #define NV50TIC_0_1_OFFSET_LOW_MASK 0xffffffff #define NV50TIC_0_1_OFFSET_LOW_SHIFT 0 @@ -133,6 +146,8 @@ #define NV50TSC_1_1_MIPF_NEAREST 0x00000080 #define NV50TSC_1_1_MIPF_LINEAR 0x000000c0 #define NV50TSC_1_1_LOD_BIAS_MASK 0x01fff000 +#define NV50TSC_1_1_UNKN_ANISO_15 0x10000000 +#define NV50TSC_1_1_UNKN_ANISO_35 0x18000000 #define NV50TSC_1_2_MIN_LOD_MASK 0x00000f00 #define NV50TSC_1_2_MAX_LOD_MASK 0x00f00000 diff --git a/src/gallium/drivers/nv50/nv50_transfer.c b/src/gallium/drivers/nv50/nv50_transfer.c index bb7731855c..4705f96f57 100644 --- a/src/gallium/drivers/nv50/nv50_transfer.c +++ b/src/gallium/drivers/nv50/nv50_transfer.c @@ -1,6 +1,7 @@ #include "pipe/p_context.h" #include "pipe/p_inlines.h" +#include "util/u_math.h" #include "nv50_context.h" @@ -12,18 +13,21 @@ struct nv50_transfer { int level_pitch; int level_width; int level_height; + int level_depth; int level_x; int level_y; + unsigned nblocksx; + unsigned nblocksy; }; static void nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, struct nouveau_bo *src_bo, unsigned src_offset, int src_pitch, unsigned src_tile_mode, - int sx, int sy, int sw, int sh, + int sx, int sy, int sw, int sh, int sd, struct nouveau_bo *dst_bo, unsigned dst_offset, int dst_pitch, unsigned dst_tile_mode, - int dx, int dy, int dw, int dh, + int dx, int dy, int dw, int dh, int dd, int cpp, int width, int height, unsigned src_reloc, unsigned dst_reloc) { @@ -51,7 +55,7 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, OUT_RING (chan, src_tile_mode << 4); OUT_RING (chan, sw * cpp); OUT_RING (chan, sh); - OUT_RING (chan, 1); + OUT_RING (chan, sd); OUT_RING (chan, 0); } @@ -70,7 +74,7 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, OUT_RING (chan, dst_tile_mode << 4); OUT_RING (chan, dw * cpp); OUT_RING (chan, dh); - OUT_RING (chan, 1); + OUT_RING (chan, dd); OUT_RING (chan, 0); } @@ -114,6 +118,20 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, } } +static INLINE unsigned +get_zslice_offset(unsigned tile_mode, unsigned z, unsigned pitch, unsigned ny) +{ + unsigned tile_h = get_tile_height(tile_mode); + unsigned tile_d = get_tile_depth(tile_mode); + + /* pitch_2d == to next slice within this volume-tile */ + /* pitch_3d == to next slice in next 2D array of blocks */ + unsigned pitch_2d = tile_h * 64; + unsigned pitch_3d = tile_d * align(ny, tile_h) * pitch; + + return (z % tile_d) * pitch_2d + (z / tile_d) * pitch_3d; +} + static struct pipe_transfer * nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt, unsigned face, unsigned level, unsigned zslice, @@ -124,52 +142,58 @@ nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt, struct nv50_miptree *mt = nv50_miptree(pt); struct nv50_miptree_level *lvl = &mt->level[level]; struct nv50_transfer *tx; - unsigned image = 0; + unsigned nx, ny, image = 0; int ret; if (pt->target == PIPE_TEXTURE_CUBE) image = face; - else - if (pt->target == PIPE_TEXTURE_3D) - image = zslice; tx = CALLOC_STRUCT(nv50_transfer); if (!tx) return NULL; pipe_texture_reference(&tx->base.texture, pt); - tx->base.format = pt->format; + tx->nblocksx = pf_get_nblocksx(pt->format, u_minify(pt->width0, level)); + tx->nblocksy = pf_get_nblocksy(pt->format, u_minify(pt->height0, level)); tx->base.width = w; tx->base.height = h; - tx->base.block = pt->block; - tx->base.nblocksx = pt->nblocksx[level]; - tx->base.nblocksy = pt->nblocksy[level]; - tx->base.stride = (w * pt->block.size); + tx->base.stride = tx->nblocksx * pf_get_blocksize(pt->format); tx->base.usage = usage; tx->level_pitch = lvl->pitch; - tx->level_width = mt->base.base.width[level]; - tx->level_height = mt->base.base.height[level]; + tx->level_width = u_minify(mt->base.base.width0, level); + tx->level_height = u_minify(mt->base.base.height0, level); + tx->level_depth = u_minify(mt->base.base.depth0, level); tx->level_offset = lvl->image_offset[image]; tx->level_tiling = lvl->tile_mode; - tx->level_x = x; - tx->level_y = y; + tx->level_x = pf_get_nblocksx(pt->format, x); + tx->level_y = pf_get_nblocksy(pt->format, y); ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0, - w * pt->block.size * h, &tx->bo); + tx->nblocksy * tx->base.stride, &tx->bo); if (ret) { FREE(tx); return NULL; } - if (usage != PIPE_TRANSFER_WRITE) { + if (pt->target == PIPE_TEXTURE_3D) + tx->level_offset += get_zslice_offset(lvl->tile_mode, zslice, + lvl->pitch, + tx->nblocksy); + + if (usage & PIPE_TRANSFER_READ) { + nx = pf_get_nblocksx(pt->format, tx->base.width); + ny = pf_get_nblocksy(pt->format, tx->base.height); + nv50_transfer_rect_m2mf(pscreen, mt->base.bo, tx->level_offset, tx->level_pitch, tx->level_tiling, x, y, - tx->level_width, tx->level_height, - tx->bo, 0, tx->base.stride, - tx->bo->tile_mode, 0, 0, - tx->base.width, tx->base.height, - tx->base.block.size, w, h, + tx->nblocksx, tx->nblocksy, + tx->level_depth, + tx->bo, 0, + tx->base.stride, tx->bo->tile_mode, + 0, 0, + tx->nblocksx, tx->nblocksy, 1, + pf_get_blocksize(pt->format), nx, ny, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART, NOUVEAU_BO_GART); } @@ -182,18 +206,24 @@ nv50_transfer_del(struct pipe_transfer *ptx) { struct nv50_transfer *tx = (struct nv50_transfer *)ptx; struct nv50_miptree *mt = nv50_miptree(ptx->texture); + struct pipe_texture *pt = ptx->texture; + + unsigned nx = pf_get_nblocksx(pt->format, tx->base.width); + unsigned ny = pf_get_nblocksy(pt->format, tx->base.height); + + if (ptx->usage & PIPE_TRANSFER_WRITE) { + struct pipe_screen *pscreen = pt->screen; - if (ptx->usage != PIPE_TRANSFER_READ) { - struct pipe_screen *pscreen = ptx->texture->screen; - nv50_transfer_rect_m2mf(pscreen, tx->bo, 0, tx->base.stride, - tx->bo->tile_mode, 0, 0, - tx->base.width, tx->base.height, + nv50_transfer_rect_m2mf(pscreen, tx->bo, 0, + tx->base.stride, tx->bo->tile_mode, + 0, 0, + tx->nblocksx, tx->nblocksy, 1, mt->base.bo, tx->level_offset, tx->level_pitch, tx->level_tiling, tx->level_x, tx->level_y, - tx->level_width, tx->level_height, - tx->base.block.size, tx->base.width, - tx->base.height, + tx->nblocksx, tx->nblocksy, + tx->level_depth, + pf_get_blocksize(pt->format), nx, ny, NOUVEAU_BO_GART, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART); } @@ -237,3 +267,89 @@ nv50_transfer_init_screen_functions(struct pipe_screen *pscreen) pscreen->transfer_map = nv50_transfer_map; pscreen->transfer_unmap = nv50_transfer_unmap; } + +void +nv50_upload_sifc(struct nv50_context *nv50, + struct nouveau_bo *bo, unsigned dst_offset, unsigned reloc, + unsigned dst_format, int dst_w, int dst_h, int dst_pitch, + void *src, unsigned src_format, int src_pitch, + int x, int y, int w, int h, int cpp) +{ + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *eng2d = nv50->screen->eng2d; + struct nouveau_grobj *tesla = nv50->screen->tesla; + unsigned line_dwords = (w * cpp + 3) / 4; + + reloc |= NOUVEAU_BO_WR; + + WAIT_RING (chan, 32); + + if (bo->tile_flags) { + BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 5); + OUT_RING (chan, dst_format); + OUT_RING (chan, 0); + OUT_RING (chan, bo->tile_mode << 4); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + } else { + BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 2); + OUT_RING (chan, dst_format); + OUT_RING (chan, 1); + BEGIN_RING(chan, eng2d, NV50_2D_DST_PITCH, 1); + OUT_RING (chan, dst_pitch); + } + + BEGIN_RING(chan, eng2d, NV50_2D_DST_WIDTH, 4); + OUT_RING (chan, dst_w); + OUT_RING (chan, dst_h); + OUT_RELOCh(chan, bo, dst_offset, reloc); + OUT_RELOCl(chan, bo, dst_offset, reloc); + + /* NV50_2D_OPERATION_SRCCOPY assumed already set */ + + BEGIN_RING(chan, eng2d, NV50_2D_SIFC_UNK0800, 2); + OUT_RING (chan, 0); + OUT_RING (chan, src_format); + BEGIN_RING(chan, eng2d, NV50_2D_SIFC_WIDTH, 10); + OUT_RING (chan, w); + OUT_RING (chan, h); + OUT_RING (chan, 0); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + OUT_RING (chan, x); + OUT_RING (chan, 0); + OUT_RING (chan, y); + + while (h--) { + const uint32_t *p = src; + unsigned count = line_dwords; + + while (count) { + unsigned nr = MIN2(count, 1792); + + if (chan->pushbuf->remaining <= nr) { + FIRE_RING (chan); + + BEGIN_RING(chan, eng2d, + NV50_2D_DST_ADDRESS_HIGH, 2); + OUT_RELOCh(chan, bo, dst_offset, reloc); + OUT_RELOCl(chan, bo, dst_offset, reloc); + } + assert(chan->pushbuf->remaining > nr); + + BEGIN_RING(chan, eng2d, + NV50_2D_SIFC_DATA | (2 << 29), nr); + OUT_RINGp (chan, p, nr); + + p += nr; + count -= nr; + } + + src += src_pitch; + } + + BEGIN_RING(chan, tesla, 0x1440, 1); + OUT_RING (chan, 0); +} diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c index eeed148c7b..db54380241 100644 --- a/src/gallium/drivers/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nv50/nv50_vbo.c @@ -26,6 +26,18 @@ #include "nv50_context.h" +static boolean +nv50_push_elements_u08(struct nv50_context *, uint8_t *, unsigned); + +static boolean +nv50_push_elements_u16(struct nv50_context *, uint16_t *, unsigned); + +static boolean +nv50_push_elements_u32(struct nv50_context *, uint32_t *, unsigned); + +static boolean +nv50_push_arrays(struct nv50_context *, unsigned, unsigned); + static INLINE unsigned nv50_prim(unsigned mode) { @@ -132,6 +144,7 @@ nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start, struct nv50_context *nv50 = nv50_context(pipe); struct nouveau_channel *chan = nv50->screen->tesla->channel; struct nouveau_grobj *tesla = nv50->screen->tesla; + boolean ret; nv50_state_validate(nv50); @@ -139,24 +152,25 @@ nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start, OUT_RING (chan, 0); BEGIN_RING(chan, tesla, 0x142c, 1); OUT_RING (chan, 0); - BEGIN_RING(chan, tesla, 0x1440, 1); - OUT_RING (chan, 0); - BEGIN_RING(chan, tesla, 0x1334, 1); - OUT_RING (chan, 0); BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1); OUT_RING (chan, nv50_prim(mode)); - BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2); - OUT_RING (chan, start); - OUT_RING (chan, count); + + if (nv50->vbo_fifo) + ret = nv50_push_arrays(nv50, start, count); + else { + BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2); + OUT_RING (chan, start); + OUT_RING (chan, count); + ret = TRUE; + } BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1); OUT_RING (chan, 0); - pipe->flush(pipe, 0, NULL); - return TRUE; + return ret; } -static INLINE void +static INLINE boolean nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map, unsigned start, unsigned count) { @@ -165,6 +179,9 @@ nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map, map += start; + if (nv50->vbo_fifo) + return nv50_push_elements_u08(nv50, map, count); + if (count & 1) { BEGIN_RING(chan, tesla, 0x15e8, 1); OUT_RING (chan, map[0]); @@ -183,9 +200,10 @@ nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map, count -= nr; map += nr; } + return TRUE; } -static INLINE void +static INLINE boolean nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map, unsigned start, unsigned count) { @@ -194,6 +212,9 @@ nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map, map += start; + if (nv50->vbo_fifo) + return nv50_push_elements_u16(nv50, map, count); + if (count & 1) { BEGIN_RING(chan, tesla, 0x15e8, 1); OUT_RING (chan, map[0]); @@ -212,9 +233,10 @@ nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map, count -= nr; map += nr; } + return TRUE; } -static INLINE void +static INLINE boolean nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map, unsigned start, unsigned count) { @@ -223,6 +245,9 @@ nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map, map += start; + if (nv50->vbo_fifo) + return nv50_push_elements_u32(nv50, map, count); + while (count) { unsigned nr = count > 2047 ? 2047 : count; @@ -232,6 +257,7 @@ nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map, count -= nr; map += nr; } + return TRUE; } boolean @@ -244,6 +270,7 @@ nv50_draw_elements(struct pipe_context *pipe, struct nouveau_grobj *tesla = nv50->screen->tesla; struct pipe_screen *pscreen = pipe->screen; void *map; + boolean ret; map = pipe_buffer_map(pscreen, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ); @@ -258,23 +285,25 @@ nv50_draw_elements(struct pipe_context *pipe, OUT_RING (chan, nv50_prim(mode)); switch (indexSize) { case 1: - nv50_draw_elements_inline_u08(nv50, map, start, count); + ret = nv50_draw_elements_inline_u08(nv50, map, start, count); break; case 2: - nv50_draw_elements_inline_u16(nv50, map, start, count); + ret = nv50_draw_elements_inline_u16(nv50, map, start, count); break; case 4: - nv50_draw_elements_inline_u32(nv50, map, start, count); + ret = nv50_draw_elements_inline_u32(nv50, map, start, count); break; default: assert(0); + ret = FALSE; + break; } BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1); OUT_RING (chan, 0); pipe_buffer_unmap(pscreen, indexBuffer); - pipe->flush(pipe, 0, NULL); - return TRUE; + + return ret; } static INLINE boolean @@ -341,17 +370,24 @@ nv50_vbo_validate(struct nv50_context *nv50) { struct nouveau_grobj *tesla = nv50->screen->tesla; struct nouveau_stateobj *vtxbuf, *vtxfmt, *vtxattr; - unsigned i; + unsigned i, n_ve; /* don't validate if Gallium took away our buffers */ if (nv50->vtxbuf_nr == 0) return; + nv50->vbo_fifo = 0; + + for (i = 0; i < nv50->vtxbuf_nr; ++i) + if (nv50->vtxbuf[i].stride && + !(nv50->vtxbuf[i].buffer->usage & PIPE_BUFFER_USAGE_VERTEX)) + nv50->vbo_fifo = 0xffff; + + n_ve = MAX2(nv50->vtxelt_nr, nv50->state.vtxelt_nr); vtxattr = NULL; - vtxbuf = so_new(nv50->vtxelt_nr * 7, nv50->vtxelt_nr * 4); - vtxfmt = so_new(nv50->vtxelt_nr + 1, 0); - so_method(vtxfmt, tesla, NV50TCL_VERTEX_ARRAY_ATTRIB(0), - nv50->vtxelt_nr); + vtxbuf = so_new(n_ve * 7, nv50->vtxelt_nr * 4); + vtxfmt = so_new(n_ve + 1, 0); + so_method(vtxfmt, tesla, NV50TCL_VERTEX_ARRAY_ATTRIB(0), n_ve); for (i = 0; i < nv50->vtxelt_nr; i++) { struct pipe_vertex_element *ve = &nv50->vtxelt[i]; @@ -367,10 +403,19 @@ nv50_vbo_validate(struct nv50_context *nv50) so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_FORMAT(i), 1); so_data (vtxbuf, 0); + + nv50->vbo_fifo &= ~(1 << i); continue; } so_data(vtxfmt, hw | i); + if (nv50->vbo_fifo) { + so_method(vtxbuf, tesla, + NV50TCL_VERTEX_ARRAY_FORMAT(i), 1); + so_data (vtxbuf, 0); + continue; + } + so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_FORMAT(i), 3); so_data (vtxbuf, 0x20000000 | vb->stride); so_reloc (vtxbuf, bo, vb->buffer_offset + @@ -389,6 +434,13 @@ nv50_vbo_validate(struct nv50_context *nv50) NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); } + for (; i < n_ve; ++i) { + so_data (vtxfmt, 0x7e080010); + + so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_FORMAT(i), 1); + so_data (vtxbuf, 0); + } + nv50->state.vtxelt_nr = nv50->vtxelt_nr; so_ref (vtxfmt, &nv50->state.vtxfmt); so_ref (vtxbuf, &nv50->state.vtxbuf); @@ -398,3 +450,320 @@ nv50_vbo_validate(struct nv50_context *nv50) so_ref (NULL, &vtxattr); } +typedef void (*pfn_push)(struct nouveau_channel *, void *); + +struct nv50_vbo_emitctx +{ + pfn_push push[16]; + void *map[16]; + unsigned stride[16]; + unsigned nr_ve; + unsigned vtx_dwords; + unsigned vtx_max; +}; + +static INLINE void +emit_vtx_next(struct nouveau_channel *chan, struct nv50_vbo_emitctx *emit) +{ + unsigned i; + + for (i = 0; i < emit->nr_ve; ++i) { + emit->push[i](chan, emit->map[i]); + emit->map[i] += emit->stride[i]; + } +} + +static INLINE void +emit_vtx(struct nouveau_channel *chan, struct nv50_vbo_emitctx *emit, + uint32_t vi) +{ + unsigned i; + + for (i = 0; i < emit->nr_ve; ++i) + emit->push[i](chan, emit->map[i] + emit->stride[i] * vi); +} + +static INLINE boolean +nv50_map_vbufs(struct nv50_context *nv50) +{ + int i; + + for (i = 0; i < nv50->vtxbuf_nr; ++i) { + struct pipe_vertex_buffer *vb = &nv50->vtxbuf[i]; + unsigned size, delta; + + if (nouveau_bo(vb->buffer)->map) + continue; + + size = vb->stride * (vb->max_index + 1); + delta = vb->buffer_offset; + + if (!size) + size = vb->buffer->size - vb->buffer_offset; + + if (nouveau_bo_map_range(nouveau_bo(vb->buffer), + delta, size, NOUVEAU_BO_RD)) + break; + } + + if (i == nv50->vtxbuf_nr) + return TRUE; + for (; i >= 0; --i) + nouveau_bo_unmap(nouveau_bo(nv50->vtxbuf[i].buffer)); + return FALSE; +} + +static INLINE void +nv50_unmap_vbufs(struct nv50_context *nv50) +{ + unsigned i; + + for (i = 0; i < nv50->vtxbuf_nr; ++i) + if (nouveau_bo(nv50->vtxbuf[i].buffer)->map) + nouveau_bo_unmap(nouveau_bo(nv50->vtxbuf[i].buffer)); +} + +static void +emit_b32_1(struct nouveau_channel *chan, void *data) +{ + uint32_t *v = data; + + OUT_RING(chan, v[0]); +} + +static void +emit_b32_2(struct nouveau_channel *chan, void *data) +{ + uint32_t *v = data; + + OUT_RING(chan, v[0]); + OUT_RING(chan, v[1]); +} + +static void +emit_b32_3(struct nouveau_channel *chan, void *data) +{ + uint32_t *v = data; + + OUT_RING(chan, v[0]); + OUT_RING(chan, v[1]); + OUT_RING(chan, v[2]); +} + +static void +emit_b32_4(struct nouveau_channel *chan, void *data) +{ + uint32_t *v = data; + + OUT_RING(chan, v[0]); + OUT_RING(chan, v[1]); + OUT_RING(chan, v[2]); + OUT_RING(chan, v[3]); +} + +static void +emit_b16_1(struct nouveau_channel *chan, void *data) +{ + uint16_t *v = data; + + OUT_RING(chan, v[0]); +} + +static void +emit_b16_3(struct nouveau_channel *chan, void *data) +{ + uint16_t *v = data; + + OUT_RING(chan, (v[1] << 16) | v[0]); + OUT_RING(chan, v[2]); +} + +static void +emit_b08_1(struct nouveau_channel *chan, void *data) +{ + uint8_t *v = data; + + OUT_RING(chan, v[0]); +} + +static void +emit_b08_3(struct nouveau_channel *chan, void *data) +{ + uint8_t *v = data; + + OUT_RING(chan, (v[2] << 16) | (v[1] << 8) | v[0]); +} + +static boolean +emit_prepare(struct nv50_context *nv50, struct nv50_vbo_emitctx *emit, + unsigned start) +{ + unsigned i; + + if (nv50_map_vbufs(nv50) == FALSE) + return FALSE; + + emit->nr_ve = 0; + emit->vtx_dwords = 0; + + for (i = 0; i < nv50->vtxelt_nr; ++i) { + struct pipe_vertex_element *ve; + struct pipe_vertex_buffer *vb; + unsigned n, type, size; + + ve = &nv50->vtxelt[i]; + vb = &nv50->vtxbuf[ve->vertex_buffer_index]; + if (!(nv50->vbo_fifo & (1 << i))) + continue; + n = emit->nr_ve++; + + emit->stride[n] = vb->stride; + emit->map[n] = nouveau_bo(vb->buffer)->map + + (start * vb->stride + ve->src_offset); + + type = pf_type(ve->src_format); + size = pf_size_x(ve->src_format) << pf_exp2(ve->src_format); + + assert(ve->nr_components > 0 && ve->nr_components <= 4); + + /* It shouldn't be necessary to push the implicit 1s + * for case 3 and size 8 cases 1, 2, 3. + */ + switch (size) { + default: + NOUVEAU_ERR("unsupported vtxelt size: %u\n", size); + return FALSE; + case 32: + switch (ve->nr_components) { + case 1: emit->push[n] = emit_b32_1; break; + case 2: emit->push[n] = emit_b32_2; break; + case 3: emit->push[n] = emit_b32_3; break; + case 4: emit->push[n] = emit_b32_4; break; + } + emit->vtx_dwords += ve->nr_components; + break; + case 16: + switch (ve->nr_components) { + case 1: emit->push[n] = emit_b16_1; break; + case 2: emit->push[n] = emit_b32_1; break; + case 3: emit->push[n] = emit_b16_3; break; + case 4: emit->push[n] = emit_b32_2; break; + } + emit->vtx_dwords += (ve->nr_components + 1) >> 1; + break; + case 8: + switch (ve->nr_components) { + case 1: emit->push[n] = emit_b08_1; break; + case 2: emit->push[n] = emit_b16_1; break; + case 3: emit->push[n] = emit_b08_3; break; + case 4: emit->push[n] = emit_b32_1; break; + } + emit->vtx_dwords += 1; + break; + } + } + + emit->vtx_max = 512 / emit->vtx_dwords; + + return TRUE; +} + +static boolean +nv50_push_arrays(struct nv50_context *nv50, unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_vbo_emitctx emit; + + if (emit_prepare(nv50, &emit, start) == FALSE) + return FALSE; + + while (count) { + unsigned i, dw, nr = MIN2(count, emit.vtx_max); + dw = nr * emit.vtx_dwords; + + BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw); + for (i = 0; i < nr; ++i) + emit_vtx_next(chan, &emit); + + count -= nr; + } + nv50_unmap_vbufs(nv50); + + return TRUE; +} + +static boolean +nv50_push_elements_u32(struct nv50_context *nv50, uint32_t *map, unsigned count) +{ + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_vbo_emitctx emit; + + if (emit_prepare(nv50, &emit, 0) == FALSE) + return FALSE; + + while (count) { + unsigned i, dw, nr = MIN2(count, emit.vtx_max); + dw = nr * emit.vtx_dwords; + + BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw); + for (i = 0; i < nr; ++i) + emit_vtx(chan, &emit, *map++); + + count -= nr; + } + nv50_unmap_vbufs(nv50); + + return TRUE; +} + +static boolean +nv50_push_elements_u16(struct nv50_context *nv50, uint16_t *map, unsigned count) +{ + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_vbo_emitctx emit; + + if (emit_prepare(nv50, &emit, 0) == FALSE) + return FALSE; + + while (count) { + unsigned i, dw, nr = MIN2(count, emit.vtx_max); + dw = nr * emit.vtx_dwords; + + BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw); + for (i = 0; i < nr; ++i) + emit_vtx(chan, &emit, *map++); + + count -= nr; + } + nv50_unmap_vbufs(nv50); + + return TRUE; +} + +static boolean +nv50_push_elements_u08(struct nv50_context *nv50, uint8_t *map, unsigned count) +{ + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_vbo_emitctx emit; + + if (emit_prepare(nv50, &emit, 0) == FALSE) + return FALSE; + + while (count) { + unsigned i, dw, nr = MIN2(count, emit.vtx_max); + dw = nr * emit.vtx_dwords; + + BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw); + for (i = 0; i < nr; ++i) + emit_vtx(chan, &emit, *map++); + + count -= nr; + } + nv50_unmap_vbufs(nv50); + + return TRUE; +} |