summaryrefslogtreecommitdiff
path: root/src/mesa/tnl/t_vertex_sse.c
diff options
context:
space:
mode:
author Keith Whitwell <keith@tungstengraphics.com> 2005-05-18 12:26:21 +0000
committer Keith Whitwell <keith@tungstengraphics.com> 2005-05-18 12:26:21 +0000
commit dd4c1dd0382277b080fb4981e027250e10658ae8 (patch)
tree 84531301cf6e155caa7c49c13864b4b90b261162 /src/mesa/tnl/t_vertex_sse.c
parent b745bf08cd5e772f86360267995a96e9b73384b0 (diff)
Generates working SSE code for gears under the swrast driver.
Diffstat (limited to 'src/mesa/tnl/t_vertex_sse.c')
-rw-r--r-- src/mesa/tnl/t_vertex_sse.c 185
1 files changed, 127 insertions, 58 deletions
diff --git a/src/mesa/tnl/t_vertex_sse.c b/src/mesa/tnl/t_vertex_sse.c
index b4e2c5b474..1771baab15 100644
--- a/src/mesa/tnl/t_vertex_sse.c
+++ b/src/mesa/tnl/t_vertex_sse.c
@@ -42,6 +42,8 @@
#define Z 2
#define W 3
+#define DISASSEM 1
+
struct x86_reg {
GLuint file:3;
GLuint idx:3;
@@ -144,6 +146,17 @@ static struct x86_reg make_disp( struct x86_reg reg,
return reg;
}
+static struct x86_reg deref( struct x86_reg reg )
+{
+ return make_disp(reg, 0);
+}
+
+static struct x86_reg get_base_reg( struct x86_reg reg )
+{
+ return make_reg( reg.file, reg.idx );
+}
+
+
/* Retrieve a reference to one of the function arguments, taking into
* account any push/pop activity:
*/
@@ -179,29 +192,47 @@ static void emit_1b( struct x86_program *p, GLbyte b0 )
*(GLbyte *)(p->csr++) = b0;
}
-static void emit_1ub( struct x86_program *p, GLubyte b0 )
+static void emit_1i( struct x86_program *p, GLint i0 )
+{
+ *(GLint *)(p->csr) = i0;
+ p->csr += 4;
+}
+
+static void disassem( struct x86_program *p, const char *fn )
+{
+#if DISASSEM
+ static const char *last_fn;
+ if (fn && fn != last_fn) {
+ _mesa_printf("0x%x: %s\n", p->csr, fn);
+ last_fn = fn;
+ }
+#endif
+}
+
+static void emit_1ub_fn( struct x86_program *p, GLubyte b0, const char *fn )
{
+ disassem(p, fn);
*(p->csr++) = b0;
}
-static void emit_2ub( struct x86_program *p, GLubyte b0, GLubyte b1 )
+static void emit_2ub_fn( struct x86_program *p, GLubyte b0, GLubyte b1, const char *fn )
{
+ disassem(p, fn);
*(p->csr++) = b0;
*(p->csr++) = b1;
}
-static void emit_3ub( struct x86_program *p, GLubyte b0, GLubyte b1, GLubyte b2 )
+static void emit_3ub_fn( struct x86_program *p, GLubyte b0, GLubyte b1, GLubyte b2, const char *fn )
{
+ disassem(p, fn);
*(p->csr++) = b0;
*(p->csr++) = b1;
*(p->csr++) = b2;
}
-static void emit_1i( struct x86_program *p, GLint i0 )
-{
- *(GLint *)(p->csr) = i0;
- p->csr += 4;
-}
+#define emit_1ub(p, b0) emit_1ub_fn(p, b0, __FUNCTION__)
+#define emit_2ub(p, b0, b1) emit_2ub_fn(p, b0, b1, __FUNCTION__)
+#define emit_3ub(p, b0, b1, b2) emit_3ub_fn(p, b0, b1, b2, __FUNCTION__)
/* Labels, jumps and fixup:
@@ -216,7 +247,7 @@ static void emit_jcc( struct x86_program *p,
GLubyte *label )
{
GLint offset = label - (get_label(p) + 2);
-
+
if (offset <= 127 && offset >= -128) {
emit_1ub(p, 0x70 + cc);
emit_1b(p, (GLbyte) offset);
@@ -273,7 +304,7 @@ static void emit_dec( struct x86_program *p,
struct x86_reg reg )
{
assert(reg.mod == mod_REG);
- emit_1ub(p, 0x40 + reg.idx);
+ emit_1ub(p, 0x48 + reg.idx);
}
static void emit_ret( struct x86_program *p )
@@ -299,7 +330,13 @@ static void emit_modrm( struct x86_program *p,
val |= reg.idx << 3; /* reg field */
val |= regmem.idx; /* r/m field */
- emit_1ub(p, val);
+ emit_1ub_fn(p, val, 0);
+
+ /* Oh-oh we've stumbled into the SIB thing.
+ */
+ if (regmem.idx == reg_SP) {
+ emit_1ub_fn(p, 0x24, 0); /* simplistic! */
+ }
switch (regmem.mod) {
case mod_REG:
@@ -307,8 +344,10 @@ static void emit_modrm( struct x86_program *p,
break;
case mod_DISP8:
emit_1b(p, regmem.disp);
+ break;
case mod_DISP32:
emit_1i(p, regmem.disp);
+ break;
}
}
@@ -325,14 +364,14 @@ static void emit_op_modrm( struct x86_program *p,
{
switch (dst.mod) {
case mod_REG:
- emit_1ub(p, op_dst_is_reg);
+ emit_1ub_fn(p, op_dst_is_reg, 0);
emit_modrm(p, dst, src);
break;
case mod_INDIRECT:
case mod_DISP32:
case mod_DISP8:
assert(src.mod == mod_REG);
- emit_1ub(p, op_dst_is_mem);
+ emit_1ub_fn(p, op_dst_is_mem, 0);
emit_modrm(p, src, dst);
break;
}
@@ -352,6 +391,13 @@ static void emit_xor( struct x86_program *p,
emit_op_modrm( p, 0x33, 0x31, dst, src );
}
+static void emit_cmp( struct x86_program *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_op_modrm( p, 0x3b, 0x39, dst, src );
+}
+
static void emit_movlps( struct x86_program *p,
struct x86_reg dst,
struct x86_reg src )
@@ -443,6 +489,14 @@ static void emit_packsswb( struct x86_program *p,
emit_modrm( p, dst, src );
}
+static void emit_packuswb( struct x86_program *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_3ub(p, 0x66, X86_TWOB, 0x67);
+ emit_modrm( p, dst, src );
+}
+
/* Load effective address:
*/
static void emit_lea( struct x86_program *p,
@@ -461,6 +515,14 @@ static void emit_add_imm( struct x86_program *p,
emit_lea(p, dst, make_disp(src, value));
}
+static void emit_test( struct x86_program *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_1ub(p, 0x85);
+ emit_modrm( p, dst, src );
+}
+
@@ -487,7 +549,7 @@ static void emit_pk4ub( struct x86_program *p,
{
emit_cvtps2dq(p, dest, arg0);
emit_packssdw(p, dest, dest);
- emit_packsswb(p, dest, dest);
+ emit_packuswb(p, dest, dest);
}
static void emit_load4f_4( struct x86_program *p,
@@ -620,12 +682,12 @@ static void (*load[4][4])( struct x86_program *p,
};
static void emit_load( struct x86_program *p,
- struct x86_reg temp,
+ struct x86_reg dest,
GLuint sz,
struct x86_reg src,
GLuint src_sz)
{
- load[sz-1][src_sz-1](p, temp, src);
+ load[sz-1][src_sz-1](p, dest, src);
}
@@ -721,6 +783,7 @@ static GLboolean build_vertex_emit( struct x86_program *p )
struct x86_reg tmp = make_reg(file_XMM, 0);
struct x86_reg vp0 = make_reg(file_XMM, 1);
struct x86_reg vp1 = make_reg(file_XMM, 2);
+ struct x86_reg chan0 = make_reg(file_XMM, 3);
GLubyte *fixup, *label;
p->csr = p->store;
@@ -731,6 +794,15 @@ static GLboolean build_vertex_emit( struct x86_program *p )
emit_push(p, countEBP);
emit_push(p, vtxESI);
+
+ /* Get vertex count, compare to zero
+ */
+ emit_xor(p, srcEDI, srcEDI);
+ emit_mov(p, countEBP, make_fn_arg(p, 2));
+ emit_cmp(p, countEBP, srcEDI);
+ fixup = emit_jcc_forward(p, cc_E);
+
+
/* Initialize destination register.
*/
emit_mov(p, vertexEAX, make_fn_arg(p, 3));
@@ -741,10 +813,6 @@ static GLboolean build_vertex_emit( struct x86_program *p )
emit_mov(p, vtxESI, make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context)));
vtxESI = make_disp(vtxESI, get_offset(tnl, &tnl->clipspace));
- /* Get vertex count, compare to zero
- */
- emit_mov(p, countEBP, make_fn_arg(p, 2));
- fixup = emit_jcc_forward(p, cc_NZ);
/* Possibly load vp0, vp1 for viewport calcs:
*/
@@ -753,6 +821,10 @@ static GLboolean build_vertex_emit( struct x86_program *p )
emit_movups(p, vp1, make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0])));
}
+ /* always load, needed or not:
+ */
+ emit_movups(p, chan0, make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0])));
+
/* Note address for loop jump */
label = get_label(p);
@@ -775,40 +847,40 @@ static GLboolean build_vertex_emit( struct x86_program *p )
*/
switch (a[j].format) {
case EMIT_1F:
- emit_load(p, tmp, 1, srcEDI, vtx->attr[j].inputsize);
+ emit_load(p, tmp, 1, deref(srcEDI), vtx->attr[j].inputsize);
emit_store(p, dest, 1, tmp);
case EMIT_2F:
- emit_load(p, tmp, 2, srcEDI, vtx->attr[j].inputsize);
+ emit_load(p, tmp, 2, deref(srcEDI), vtx->attr[j].inputsize);
emit_store(p, dest, 2, tmp);
case EMIT_3F:
/* Potentially the worst case - hardcode 2+1 copying:
*/
- emit_load(p, tmp, 3, srcEDI, vtx->attr[j].inputsize);
+ emit_load(p, tmp, 3, deref(srcEDI), vtx->attr[j].inputsize);
emit_store(p, dest, 3, tmp);
case EMIT_4F:
- emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
+ emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
emit_store(p, dest, 4, tmp);
break;
case EMIT_2F_VIEWPORT:
- emit_load(p, tmp, 2, srcEDI, vtx->attr[j].inputsize);
- emit_mulps(p, dest, vp0);
- emit_addps(p, dest, vp1);
+ emit_load(p, tmp, 2, deref(srcEDI), vtx->attr[j].inputsize);
+ emit_mulps(p, tmp, vp0);
+ emit_addps(p, tmp, vp1);
emit_store(p, dest, 2, tmp);
break;
case EMIT_3F_VIEWPORT:
- emit_load(p, tmp, 3, srcEDI, vtx->attr[j].inputsize);
- emit_mulps(p, dest, vp0);
- emit_addps(p, dest, vp1);
+ emit_load(p, tmp, 3, deref(srcEDI), vtx->attr[j].inputsize);
+ emit_mulps(p, tmp, vp0);
+ emit_addps(p, tmp, vp1);
emit_store(p, dest, 3, tmp);
break;
case EMIT_4F_VIEWPORT:
- emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
- emit_mulps(p, dest, vp0);
- emit_addps(p, dest, vp1);
+ emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
+ emit_mulps(p, tmp, vp0);
+ emit_addps(p, tmp, vp1);
emit_store(p, dest, 4, tmp);
break;
case EMIT_3F_XYW:
- emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
+ emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
emit_pshufd(p, tmp, tmp, X, Y, W, Z);
emit_store(p, dest, 3, tmp);
break;
@@ -818,48 +890,56 @@ static GLboolean build_vertex_emit( struct x86_program *p )
case EMIT_1UB_1F:
case EMIT_3UB_3F_RGB:
case EMIT_3UB_3F_BGR:
+ _mesa_printf("non-implemneted format %d\n", a[j].format);
return GL_FALSE; /* add this later */
case EMIT_4UB_4F_RGBA:
- emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
+ emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
+ emit_mulps(p, tmp, chan0);
emit_pk4ub(p, tmp, tmp);
emit_store(p, dest, 1, tmp);
break;
case EMIT_4UB_4F_BGRA:
- emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
+ emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
emit_pshufd(p, tmp, tmp, Z, Y, X, W);
+ emit_mulps(p, tmp, chan0);
emit_pk4ub(p, tmp, tmp);
emit_store(p, dest, 1, tmp);
break;
case EMIT_4UB_4F_ARGB:
- emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
+ emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
emit_pshufd(p, tmp, tmp, W, X, Y, Z);
+ emit_mulps(p, tmp, chan0);
emit_pk4ub(p, tmp, tmp);
emit_store(p, dest, 1, tmp);
break;
case EMIT_4UB_4F_ABGR:
- emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
+ emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
emit_pshufd(p, tmp, tmp, W, Z, Y, X);
+ emit_mulps(p, tmp, chan0);
emit_pk4ub(p, tmp, tmp);
emit_store(p, dest, 1, tmp);
break;
case EMIT_4CHAN_4F_RGBA:
switch (CHAN_TYPE) {
case GL_UNSIGNED_BYTE:
- emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
+ emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
+ emit_mulps(p, tmp, chan0);
emit_pk4ub(p, tmp, tmp);
emit_store(p, dest, 1, tmp);
break;
- case GL_UNSIGNED_SHORT:
- return GL_FALSE;
case GL_FLOAT:
- emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
+ emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
emit_store(p, dest, 4, tmp);
break;
+ case GL_UNSIGNED_SHORT:
default:
- break;
+ _mesa_printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
+ return GL_FALSE;
}
+ break;
default:
+ _mesa_printf("unknown a[%d].format %d\n", j, a[j].format);
return GL_FALSE; /* catch any new opcodes */
}
@@ -881,6 +961,7 @@ static GLboolean build_vertex_emit( struct x86_program *p )
/* decr count, loop if not zero
*/
emit_dec(p, countEBP);
+ emit_test(p, countEBP, countEBP);
emit_jcc(p, cc_NZ, label);
/* Land forward jump here:
@@ -889,7 +970,7 @@ static GLboolean build_vertex_emit( struct x86_program *p )
/* Pop regs and return
*/
- emit_pop(p, vtxESI);
+ emit_pop(p, get_base_reg(vtxESI));
emit_pop(p, countEBP);
emit_pop(p, srcEDI);
emit_ret(p);
@@ -912,20 +993,8 @@ void _tnl_generate_sse_emit( GLcontext *ctx )
if (build_vertex_emit(&p)) {
_tnl_register_fastpath( vtx, GL_TRUE );
-
- {
- static int i = 0;
- char filename[100];
- int fd;
-
- sprintf(filename, "fastpath%d.o", i);
- fd = creat(filename, 0600);
- if (fd != -1) {
- write(fd, p.store, p.csr - p.store);
- close(fd);
- _mesa_printf("wrote %s\n", filename);
- }
- }
+ if (DISASSEM)
+ _mesa_printf("disassemble 0x%x 0x%x\n", p.store, p.csr);
}
else {
FREE(p.store);