summary refs log tree commit diff
path: root/src/mesa/pipe/draw
diff options
context:
space:
mode:
Diffstat (limited to 'src/mesa/pipe/draw')
-rw-r--r-- src/mesa/pipe/draw/Makefile 2
-rw-r--r-- src/mesa/pipe/draw/draw_clip.c 136
-rw-r--r-- src/mesa/pipe/draw/draw_context.c 9
-rw-r--r-- src/mesa/pipe/draw/draw_prim.c 185
-rw-r--r-- src/mesa/pipe/draw/draw_private.h 16
-rw-r--r-- src/mesa/pipe/draw/draw_validate.c 10
-rw-r--r-- src/mesa/pipe/draw/draw_vbuf.c 233
-rw-r--r-- src/mesa/pipe/draw/draw_vertex_fetch.c 378
-rw-r--r-- src/mesa/pipe/draw/draw_vertex_shader.c 44
-rw-r--r-- src/mesa/pipe/draw/draw_vertex_shader_llvm.c 4
-rw-r--r-- src/mesa/pipe/draw/draw_vf.c 428
-rw-r--r-- src/mesa/pipe/draw/draw_vf.h 223
-rw-r--r-- src/mesa/pipe/draw/draw_vf_generic.c 585
-rw-r--r-- src/mesa/pipe/draw/draw_vf_sse.c 614
14 files changed, 2639 insertions, 228 deletions
diff --git a/src/mesa/pipe/draw/Makefile b/src/mesa/pipe/draw/Makefile
new file mode 100644
index 0000000000..451911a354
--- /dev/null
+++ b/src/mesa/pipe/draw/Makefile
@@ -0,0 +1,2 @@
+default:
+ cd .. ; make
diff --git a/src/mesa/pipe/draw/draw_clip.c b/src/mesa/pipe/draw/draw_clip.c
index 2d410e3244..61130c5600 100644
--- a/src/mesa/pipe/draw/draw_clip.c
+++ b/src/mesa/pipe/draw/draw_clip.c
@@ -33,6 +33,8 @@
#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+
#include "draw_context.h"
#include "draw_private.h"
@@ -54,6 +56,12 @@
struct clipper {
struct draw_stage stage; /**< base class */
+ /* Basically duplicate some of the flatshading logic here:
+ */
+ boolean flat;
+ uint num_color_attribs;
+ uint color_attribs[4]; /* front/back primary/secondary colors */
+
float (*plane)[4];
};
@@ -82,6 +90,17 @@ static void interp_attr( float *fdst,
fdst[3] = LINTERP( t, fout[3], fin[3] );
}
+static void copy_colors( struct draw_stage *stage,
+ struct vertex_header *dst,
+ const struct vertex_header *src )
+{
+ const struct clipper *clipper = clipper_stage(stage);
+ uint i;
+ for (i = 0; i < clipper->num_color_attribs; i++) {
+ const uint attr = clipper->color_attribs[i];
+ COPY_4FV(dst->data[attr], src->data[attr]);
+ }
+}
@@ -134,27 +153,11 @@ static void interp( const struct clipper *clip,
}
}
-#if 0
-static INLINE void do_tri( struct draw_stage *next,
- struct prim_header *header )
-{
- unsigned i;
- for (i = 0; i < 3; i++) {
- float *ndc = header->v[i]->data[0];
- _mesa_printf("ndc %f %f %f\n", ndc[0], ndc[1], ndc[2]);
- assert(ndc[0] >= -1 && ndc[0] <= 641);
- assert(ndc[1] >= 30 && ndc[1] <= 481);
- }
- _mesa_printf("\n");
- next->tri(next, header);
-}
-#endif
-
static void emit_poly( struct draw_stage *stage,
struct vertex_header **inlist,
unsigned n,
- const struct prim_header *origPrim)
+ const struct prim_header *origPrim)
{
struct prim_header header;
unsigned i;
@@ -163,16 +166,16 @@ static void emit_poly( struct draw_stage *stage,
header.det = origPrim->det;
for (i = 2; i < n; i++) {
- header.v[0] = inlist[0];
- header.v[1] = inlist[i-1];
- header.v[2] = inlist[i];
+ header.v[0] = inlist[i-1];
+ header.v[1] = inlist[i];
+ header.v[2] = inlist[0]; /* keep in v[2] for flatshading */
{
- unsigned tmp0 = header.v[0]->edgeflag;
+ unsigned tmp1 = header.v[1]->edgeflag;
unsigned tmp2 = header.v[2]->edgeflag;
- if (i != 2) header.v[0]->edgeflag = 0;
- if (i != n-1) header.v[2]->edgeflag = 0;
+ if (i != n-1) header.v[1]->edgeflag = 0;
+ if (i != 2) header.v[2]->edgeflag = 0;
header.edgeflags = ((header.v[0]->edgeflag << 0) |
(header.v[1]->edgeflag << 1) |
@@ -180,27 +183,13 @@ static void emit_poly( struct draw_stage *stage,
stage->next->tri( stage->next, &header );
- header.v[0]->edgeflag = tmp0;
+ header.v[1]->edgeflag = tmp1;
header.v[2]->edgeflag = tmp2;
}
}
}
-#if 0
-static void emit_poly( struct draw_stage *stage )
-{
- unsigned i;
-
- for (i = 2; i < n; i++) {
- header->v[0] = inlist[0];
- header->v[1] = inlist[i-1];
- header->v[2] = inlist[i];
-
- stage->next->tri( stage->next, header );
- }
-}
-#endif
/* Clip a triangle against the viewport and user clip planes.
@@ -281,6 +270,18 @@ do_clip_tri( struct draw_stage *stage,
}
}
+ /* If flat-shading, copy color to new provoking vertex.
+ */
+ if (clipper->flat && inlist[0] != header->v[2]) {
+ if (1) {
+ inlist[0] = dup_vert(stage, inlist[0], tmpnr++);
+ }
+
+ copy_colors(stage, inlist[0], header->v[2]);
+ }
+
+
+
/* Emit the polygon as triangles to the setup stage:
*/
if (n >= 3)
@@ -328,6 +329,10 @@ do_clip_line( struct draw_stage *stage,
if (v0->clipmask) {
interp( clipper, stage->tmp[0], t0, v0, v1 );
+
+ if (clipper->flat)
+ copy_colors(stage, stage->tmp[0], v0);
+
newprim.v[0] = stage->tmp[0];
}
else {
@@ -393,8 +398,55 @@ clip_tri( struct draw_stage *stage,
}
}
-static void clip_flush( struct draw_stage *stage, unsigned flags )
+/* Update state. Could further delay this until we hit the first
+ * primitive that really requires clipping.
+ */
+static void
+clip_init_state( struct draw_stage *stage )
+{
+ struct clipper *clipper = clipper_stage( stage );
+
+ clipper->flat = stage->draw->rasterizer->flatshade ? TRUE : FALSE;
+
+ if (clipper->flat) {
+ const struct pipe_shader_state *vs = stage->draw->vertex_shader->state;
+ uint i;
+
+ clipper->num_color_attribs = 0;
+ for (i = 0; i < vs->num_outputs; i++) {
+ if (vs->output_semantic_name[i] == TGSI_SEMANTIC_COLOR ||
+ vs->output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) {
+ clipper->color_attribs[clipper->num_color_attribs++] = i;
+ }
+ }
+ }
+
+ stage->tri = clip_tri;
+ stage->line = clip_line;
+}
+
+
+
+static void clip_first_tri( struct draw_stage *stage,
+ struct prim_header *header )
+{
+ clip_init_state( stage );
+ stage->tri( stage, header );
+}
+
+static void clip_first_line( struct draw_stage *stage,
+ struct prim_header *header )
+{
+ clip_init_state( stage );
+ stage->line( stage, header );
+}
+
+
+static void clip_flush( struct draw_stage *stage,
+ unsigned flags )
{
+ stage->tri = clip_first_tri;
+ stage->line = clip_first_line;
stage->next->flush( stage->next, flags );
}
@@ -420,12 +472,12 @@ struct draw_stage *draw_clip_stage( struct draw_context *draw )
{
struct clipper *clipper = CALLOC_STRUCT(clipper);
- draw_alloc_tmps( &clipper->stage, MAX_CLIPPED_VERTICES );
+ draw_alloc_tmps( &clipper->stage, MAX_CLIPPED_VERTICES+1 );
clipper->stage.draw = draw;
clipper->stage.point = clip_point;
- clipper->stage.line = clip_line;
- clipper->stage.tri = clip_tri;
+ clipper->stage.line = clip_first_line;
+ clipper->stage.tri = clip_first_tri;
clipper->stage.flush = clip_flush;
clipper->stage.reset_stipple_counter = clip_reset_stipple_counter;
clipper->stage.destroy = clip_destroy;
diff --git a/src/mesa/pipe/draw/draw_context.c b/src/mesa/pipe/draw/draw_context.c
index e8ca1f035b..b15f57c824 100644
--- a/src/mesa/pipe/draw/draw_context.c
+++ b/src/mesa/pipe/draw/draw_context.c
@@ -71,12 +71,15 @@ struct draw_context *draw_create( void )
*/
{
uint i;
- char *tmp = (char*) MALLOC( Elements(draw->vcache.vertex) * MAX_VERTEX_SIZE );
+ const unsigned size = (MAX_VERTEX_SIZE + 0x0f) & ~0x0f;
+ char *tmp = align_malloc(Elements(draw->vcache.vertex) * size, 16);
for (i = 0; i < Elements(draw->vcache.vertex); i++)
- draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * MAX_VERTEX_SIZE);
+ draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * size);
}
+ draw->shader_queue_flush = draw_vertex_shader_queue_flush;
+
draw->convert_wide_points = TRUE;
draw->convert_wide_lines = TRUE;
@@ -103,7 +106,7 @@ void draw_destroy( struct draw_context *draw )
if (draw->pipeline.rasterize)
draw->pipeline.rasterize->destroy( draw->pipeline.rasterize );
tgsi_exec_machine_free_data(&draw->machine);
- FREE( draw->vcache.vertex[0] ); /* Frees all the vertices. */
+ align_free( draw->vcache.vertex[0] ); /* Frees all the vertices. */
FREE( draw );
}
diff --git a/src/mesa/pipe/draw/draw_prim.c b/src/mesa/pipe/draw/draw_prim.c
index 243381aec0..51e2242719 100644
--- a/src/mesa/pipe/draw/draw_prim.c
+++ b/src/mesa/pipe/draw/draw_prim.c
@@ -30,6 +30,8 @@
* Keith Whitwell <keith@tungstengraphics.com>
*/
+#include "pipe/p_debug.h"
+
#include "draw_private.h"
#include "draw_context.h"
@@ -60,38 +62,55 @@ static void draw_prim_queue_flush( struct draw_context *draw )
unsigned i;
if (0)
- fprintf(stdout,"Flushing with %d prims, %d verts\n",
- draw->pq.queue_nr, draw->vs.queue_nr);
+ debug_printf("Flushing with %d prims, %d verts\n",
+ draw->pq.queue_nr, draw->vs.queue_nr);
- if (draw->pq.queue_nr == 0)
- return;
+ assert (draw->pq.queue_nr != 0);
/* NOTE: we cannot save draw->pipeline->first in a local var because
* draw->pipeline->first is often changed by the first call to tri(),
* line(), etc.
*/
- switch (draw->reduced_prim) {
- case RP_TRI:
- for (i = 0; i < draw->pq.queue_nr; i++) {
- if (draw->pq.queue[i].reset_line_stipple)
- draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-
- draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+ if (draw->rasterizer->line_stipple_enable) {
+ switch (draw->reduced_prim) {
+ case RP_TRI:
+ for (i = 0; i < draw->pq.queue_nr; i++) {
+ if (draw->pq.queue[i].reset_line_stipple)
+ draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+
+ draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+ }
+ break;
+ case RP_LINE:
+ for (i = 0; i < draw->pq.queue_nr; i++) {
+ if (draw->pq.queue[i].reset_line_stipple)
+ draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+
+ draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+ }
+ break;
+ case RP_POINT:
+ draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+ for (i = 0; i < draw->pq.queue_nr; i++)
+ draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
+ break;
}
- break;
- case RP_LINE:
- for (i = 0; i < draw->pq.queue_nr; i++) {
- if (draw->pq.queue[i].reset_line_stipple)
- draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-
- draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+ }
+ else {
+ switch (draw->reduced_prim) {
+ case RP_TRI:
+ for (i = 0; i < draw->pq.queue_nr; i++)
+ draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+ break;
+ case RP_LINE:
+ for (i = 0; i < draw->pq.queue_nr; i++)
+ draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+ break;
+ case RP_POINT:
+ for (i = 0; i < draw->pq.queue_nr; i++)
+ draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
+ break;
}
- break;
- case RP_POINT:
- draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
- for (i = 0; i < draw->pq.queue_nr; i++)
- draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
- break;
}
draw->pq.queue_nr = 0;
@@ -103,16 +122,18 @@ static void draw_prim_queue_flush( struct draw_context *draw )
void draw_do_flush( struct draw_context *draw, unsigned flags )
{
if (0)
- fprintf(stdout,"Flushing with %d verts, %d prims\n",
- draw->vs.queue_nr,
- draw->pq.queue_nr );
+ debug_printf("Flushing with %d verts, %d prims\n",
+ draw->vs.queue_nr,
+ draw->pq.queue_nr );
if (flags >= DRAW_FLUSH_SHADER_QUEUE) {
- draw_vertex_shader_queue_flush(draw);
+ if (draw->vs.queue_nr)
+ (*draw->shader_queue_flush)(draw);
if (flags >= DRAW_FLUSH_PRIM_QUEUE) {
- draw_prim_queue_flush(draw);
+ if (draw->pq.queue_nr)
+ draw_prim_queue_flush(draw);
if (flags >= DRAW_FLUSH_VERTEX_CACHE) {
draw_vertex_cache_invalidate(draw);
@@ -138,11 +159,11 @@ static struct prim_header *get_queued_prim( struct draw_context *draw,
unsigned nr_verts )
{
if (!draw_vertex_cache_check_space( draw, nr_verts )) {
-// fprintf(stderr, "v");
+// debug_printf("v");
draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE );
}
else if (draw->pq.queue_nr == PRIM_QUEUE_LENGTH) {
-// fprintf(stderr, "p");
+// debug_printf("p");
draw_do_flush( draw, DRAW_FLUSH_PRIM_QUEUE );
}
@@ -230,7 +251,7 @@ static void do_ef_triangle( struct draw_context *draw,
}
-static void do_quad( struct draw_context *draw,
+static void do_ef_quad( struct draw_context *draw,
unsigned v0,
unsigned v1,
unsigned v2,
@@ -242,6 +263,16 @@ static void do_quad( struct draw_context *draw,
do_ef_triangle( draw, 0, omitEdge3, v1, v2, v3 );
}
+static void do_quad( struct draw_context *draw,
+ unsigned v0,
+ unsigned v1,
+ unsigned v2,
+ unsigned v3 )
+{
+ do_triangle( draw, v0, v1, v3 );
+ do_triangle( draw, v1, v2, v3 );
+}
+
/**
* Main entrypoint to draw some number of points/lines/triangles
@@ -251,8 +282,10 @@ draw_prim( struct draw_context *draw,
unsigned prim, unsigned start, unsigned count )
{
unsigned i;
+ boolean unfilled = (draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
+ draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL);
-// _mesa_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count );
+// debug_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count );
switch (prim) {
case PIPE_PRIM_POINTS:
@@ -288,24 +321,32 @@ draw_prim( struct draw_context *draw,
break;
case PIPE_PRIM_LINE_STRIP:
- if (count >= 2) {
- for (i = 1; i < count; i++) {
- do_line( draw,
- i == 1,
- start + i - 1,
- start + i );
- }
+ for (i = 1; i < count; i++) {
+ do_line( draw,
+ i == 1,
+ start + i - 1,
+ start + i );
}
break;
case PIPE_PRIM_TRIANGLES:
- for (i = 0; i+2 < count; i += 3) {
- do_ef_triangle( draw,
- 1,
- ~0,
+ if (unfilled) {
+ for (i = 0; i+2 < count; i += 3) {
+ do_ef_triangle( draw,
+ 1,
+ ~0,
+ start + i + 0,
+ start + i + 1,
+ start + i + 2 );
+ }
+ }
+ else {
+ for (i = 0; i+2 < count; i += 3) {
+ do_triangle( draw,
start + i + 0,
start + i + 1,
start + i + 2 );
+ }
}
break;
@@ -339,27 +380,49 @@ draw_prim( struct draw_context *draw,
case PIPE_PRIM_QUADS:
- for (i = 0; i+3 < count; i += 4) {
- do_quad( draw,
- start + i + 0,
- start + i + 1,
- start + i + 2,
- start + i + 3);
+ if (unfilled) {
+ for (i = 0; i+3 < count; i += 4) {
+ do_ef_quad( draw,
+ start + i + 0,
+ start + i + 1,
+ start + i + 2,
+ start + i + 3);
+ }
+ }
+ else {
+ for (i = 0; i+3 < count; i += 4) {
+ do_quad( draw,
+ start + i + 0,
+ start + i + 1,
+ start + i + 2,
+ start + i + 3);
+ }
}
break;
case PIPE_PRIM_QUAD_STRIP:
- for (i = 0; i+3 < count; i += 2) {
- do_quad( draw,
- start + i + 2,
- start + i + 0,
- start + i + 1,
- start + i + 3);
+ if (unfilled) {
+ for (i = 0; i+3 < count; i += 2) {
+ do_ef_quad( draw,
+ start + i + 2,
+ start + i + 0,
+ start + i + 1,
+ start + i + 3);
+ }
+ }
+ else {
+ for (i = 0; i+3 < count; i += 2) {
+ do_quad( draw,
+ start + i + 2,
+ start + i + 0,
+ start + i + 1,
+ start + i + 3);
+ }
}
break;
case PIPE_PRIM_POLYGON:
- if (count >= 3) {
+ if (unfilled) {
unsigned ef_mask = (1<<2) | (1<<0);
for (i = 0; i+2 < count; i++) {
@@ -377,6 +440,14 @@ draw_prim( struct draw_context *draw,
ef_mask &= ~(1<<2);
}
}
+ else {
+ for (i = 0; i+2 < count; i++) {
+ do_triangle( draw,
+ start + i + 1,
+ start + i + 2,
+ start + 0);
+ }
+ }
break;
default:
diff --git a/src/mesa/pipe/draw/draw_private.h b/src/mesa/pipe/draw/draw_private.h
index 1e59f5bd8d..7782db0477 100644
--- a/src/mesa/pipe/draw/draw_private.h
+++ b/src/mesa/pipe/draw/draw_private.h
@@ -141,6 +141,10 @@ struct draw_vertex_shader {
/* Internal function for vertex fetch.
*/
typedef void (*fetch_func)(const void *ptr, float *attrib);
+typedef void (*full_fetch_func)( struct draw_context *draw,
+ struct tgsi_exec_machine *machine,
+ const unsigned *elts,
+ unsigned count );
@@ -210,6 +214,7 @@ struct draw_context
unsigned pitch[PIPE_ATTRIB_MAX];
fetch_func fetch[PIPE_ATTRIB_MAX];
unsigned nr_attrs;
+ full_fetch_func fetch_func;
} vertex_fetch;
/* Post-tnl vertex cache:
@@ -235,6 +240,11 @@ struct draw_context
unsigned queue_nr;
} vs;
+ /**
+ * Run the vertex shader on all vertices in the vertex queue.
+ */
+ void (*shader_queue_flush)(struct draw_context *draw);
+
/* Prim pipeline queue:
*/
struct {
@@ -249,6 +259,8 @@ struct draw_context
#ifdef MESA_LLVM
struct gallivm_cpu_engine *engine;
#endif
+
+ void *driver_private;
};
@@ -287,10 +299,6 @@ extern void draw_vertex_shader_queue_flush_llvm( struct draw_context *draw );
struct tgsi_exec_machine;
extern void draw_update_vertex_fetch( struct draw_context *draw );
-extern void draw_vertex_fetch( struct draw_context *draw,
- struct tgsi_exec_machine *machine,
- const unsigned *elts,
- unsigned count );
#define DRAW_FLUSH_SHADER_QUEUE 0x1 /* sized not to overflow, never raised */
diff --git a/src/mesa/pipe/draw/draw_validate.c b/src/mesa/pipe/draw/draw_validate.c
index 86d5a5f814..4375ebabbc 100644
--- a/src/mesa/pipe/draw/draw_validate.c
+++ b/src/mesa/pipe/draw/draw_validate.c
@@ -78,6 +78,11 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
precalc_flat = 1; /* only needed for triangles really */
need_det = 1;
}
+
+ if (draw->rasterizer->flatshade && precalc_flat) {
+ draw->pipeline.flatshade->next = next;
+ next = draw->pipeline.flatshade;
+ }
if (draw->rasterizer->offset_cw ||
draw->rasterizer->offset_ccw) {
@@ -110,13 +115,8 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
{
draw->pipeline.clip->next = next;
next = draw->pipeline.clip;
- precalc_flat = 1; /* XXX: FIX ME! Only needed for clipped prims */
}
- if (draw->rasterizer->flatshade && precalc_flat) {
- draw->pipeline.flatshade->next = next;
- next = draw->pipeline.flatshade;
- }
draw->pipeline.first = next;
return next;
diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c
index 1e260c6156..be96c8fdeb 100644
--- a/src/mesa/pipe/draw/draw_vbuf.c
+++ b/src/mesa/pipe/draw/draw_vbuf.c
@@ -34,13 +34,14 @@
*/
-#include <assert.h>
-
-#include "pipe/draw/draw_vbuf.h"
-#include "pipe/draw/draw_private.h"
-#include "pipe/draw/draw_vertex.h"
+#include "pipe/p_debug.h"
#include "pipe/p_util.h"
+#include "draw_vbuf.h"
+#include "draw_private.h"
+#include "draw_vertex.h"
+#include "draw_vf.h"
+
/**
* Vertex buffer emit stage.
@@ -55,6 +56,8 @@ struct vbuf_stage {
/** Vertex size in bytes */
unsigned vertex_size;
+ struct draw_vertex_fetch *vf;
+
/* FIXME: we have no guarantee that 'unsigned' is 32bit */
/** Vertices in hardware format */
@@ -110,88 +113,175 @@ check_space( struct vbuf_stage *vbuf, unsigned nr )
}
-/**
- * Extract the needed fields from post-transformed vertex and emit
- * a hardware(driver) vertex.
- * Recall that the vertices are constructed by the 'draw' module and
- * have a couple of slots at the beginning (1-dword header, 4-dword
- * clip pos) that we ignore here. We only use the vertex->data[] fields.
- */
-static INLINE void
-emit_vertex( struct vbuf_stage *vbuf,
- struct vertex_header *vertex )
+#if 0
+static INLINE void
+dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
{
- const struct vertex_info *vinfo = vbuf->vinfo;
-
- uint i;
- uint count = 0; /* for debug/sanity */
-
assert(vinfo == vbuf->render->get_vertex_info(vbuf->render));
-
-// fprintf(stderr, "emit vertex %d to %p\n",
-// vbuf->nr_vertices, vbuf->vertex_ptr);
-
- if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
- if(vertex->vertex_id < vbuf->nr_vertices)
- return;
- else
- fprintf(stderr, "Bad vertex id 0x%04x (>= 0x%04x)\n",
- vertex->vertex_id, vbuf->nr_vertices);
- return;
- }
-
- vertex->vertex_id = vbuf->nr_vertices++;
+ unsigned i, j, k;
for (i = 0; i < vinfo->num_attribs; i++) {
- uint j = vinfo->src_index[i];
+ j = vinfo->src_index[i];
switch (vinfo->emit[i]) {
case EMIT_OMIT:
- /* no-op */
+ debug_printf("EMIT_OMIT:");
break;
case EMIT_ALL:
- /* just copy the whole vertex as-is to the vbuf */
assert(i == 0);
- memcpy(vbuf->vertex_ptr, vertex, vinfo->size * 4);
- vbuf->vertex_ptr += vinfo->size;
- return;
+ assert(j == 0);
+ debug_printf("EMIT_ALL:\t");
+ for(k = 0; k < vinfo->size*4; ++k)
+ debug_printf("%02x ", *data++);
+ break;
case EMIT_1F:
- *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
- count++;
+ debug_printf("EMIT_1F:\t");
+ debug_printf("%f ", *(float *)data); data += sizeof(float);
break;
case EMIT_1F_PSIZE:
- *vbuf->vertex_ptr++ = fui(vbuf->stage.draw->rasterizer->point_size);
- count++;
+ debug_printf("EMIT_1F_PSIZE:\t");
+ debug_printf("%f ", *(float *)data); data += sizeof(float);
break;
case EMIT_2F:
- *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
- *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
- count += 2;
+ debug_printf("EMIT_2F:\t");
+ debug_printf("%f ", *(float *)data); data += sizeof(float);
+ debug_printf("%f ", *(float *)data); data += sizeof(float);
break;
case EMIT_3F:
- *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
- *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
- *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
- count += 3;
+ debug_printf("EMIT_3F:\t");
+ debug_printf("%f ", *(float *)data); data += sizeof(float);
+ debug_printf("%f ", *(float *)data); data += sizeof(float);
+ debug_printf("%f ", *(float *)data); data += sizeof(float);
+ data += sizeof(float);
break;
case EMIT_4F:
- *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
- *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
- *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
- *vbuf->vertex_ptr++ = fui(vertex->data[j][3]);
- count += 4;
+ debug_printf("EMIT_4F:\t");
+ debug_printf("%f ", *(float *)data); data += sizeof(float);
+ debug_printf("%f ", *(float *)data); data += sizeof(float);
+ debug_printf("%f ", *(float *)data); data += sizeof(float);
+ debug_printf("%f ", *(float *)data); data += sizeof(float);
break;
case EMIT_4UB:
- *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
- float_to_ubyte( vertex->data[j][1] ),
- float_to_ubyte( vertex->data[j][0] ),
- float_to_ubyte( vertex->data[j][3] ));
- count += 1;
+ debug_printf("EMIT_4UB:\t");
+ debug_printf("%u ", *data++);
+ debug_printf("%u ", *data++);
+ debug_printf("%u ", *data++);
+ debug_printf("%u ", *data++);
break;
default:
assert(0);
}
+ debug_printf("\n");
+ }
+ debug_printf("\n");
+}
+#endif
+
+
+/**
+ * Extract the needed fields from post-transformed vertex and emit
+ * a hardware(driver) vertex.
+ * Recall that the vertices are constructed by the 'draw' module and
+ * have a couple of slots at the beginning (1-dword header, 4-dword
+ * clip pos) that we ignore here. We only use the vertex->data[] fields.
+ */
+static INLINE void
+emit_vertex( struct vbuf_stage *vbuf,
+ struct vertex_header *vertex )
+{
+#if 0
+ debug_printf("emit vertex %d to %p\n",
+ vbuf->nr_vertices, vbuf->vertex_ptr);
+#endif
+
+ if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
+ if(vertex->vertex_id < vbuf->nr_vertices)
+ return;
+ else
+ debug_printf("Bad vertex id 0x%04x (>= 0x%04x)\n",
+ vertex->vertex_id, vbuf->nr_vertices);
+ return;
+ }
+
+ vertex->vertex_id = vbuf->nr_vertices++;
+
+ if(!vbuf->vf) {
+ const struct vertex_info *vinfo = vbuf->vinfo;
+ uint i;
+ uint count = 0; /* for debug/sanity */
+
+ assert(vinfo == vbuf->render->get_vertex_info(vbuf->render));
+
+ for (i = 0; i < vinfo->num_attribs; i++) {
+ uint j = vinfo->src_index[i];
+ switch (vinfo->emit[i]) {
+ case EMIT_OMIT:
+ /* no-op */
+ break;
+ case EMIT_ALL:
+ /* just copy the whole vertex as-is to the vbuf */
+ assert(i == 0);
+ assert(j == 0);
+ memcpy(vbuf->vertex_ptr, vertex, vinfo->size * 4);
+ vbuf->vertex_ptr += vinfo->size;
+ count += vinfo->size;
+ break;
+ case EMIT_1F:
+ *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+ count++;
+ break;
+ case EMIT_1F_PSIZE:
+ *vbuf->vertex_ptr++ = fui(vbuf->stage.draw->rasterizer->point_size);
+ count++;
+ break;
+ case EMIT_2F:
+ *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+ *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+ count += 2;
+ break;
+ case EMIT_3F:
+ *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+ *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+ *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
+ count += 3;
+ break;
+ case EMIT_4F:
+ *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+ *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+ *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
+ *vbuf->vertex_ptr++ = fui(vertex->data[j][3]);
+ count += 4;
+ break;
+ case EMIT_4UB:
+ *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
+ float_to_ubyte( vertex->data[j][1] ),
+ float_to_ubyte( vertex->data[j][0] ),
+ float_to_ubyte( vertex->data[j][3] ));
+ count += 1;
+ break;
+ default:
+ assert(0);
+ }
+ }
+ assert(count == vinfo->size);
+#if 0
+ {
+ static float data[256];
+ draw_vf_emit_vertex(vbuf->vf, vertex, data);
+ if(memcmp((uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size, data, vbuf->vertex_size)) {
+ debug_printf("With VF:\n");
+ dump_emitted_vertex(vbuf->vinfo, (uint8_t *)data);
+ debug_printf("Without VF:\n");
+ dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size);
+ assert(0);
+ }
+ }
+#endif
+ }
+ else {
+ draw_vf_emit_vertex(vbuf->vf, vertex, vbuf->vertex_ptr);
+
+ vbuf->vertex_ptr += vbuf->vertex_size/4;
}
- assert(count == vinfo->size);
}
@@ -269,6 +359,10 @@ vbuf_set_prim( struct vbuf_stage *vbuf, uint newprim )
vbuf->vinfo = vinfo;
vbuf->vertex_size = vertex_size;
+ if(vbuf->vf)
+ draw_vf_set_vertex_info(vbuf->vf,
+ vbuf->vinfo,
+ vbuf->stage.draw->rasterizer->point_size);
if (!vbuf->vertices)
vbuf_alloc_vertices(vbuf);
@@ -423,7 +517,12 @@ static void vbuf_destroy( struct draw_stage *stage )
{
struct vbuf_stage *vbuf = vbuf_stage( stage );
- align_free( vbuf->indices );
+ if(vbuf->indices)
+ align_free( vbuf->indices );
+
+ if(vbuf->vf)
+ draw_vf_destroy( vbuf->vf );
+
FREE( stage );
}
@@ -436,6 +535,9 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
{
struct vbuf_stage *vbuf = CALLOC_STRUCT(vbuf_stage);
+ if(!vbuf)
+ return NULL;
+
vbuf->stage.draw = draw;
vbuf->stage.point = vbuf_first_point;
vbuf->stage.line = vbuf_first_line;
@@ -450,11 +552,16 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
vbuf->max_indices = render->max_indices;
vbuf->indices = (ushort *)
align_malloc( vbuf->max_indices * sizeof(vbuf->indices[0]), 16 );
+ if(!vbuf->indices)
+ vbuf_destroy(&vbuf->stage);
vbuf->vertices = NULL;
vbuf->vertex_ptr = vbuf->vertices;
vbuf->prim = ~0;
+ if(!GETENV("GALLIUM_NOVF"))
+ vbuf->vf = draw_vf_create();
+
return &vbuf->stage;
}
diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index fb64723a19..e13df04605 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -62,50 +62,244 @@ fetch_##NAME(const void *ptr, float *attrib) \
} \
}
+#define CVT_64_FLOAT (float) ((double *) ptr)[i]
#define CVT_32_FLOAT ((float *) ptr)[i]
+
+#define CVT_8_USCALED (float) ((unsigned char *) ptr)[i]
+#define CVT_16_USCALED (float) ((unsigned short *) ptr)[i]
+#define CVT_32_USCALED (float) ((unsigned int *) ptr)[i]
+
+#define CVT_8_SSCALED (float) ((char *) ptr)[i]
+#define CVT_16_SSCALED (float) ((short *) ptr)[i]
#define CVT_32_SSCALED (float) ((int *) ptr)[i]
+
#define CVT_8_UNORM (float) ((unsigned char *) ptr)[i] / 255.0f
+#define CVT_16_UNORM (float) ((unsigned short *) ptr)[i] / 65535.0f
+#define CVT_32_UNORM (float) ((unsigned int *) ptr)[i] / 4294967295.0f
+
+#define CVT_8_SNORM (float) ((char *) ptr)[i] / 127.0f
+#define CVT_16_SNORM (float) ((short *) ptr)[i] / 32767.0f
+#define CVT_32_SNORM (float) ((int *) ptr)[i] / 2147483647.0f
+
+FETCH_ATTRIB( R64G64B64A64_FLOAT, 4, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64B64_FLOAT, 3, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64_FLOAT, 2, CVT_64_FLOAT )
+FETCH_ATTRIB( R64_FLOAT, 1, CVT_64_FLOAT )
FETCH_ATTRIB( R32G32B32A32_FLOAT, 4, CVT_32_FLOAT )
FETCH_ATTRIB( R32G32B32_FLOAT, 3, CVT_32_FLOAT )
FETCH_ATTRIB( R32G32_FLOAT, 2, CVT_32_FLOAT )
FETCH_ATTRIB( R32_FLOAT, 1, CVT_32_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_USCALED, 4, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32B32_USCALED, 3, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32_USCALED, 2, CVT_32_USCALED )
+FETCH_ATTRIB( R32_USCALED, 1, CVT_32_USCALED )
+
FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED )
FETCH_ATTRIB( R32G32B32_SSCALED, 3, CVT_32_SSCALED )
FETCH_ATTRIB( R32G32_SSCALED, 2, CVT_32_SSCALED )
FETCH_ATTRIB( R32_SSCALED, 1, CVT_32_SSCALED )
+
+FETCH_ATTRIB( R32G32B32A32_UNORM, 4, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32B32_UNORM, 3, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32_UNORM, 2, CVT_32_UNORM )
+FETCH_ATTRIB( R32_UNORM, 1, CVT_32_UNORM )
+
+FETCH_ATTRIB( R32G32B32A32_SNORM, 4, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32B32_SNORM, 3, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32_SNORM, 2, CVT_32_SNORM )
+FETCH_ATTRIB( R32_SNORM, 1, CVT_32_SNORM )
+
+FETCH_ATTRIB( R16G16B16A16_USCALED, 4, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16B16_USCALED, 3, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16_USCALED, 2, CVT_16_USCALED )
+FETCH_ATTRIB( R16_USCALED, 1, CVT_16_USCALED )
+
+FETCH_ATTRIB( R16G16B16A16_SSCALED, 4, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16B16_SSCALED, 3, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16_SSCALED, 2, CVT_16_SSCALED )
+FETCH_ATTRIB( R16_SSCALED, 1, CVT_16_SSCALED )
+
+FETCH_ATTRIB( R16G16B16A16_UNORM, 4, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16B16_UNORM, 3, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16_UNORM, 2, CVT_16_UNORM )
+FETCH_ATTRIB( R16_UNORM, 1, CVT_16_UNORM )
+
+FETCH_ATTRIB( R16G16B16A16_SNORM, 4, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16B16_SNORM, 3, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16_SNORM, 2, CVT_16_SNORM )
+FETCH_ATTRIB( R16_SNORM, 1, CVT_16_SNORM )
+
+FETCH_ATTRIB( R8G8B8A8_USCALED, 4, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8B8_USCALED, 3, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8_USCALED, 2, CVT_8_USCALED )
+FETCH_ATTRIB( R8_USCALED, 1, CVT_8_USCALED )
+
+FETCH_ATTRIB( R8G8B8A8_SSCALED, 4, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8B8_SSCALED, 3, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8_SSCALED, 2, CVT_8_SSCALED )
+FETCH_ATTRIB( R8_SSCALED, 1, CVT_8_SSCALED )
+
+FETCH_ATTRIB( R8G8B8A8_UNORM, 4, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8B8_UNORM, 3, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8_UNORM, 2, CVT_8_UNORM )
+FETCH_ATTRIB( R8_UNORM, 1, CVT_8_UNORM )
+
+FETCH_ATTRIB( R8G8B8A8_SNORM, 4, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8B8_SNORM, 3, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8_SNORM, 2, CVT_8_SNORM )
+FETCH_ATTRIB( R8_SNORM, 1, CVT_8_SNORM )
+
FETCH_ATTRIB( A8R8G8B8_UNORM, 4, CVT_8_UNORM )
-FETCH_ATTRIB( R8G8B8A8_UNORM, 4, CVT_8_UNORM )
+//FETCH_ATTRIB( R8G8B8A8_UNORM, 4, CVT_8_UNORM )
static fetch_func get_fetch_func( enum pipe_format format )
{
+#if 0
+ {
+ char tmp[80];
+ pf_sprint_name(tmp, format);
+ debug_printf("%s: %s\n", __FUNCTION__, tmp);
+ }
+#endif
+
switch (format) {
- case PIPE_FORMAT_R32G32B32A32_FLOAT:
- return fetch_R32G32B32A32_FLOAT;
- case PIPE_FORMAT_R32G32B32_FLOAT:
- return fetch_R32G32B32_FLOAT;
- case PIPE_FORMAT_R32G32_FLOAT:
- return fetch_R32G32_FLOAT;
+ case PIPE_FORMAT_R64_FLOAT:
+ return fetch_R64_FLOAT;
+ case PIPE_FORMAT_R64G64_FLOAT:
+ return fetch_R64G64_FLOAT;
+ case PIPE_FORMAT_R64G64B64_FLOAT:
+ return fetch_R64G64B64_FLOAT;
+ case PIPE_FORMAT_R64G64B64A64_FLOAT:
+ return fetch_R64G64B64A64_FLOAT;
+
case PIPE_FORMAT_R32_FLOAT:
return fetch_R32_FLOAT;
- case PIPE_FORMAT_R32G32B32A32_SSCALED:
- return fetch_R32G32B32A32_SSCALED;
- case PIPE_FORMAT_R32G32B32_SSCALED:
- return fetch_R32G32B32_SSCALED;
- case PIPE_FORMAT_R32G32_SSCALED:
- return fetch_R32G32_SSCALED;
+ case PIPE_FORMAT_R32G32_FLOAT:
+ return fetch_R32G32_FLOAT;
+ case PIPE_FORMAT_R32G32B32_FLOAT:
+ return fetch_R32G32B32_FLOAT;
+ case PIPE_FORMAT_R32G32B32A32_FLOAT:
+ return fetch_R32G32B32A32_FLOAT;
+
+ case PIPE_FORMAT_R32_UNORM:
+ return fetch_R32_UNORM;
+ case PIPE_FORMAT_R32G32_UNORM:
+ return fetch_R32G32_UNORM;
+ case PIPE_FORMAT_R32G32B32_UNORM:
+ return fetch_R32G32B32_UNORM;
+ case PIPE_FORMAT_R32G32B32A32_UNORM:
+ return fetch_R32G32B32A32_UNORM;
+
+ case PIPE_FORMAT_R32_USCALED:
+ return fetch_R32_USCALED;
+ case PIPE_FORMAT_R32G32_USCALED:
+ return fetch_R32G32_USCALED;
+ case PIPE_FORMAT_R32G32B32_USCALED:
+ return fetch_R32G32B32_USCALED;
+ case PIPE_FORMAT_R32G32B32A32_USCALED:
+ return fetch_R32G32B32A32_USCALED;
+
+ case PIPE_FORMAT_R32_SNORM:
+ return fetch_R32_SNORM;
+ case PIPE_FORMAT_R32G32_SNORM:
+ return fetch_R32G32_SNORM;
+ case PIPE_FORMAT_R32G32B32_SNORM:
+ return fetch_R32G32B32_SNORM;
+ case PIPE_FORMAT_R32G32B32A32_SNORM:
+ return fetch_R32G32B32A32_SNORM;
+
case PIPE_FORMAT_R32_SSCALED:
return fetch_R32_SSCALED;
- case PIPE_FORMAT_A8R8G8B8_UNORM:
- return fetch_A8R8G8B8_UNORM;
+ case PIPE_FORMAT_R32G32_SSCALED:
+ return fetch_R32G32_SSCALED;
+ case PIPE_FORMAT_R32G32B32_SSCALED:
+ return fetch_R32G32B32_SSCALED;
+ case PIPE_FORMAT_R32G32B32A32_SSCALED:
+ return fetch_R32G32B32A32_SSCALED;
+
+ case PIPE_FORMAT_R16_UNORM:
+ return fetch_R16_UNORM;
+ case PIPE_FORMAT_R16G16_UNORM:
+ return fetch_R16G16_UNORM;
+ case PIPE_FORMAT_R16G16B16_UNORM:
+ return fetch_R16G16B16_UNORM;
+ case PIPE_FORMAT_R16G16B16A16_UNORM:
+ return fetch_R16G16B16A16_UNORM;
+
+ case PIPE_FORMAT_R16_USCALED:
+ return fetch_R16_USCALED;
+ case PIPE_FORMAT_R16G16_USCALED:
+ return fetch_R16G16_USCALED;
+ case PIPE_FORMAT_R16G16B16_USCALED:
+ return fetch_R16G16B16_USCALED;
+ case PIPE_FORMAT_R16G16B16A16_USCALED:
+ return fetch_R16G16B16A16_USCALED;
+
+ case PIPE_FORMAT_R16_SNORM:
+ return fetch_R16_SNORM;
+ case PIPE_FORMAT_R16G16_SNORM:
+ return fetch_R16G16_SNORM;
+ case PIPE_FORMAT_R16G16B16_SNORM:
+ return fetch_R16G16B16_SNORM;
+ case PIPE_FORMAT_R16G16B16A16_SNORM:
+ return fetch_R16G16B16A16_SNORM;
+
+ case PIPE_FORMAT_R16_SSCALED:
+ return fetch_R16_SSCALED;
+ case PIPE_FORMAT_R16G16_SSCALED:
+ return fetch_R16G16_SSCALED;
+ case PIPE_FORMAT_R16G16B16_SSCALED:
+ return fetch_R16G16B16_SSCALED;
+ case PIPE_FORMAT_R16G16B16A16_SSCALED:
+ return fetch_R16G16B16A16_SSCALED;
+
+ case PIPE_FORMAT_R8_UNORM:
+ return fetch_R8_UNORM;
+ case PIPE_FORMAT_R8G8_UNORM:
+ return fetch_R8G8_UNORM;
+ case PIPE_FORMAT_R8G8B8_UNORM:
+ return fetch_R8G8B8_UNORM;
case PIPE_FORMAT_R8G8B8A8_UNORM:
return fetch_R8G8B8A8_UNORM;
+
+ case PIPE_FORMAT_R8_USCALED:
+ return fetch_R8_USCALED;
+ case PIPE_FORMAT_R8G8_USCALED:
+ return fetch_R8G8_USCALED;
+ case PIPE_FORMAT_R8G8B8_USCALED:
+ return fetch_R8G8B8_USCALED;
+ case PIPE_FORMAT_R8G8B8A8_USCALED:
+ return fetch_R8G8B8A8_USCALED;
+
+ case PIPE_FORMAT_R8_SNORM:
+ return fetch_R8_SNORM;
+ case PIPE_FORMAT_R8G8_SNORM:
+ return fetch_R8G8_SNORM;
+ case PIPE_FORMAT_R8G8B8_SNORM:
+ return fetch_R8G8B8_SNORM;
+ case PIPE_FORMAT_R8G8B8A8_SNORM:
+ return fetch_R8G8B8A8_SNORM;
+
+ case PIPE_FORMAT_R8_SSCALED:
+ return fetch_R8_SSCALED;
+ case PIPE_FORMAT_R8G8_SSCALED:
+ return fetch_R8G8_SSCALED;
+ case PIPE_FORMAT_R8G8B8_SSCALED:
+ return fetch_R8G8B8_SSCALED;
+ case PIPE_FORMAT_R8G8B8A8_SSCALED:
+ return fetch_R8G8B8A8_SSCALED;
+
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ return fetch_A8R8G8B8_UNORM;
+
case 0:
- return NULL;
+ return NULL; /* not sure why this is needed */
+
default:
- /* Lots of missing cases! */
assert(0);
return NULL;
}
@@ -126,47 +320,108 @@ transpose_4x4( float *out, const float *in )
}
-
-void draw_update_vertex_fetch( struct draw_context *draw )
+
+static void fetch_xyz_rgb( struct draw_context *draw,
+ struct tgsi_exec_machine *machine,
+ const unsigned *elts,
+ unsigned count )
{
- unsigned nr_attrs, i;
+ const unsigned *pitch = draw->vertex_fetch.pitch;
+ const ubyte **src = draw->vertex_fetch.src_ptr;
+ int i;
- /* this may happend during context init */
- if (!draw->vertex_shader)
- return;
+ assert(count <= 4);
- nr_attrs = draw->vertex_shader->state->num_inputs;
+// debug_printf("%s\n", __FUNCTION__);
- for (i = 0; i < nr_attrs; i++) {
- unsigned buf = draw->vertex_element[i].vertex_buffer_index;
- enum pipe_format format = draw->vertex_element[i].src_format;
+ /* loop over vertex attributes (vertex shader inputs)
+ */
- draw->vertex_fetch.src_ptr[i] = (const ubyte *) draw->user.vbuffer[buf] +
- draw->vertex_buffer[buf].buffer_offset +
- draw->vertex_element[i].src_offset;
+ for (i = 0; i < 4; i++) {
+ {
+ const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+ float *out = &machine->Inputs[0].xyzw[0].f[i];
+ out[0] = in[0];
+ out[4] = in[1];
+ out[8] = in[2];
+ out[12] = 1.0f;
+ }
- draw->vertex_fetch.pitch[i] = draw->vertex_buffer[buf].pitch;
- draw->vertex_fetch.fetch[i] = get_fetch_func( format );
+ {
+ const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+ float *out = &machine->Inputs[1].xyzw[0].f[i];
+ out[0] = in[0];
+ out[4] = in[1];
+ out[8] = in[2];
+ out[12] = 1.0f;
+ }
}
+}
- draw->vertex_fetch.nr_attrs = nr_attrs;
+
+
+
+/**
+ * Hardwired fetch for exactly three vertex shader inputs.
+ *
+ * Inputs 0 and 1 are read as three floats each and input 2 as two floats.
+ * The missing components are filled in: w = 1 for inputs 0 and 1, z = 0 and
+ * w = 1 for input 2.  draw_update_vertex_fetch() selects this routine for
+ * the R32G32B32_FLOAT / R32G32B32_FLOAT / R32G32_FLOAT element layout.
+ *
+ * All four vertex slots are always written, so 'elts' must hold four
+ * usable indices even when 'count' is smaller.
+ */
+static void fetch_xyz_rgb_st( struct draw_context *draw,
+                              struct tgsi_exec_machine *machine,
+                              const unsigned *elts,
+                              unsigned count )
+{
+   const unsigned *stride = draw->vertex_fetch.pitch;
+   const ubyte **base = draw->vertex_fetch.src_ptr;
+   int vert;
+
+   assert(count <= 4);
+
+   /* loop over the four vertex slots of the machine inputs */
+   for (vert = 0; vert < 4; vert++) {
+      const float *attr;
+      float *dst;
+
+      /* input 0: x, y, z from the buffer, w forced to 1 */
+      attr = (const float *)(base[0] + elts[vert] * stride[0]);
+      dst = &machine->Inputs[0].xyzw[0].f[vert];
+      dst[0] = attr[0];
+      dst[4] = attr[1];
+      dst[8] = attr[2];
+      dst[12] = 1.0f;
+
+      /* input 1: three components from the buffer, fourth forced to 1 */
+      attr = (const float *)(base[1] + elts[vert] * stride[1]);
+      dst = &machine->Inputs[1].xyzw[0].f[vert];
+      dst[0] = attr[0];
+      dst[4] = attr[1];
+      dst[8] = attr[2];
+      dst[12] = 1.0f;
+
+      /* input 2: two components from the buffer, then 0 and 1 */
+      attr = (const float *)(base[2] + elts[vert] * stride[2]);
+      dst = &machine->Inputs[2].xyzw[0].f[vert];
+      dst[0] = attr[0];
+      dst[4] = attr[1];
+      dst[8] = 0.0f;
+      dst[12] = 1.0f;
+   }
+}
+
+
/**
* Fetch vertex attributes for 'count' vertices.
*/
-void draw_vertex_fetch( struct draw_context *draw,
- struct tgsi_exec_machine *machine,
- const unsigned *elts,
- unsigned count )
+static void generic_vertex_fetch( struct draw_context *draw,
+ struct tgsi_exec_machine *machine,
+ const unsigned *elts,
+ unsigned count )
{
unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
unsigned attr;
assert(count <= 4);
-// _mesa_printf("%s %d\n", __FUNCTION__, count);
+// debug_printf("%s %d\n", __FUNCTION__, count);
/* loop over vertex attributes (vertex shader inputs)
*/
@@ -206,3 +461,50 @@ void draw_vertex_fetch( struct draw_context *draw,
}
}
+
+
+/**
+ * Refresh the cached per-attribute fetch state (source pointer, pitch and
+ * conversion function) from the current vertex elements and buffers, then
+ * choose the routine stored in draw->vertex_fetch.fetch_func.
+ *
+ * The generic routine is the default.  Two hardwired routines replace it
+ * when the shader has two or three inputs with the matching float formats.
+ */
+void draw_update_vertex_fetch( struct draw_context *draw )
+{
+   unsigned num_inputs, attr;
+
+   /* this may happen during context init */
+   if (!draw->vertex_shader)
+      return;
+
+   num_inputs = draw->vertex_shader->state->num_inputs;
+
+   for (attr = 0; attr < num_inputs; attr++) {
+      const unsigned vb = draw->vertex_element[attr].vertex_buffer_index;
+      const enum pipe_format fmt = draw->vertex_element[attr].src_format;
+
+      /* start of this attribute's data inside the user vertex buffer */
+      draw->vertex_fetch.src_ptr[attr] =
+         (const ubyte *) draw->user.vbuffer[vb] +
+         draw->vertex_buffer[vb].buffer_offset +
+         draw->vertex_element[attr].src_offset;
+
+      draw->vertex_fetch.pitch[attr] = draw->vertex_buffer[vb].pitch;
+      draw->vertex_fetch.fetch[attr] = get_fetch_func( fmt );
+   }
+
+   draw->vertex_fetch.nr_attrs = num_inputs;
+
+   /* generic path unless one of the hardwired layouts matches below */
+   draw->vertex_fetch.fetch_func = generic_vertex_fetch;
+
+   if (num_inputs == 2) {
+      if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+          draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT) {
+         draw->vertex_fetch.fetch_func = fetch_xyz_rgb;
+      }
+   }
+   else if (num_inputs == 3) {
+      if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+          draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+          draw->vertex_element[2].src_format == PIPE_FORMAT_R32G32_FLOAT) {
+         draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
+      }
+   }
+}
diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index 3041974b9a..5ca93aa615 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -110,7 +110,7 @@ run_vertex_program(struct draw_context *draw,
machine->Inputs = ALIGN16_ASSIGN(inputs);
machine->Outputs = ALIGN16_ASSIGN(outputs);
- draw_vertex_fetch( draw, machine, elts, count );
+ draw->vertex_fetch.fetch_func( draw, machine, elts, count );
/* run shader */
#if defined(__i386__) || defined(__386__)
@@ -121,11 +121,16 @@ run_vertex_program(struct draw_context *draw,
= (struct draw_vertex_shader *)draw->vertex_shader;
codegen_function func
= (codegen_function) x86_get_func( &shader->sse2_program );
- func(
- machine->Inputs,
- machine->Outputs,
- machine->Consts,
- machine->Temps );
+
+ if (func)
+ func(
+ machine->Inputs,
+ machine->Outputs,
+ machine->Consts,
+ machine->Temps );
+ else
+ /* interpreter */
+ tgsi_exec_machine_run( machine );
}
else
#endif
@@ -166,7 +171,7 @@ run_vertex_program(struct draw_context *draw,
vOut[j]->data[0][3] = w;
#if DBG_VS
- printf("output[%d]win: %f %f %f %f\n", j,
+ debug_printf("output[%d]win: %f %f %f %f\n", j,
vOut[j]->data[0][0],
vOut[j]->data[0][1],
vOut[j]->data[0][2],
@@ -181,7 +186,7 @@ run_vertex_program(struct draw_context *draw,
vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
#if DBG_VS
- printf("output[%d][%d]: %f %f %f %f\n", j, slot,
+ debug_printf("output[%d][%d]: %f %f %f %f\n", j, slot,
vOut[j]->data[slot][0],
vOut[j]->data[slot][1],
vOut[j]->data[slot][2],
@@ -199,13 +204,15 @@ run_vertex_program(struct draw_context *draw,
void
draw_vertex_shader_queue_flush(struct draw_context *draw)
{
- unsigned i, j;
+ unsigned i;
+
+ assert(draw->vs.queue_nr != 0);
/* XXX: do this on statechange:
*/
draw_update_vertex_fetch( draw );
-// fprintf(stderr, " q(%d) ", draw->vs.queue_nr );
+// debug_printf( " q(%d) ", draw->vs.queue_nr );
#ifdef MESA_LLVM
if (draw->vertex_shader->llvm_prog) {
draw_vertex_shader_queue_flush_llvm(draw);
@@ -217,14 +224,18 @@ draw_vertex_shader_queue_flush(struct draw_context *draw)
for (i = 0; i < draw->vs.queue_nr; i += 4) {
struct vertex_header *dests[4];
unsigned elts[4];
- int n;
+ int j, n = MIN2(4, draw->vs.queue_nr - i);
- for (j = 0; j < 4; j++) {
+ for (j = 0; j < n; j++) {
elts[j] = draw->vs.queue[i + j].elt;
dests[j] = draw->vs.queue[i + j].dest;
}
- n = MIN2(4, draw->vs.queue_nr - i);
+ for ( ; j < 4; j++) {
+ elts[j] = elts[0];
+ dests[j] = dests[0];
+ }
+
assert(n > 0);
assert(n <= 4);
@@ -263,7 +274,12 @@ draw_create_vertex_shader(struct draw_context *draw,
struct pipe_shader_state *sh = (struct pipe_shader_state *) shader;
x86_init_func( &vs->sse2_program );
- tgsi_emit_sse2( (struct tgsi_token *) sh->tokens, &vs->sse2_program );
+ if (!tgsi_emit_sse2( (struct tgsi_token *) sh->tokens,
+ &vs->sse2_program )) {
+ x86_release_func( (struct x86_function *) &vs->sse2_program );
+ fprintf(stdout /*err*/,
+ "tgsi_emit_sse2() failed, falling back to interpreter\n");
+ }
}
#endif
diff --git a/src/mesa/pipe/draw/draw_vertex_shader_llvm.c b/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
index 4228c4f388..63551c993e 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
@@ -152,7 +152,7 @@ void draw_vertex_shader_queue_flush_llvm(struct draw_context *draw)
z = vOut->clip[2] = dests[0][2];
w = vOut->clip[3] = dests[0][3];
#if DBG
- printf("output %d: %f %f %f %f\n", 0, x, y, z, w);
+ debug_printf("output %d: %f %f %f %f\n", 0, x, y, z, w);
#endif
vOut->clipmask = compute_clipmask(vOut->clip, draw->plane, draw->nr_planes);
@@ -179,7 +179,7 @@ void draw_vertex_shader_queue_flush_llvm(struct draw_context *draw)
vOut->data[slot][3] = dests[slot][3];
#if DBG
- printf("output %d: %f %f %f %f\n", slot,
+ debug_printf("output %d: %f %f %f %f\n", slot,
vOut->data[slot][0],
vOut->data[slot][1],
vOut->data[slot][2],
diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
new file mode 100644
index 0000000000..f23d7fcec5
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -0,0 +1,428 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+
+#include <stddef.h>
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_util.h"
+
+#include "draw_vf.h"
+
+
+#define DRAW_VF_DBG 0
+
+
+/* TODO: remove this */
+extern void
+_mesa_exec_free( void *addr );
+
+
+/**
+ * Test whether the current attribute layout of 'vf' is the one recorded in
+ * fastpath 'fp'.
+ *
+ * Attribute count, and per-attribute format, input size and output offset
+ * must always agree.  The vertex stride and per-attribute input strides are
+ * compared only when the fastpath was registered with match_strides set.
+ *
+ * \return TRUE on a match, FALSE otherwise
+ */
+static boolean match_fastpath( struct draw_vertex_fetch *vf,
+                               const struct draw_vf_fastpath *fp)
+{
+   const unsigned n = vf->attr_count;
+   unsigned k;
+
+   if (n != fp->attr_count)
+      return FALSE;
+
+   for (k = 0; k < n; k++) {
+      const struct draw_vf_attr *have = &vf->attr[k];
+      const struct draw_vf_attr_type *want = &fp->attr[k];
+
+      if (have->format != want->format)
+         return FALSE;
+      if (have->inputsize != want->size)
+         return FALSE;
+      if (have->vertoffset != want->offset)
+         return FALSE;
+   }
+
+   /* stride-independent fastpaths are done at this point */
+   if (!fp->match_strides)
+      return TRUE;
+
+   if (vf->vertex_stride != fp->vertex_stride)
+      return FALSE;
+
+   for (k = 0; k < n; k++) {
+      if (vf->attr[k].inputstride != fp->attr[k].stride)
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+/**
+ * Walk the registered fastpath list and, on the first entry matching the
+ * current layout, install its function as vf->emit.
+ *
+ * \return TRUE if an entry matched (its function may itself be NULL),
+ *         FALSE if the list holds no match
+ */
+static boolean search_fastpath_emit( struct draw_vertex_fetch *vf )
+{
+   struct draw_vf_fastpath *entry = vf->fastpath;
+
+   while (entry) {
+      if (match_fastpath(vf, entry)) {
+         vf->emit = entry->func;
+         return TRUE;
+      }
+      entry = entry->next;
+   }
+
+   return FALSE;
+}
+
+/**
+ * Record the current layout of 'vf' together with its current emit function
+ * as a new fastpath at the head of vf->fastpath.
+ *
+ * \param vf             fetch object whose attr[], vertex_stride and emit
+ *                       are captured
+ * \param match_strides  if set, later lookups also require the vertex
+ *                       stride and per-attribute input strides to agree
+ *
+ * If memory cannot be allocated nothing is registered; the only effect is
+ * that this layout will not be found by later fastpath searches.
+ */
+void draw_vf_register_fastpath( struct draw_vertex_fetch *vf,
+                                boolean match_strides )
+{
+   struct draw_vf_fastpath *fastpath = CALLOC_STRUCT(draw_vf_fastpath);
+   unsigned i;
+
+   if (!fastpath)
+      return;
+
+   fastpath->attr = (struct draw_vf_attr_type *)
+      MALLOC(vf->attr_count * sizeof(fastpath->attr[0]));
+
+   /* A NULL result is only an error when there is something to store. */
+   if (!fastpath->attr && vf->attr_count) {
+      FREE(fastpath);
+      return;
+   }
+
+   fastpath->vertex_stride = vf->vertex_stride;
+   fastpath->attr_count = vf->attr_count;
+   fastpath->match_strides = match_strides;
+   fastpath->func = vf->emit;
+
+   for (i = 0; i < vf->attr_count; i++) {
+      fastpath->attr[i].format = vf->attr[i].format;
+      fastpath->attr[i].stride = vf->attr[i].inputstride;
+      fastpath->attr[i].size = vf->attr[i].inputsize;
+      fastpath->attr[i].offset = vf->attr[i].vertoffset;
+   }
+
+   /* push onto the front of the list */
+   fastpath->next = vf->fastpath;
+   vf->fastpath = fastpath;
+}
+
+
+
+
+/***********************************************************************
+ * Build codegen functions or return generic ones:
+ */
+/**
+ * Placeholder emit function: picks the real emit function for the current
+ * layout, stores it in vf->emit and then calls it with the same arguments.
+ *
+ * Candidates are tried in this order: a registered fastpath, the codegen
+ * hook, the hardwired generator, and finally draw_vf_generic_emit.
+ */
+static void choose_emit_func( struct draw_vertex_fetch *vf,
+                              unsigned count,
+                              uint8_t *dest)
+{
+   vf->emit = NULL;
+
+   /* A matching fastpath (hardwired, codegen or known-bad) wins even when
+    * its function is NULL: that means codegen is already known to fail for
+    * this state, so it is not attempted again.  Codegen only runs when no
+    * fastpath matched.
+    */
+   if (!search_fastpath_emit(vf) && vf->codegen_emit)
+      vf->codegen_emit( vf );
+
+   if (!vf->emit)
+      draw_vf_generate_hardwired_emit(vf);
+
+   /* Last resort: the generic version. */
+   if (!vf->emit)
+      vf->emit = draw_vf_generic_emit;
+
+   vf->emit( vf, count, dest );
+}
+
+
+
+
+
+/***********************************************************************
+ * Public entrypoints, mostly dispatch to the above:
+ */
+
+
+
+/**
+ * Fill vf->attr[] from an attribute map and reset the emit function so the
+ * next emit re-selects an implementation.
+ *
+ * DRAW_EMIT_PAD entries produce no attribute; they only advance the running
+ * output offset by map[i].offset bytes.
+ *
+ * \param vf             fetch object to configure
+ * \param map            'nr' map entries
+ * \param nr             number of entries in 'map'
+ * \param vertex_stride  output vertex size in bytes, or 0 to use the summed
+ *                       attribute sizes
+ * \return the vertex stride now in effect
+ */
+static unsigned
+draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
+                               const struct draw_vf_attr_map *map,
+                               unsigned nr,
+                               unsigned vertex_stride )
+{
+   unsigned cursor = 0;   /* running byte offset in the output vertex */
+   unsigned used = 0;     /* entries of vf->attr[] filled so far */
+   unsigned i;
+
+   assert(nr < PIPE_ATTRIB_MAX);
+
+   for (i = 0; i < nr; i++) {
+      const unsigned format = map[i].format;
+      const struct draw_vf_format_info *info;
+      struct draw_vf_attr *a;
+
+      if (format == DRAW_EMIT_PAD) {
+#if (DRAW_VF_DBG)
+         debug_printf("%d: pad %d, offset %d\n", i,
+                      map[i].offset, cursor);
+#endif
+
+         cursor += map[i].offset;
+         continue;
+      }
+
+      info = &draw_vf_format_info[format];
+      a = &vf->attr[used];
+
+      a->attrib = map[i].attrib;
+      a->format = format;
+      a->insert = info->insert;
+      a->vertattrsize = info->attrsize;
+      a->vertoffset = cursor;
+      a->isconst = info->isconst;
+      if (a->isconst)
+         memcpy(a->data, &map[i].data, a->vertattrsize);
+
+#if (DRAW_VF_DBG)
+      debug_printf("%d: %s, offset %d\n", i,
+                   info->name,
+                   a->vertoffset);
+#endif
+
+      cursor += info->attrsize;
+      used++;
+   }
+
+   vf->attr_count = used;
+   vf->vertex_stride = vertex_stride ? vertex_stride : cursor;
+   vf->emit = choose_emit_func;
+
+   assert(vf->vertex_stride >= cursor);
+   return vf->vertex_stride;
+}
+
+
+/**
+ * Configure 'vf' to emit vertices laid out as described by 'vinfo'.
+ *
+ * Each vinfo->emit[] entry is translated to a draw_vf_attr_map entry and
+ * the result is handed to draw_vf_set_vertex_attributes() with a stride of
+ * vinfo->size floats.  Every resulting attribute is then set up to read
+ * four-component input.
+ *
+ * \param vf          fetch object to configure
+ * \param vinfo       vertex layout; vinfo->size is used as a count of
+ *                    float-sized units
+ * \param point_size  constant emitted for EMIT_1F_PSIZE attributes
+ */
+void draw_vf_set_vertex_info( struct draw_vertex_fetch *vf,
+                              const struct vertex_info *vinfo,
+                              float point_size )
+{
+   unsigned i, j, k;
+   struct draw_vf_attr *a = vf->attr;
+   struct draw_vf_attr_map attrs[PIPE_MAX_SHADER_INPUTS];
+   unsigned count = 0; /* for debug/sanity */
+   unsigned nr_attrs = 0;
+
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      j = vinfo->src_index[i];
+      switch (vinfo->emit[i]) {
+      case EMIT_OMIT:
+         /* no-op */
+         break;
+      case EMIT_ALL: {
+         /* just copy the whole vertex as-is to the vbuf */
+         unsigned s = vinfo->size;
+         unsigned tail;
+         assert(i == 0);
+         assert(j == 0);
+         /* copy the vertex header */
+         /* XXX: we actually don't copy the header, just pad it */
+         attrs[nr_attrs].attrib = 0;
+         attrs[nr_attrs].format = DRAW_EMIT_PAD;
+         attrs[nr_attrs].offset = offsetof(struct vertex_header, data);
+         s -= offsetof(struct vertex_header, data)/4;
+         count += offsetof(struct vertex_header, data)/4;
+         nr_attrs++;
+         /* copy the vertex data */
+         for(k = 0; k < (s & ~0x3); k += 4) {
+            attrs[nr_attrs].attrib = k/4;
+            attrs[nr_attrs].format = DRAW_EMIT_4F;
+            attrs[nr_attrs].offset = 0;
+            nr_attrs++;
+            count += 4;
+         }
+         /* tail */
+         /* XXX: actually, this shouldn't be needed */
+         /* Only claim a slot when floats are left over.  Writing the slot
+          * unconditionally would touch attrs[] past its end when the array
+          * is already full.
+          */
+         tail = s & 0x3;
+         if (tail) {
+            attrs[nr_attrs].attrib = k/4;
+            attrs[nr_attrs].offset = 0;
+            attrs[nr_attrs].format = (tail == 1) ? DRAW_EMIT_1F :
+                                     (tail == 2) ? DRAW_EMIT_2F :
+                                                   DRAW_EMIT_3F;
+            nr_attrs++;
+            count += tail;
+         }
+         break;
+      }
+      case EMIT_1F:
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_1F;
+         attrs[nr_attrs].offset = 0;
+         nr_attrs++;
+         count++;
+         break;
+      case EMIT_1F_PSIZE:
+         /* constant attribute: the value comes from 'point_size' */
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_1F_CONST;
+         attrs[nr_attrs].offset = 0;
+         attrs[nr_attrs].data.f[0] = point_size;
+         nr_attrs++;
+         count++;
+         break;
+      case EMIT_2F:
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_2F;
+         attrs[nr_attrs].offset = 0;
+         nr_attrs++;
+         count += 2;
+         break;
+      case EMIT_3F:
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_3F;
+         attrs[nr_attrs].offset = 0;
+         nr_attrs++;
+         count += 3;
+         break;
+      case EMIT_4F:
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_4F;
+         attrs[nr_attrs].offset = 0;
+         nr_attrs++;
+         count += 4;
+         break;
+      case EMIT_4UB:
+         /* four packed bytes occupy a single float-sized unit */
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_4UB_4F_BGRA;
+         attrs[nr_attrs].offset = 0;
+         nr_attrs++;
+         count += 1;
+         break;
+      default:
+         assert(0);
+      }
+   }
+
+   assert(count == vinfo->size);
+
+   draw_vf_set_vertex_attributes(vf,
+                                 attrs,
+                                 nr_attrs,
+                                 vinfo->size * sizeof(float) );
+
+   for (j = 0; j < vf->attr_count; j++) {
+      /* inputs are always treated as four-component here */
+      a[j].inputsize = 4;
+      a[j].do_insert = a[j].insert[4 - 1];
+      if(a[j].isconst) {
+         /* constants are read from the attribute's own data[] */
+         a[j].inputptr = a[j].data;
+         a[j].inputstride = 0;
+      }
+   }
+}
+
+
+#if 0
+/* Set attribute pointers, adjusted for start position:
+ */
+void draw_vf_set_sources( struct draw_vertex_fetch *vf,
+ GLvector4f * const sources[],
+ unsigned start )
+{
+ struct draw_vf_attr *a = vf->attr;
+ unsigned j;
+
+ for (j = 0; j < vf->attr_count; j++) {
+ const GLvector4f *vptr = sources[a[j].attrib];
+
+ if ((a[j].inputstride != vptr->stride) ||
+ (a[j].inputsize != vptr->size))
+ vf->emit = choose_emit_func;
+
+ a[j].inputstride = vptr->stride;
+ a[j].inputsize = vptr->size;
+ a[j].do_insert = a[j].insert[vptr->size - 1];
+ a[j].inputptr = ((uint8_t *)vptr->data) + start * vptr->stride;
+ }
+}
+#endif
+
+
+/**
+ * Emit one vertex to 'dest'.
+ *
+ * Every non-constant attribute is pointed at its slot in vertex->data[]
+ * before the current emit function is called for a single vertex.
+ */
+void draw_vf_emit_vertex( struct draw_vertex_fetch *vf,
+                          struct vertex_header *vertex,
+                          void *dest )
+{
+   unsigned idx;
+
+   for (idx = 0; idx < vf->attr_count; idx++) {
+      struct draw_vf_attr *attr = &vf->attr[idx];
+
+      /* constant attributes keep the input pointer they already have */
+      if (attr->isconst)
+         continue;
+
+      attr->inputptr = (uint8_t *)&vertex->data[attr->attrib][0];
+      attr->inputstride = 0; /* XXX: one-vertex-max ATM */
+   }
+
+   vf->emit( vf, 1, (uint8_t*) dest );
+}
+
+
+
+/**
+ * Allocate and initialize a vertex fetch object.
+ *
+ * Each attribute slot gets a back-pointer to the new object and the
+ * identity vector is set to (0, 0, 0, 1).  When built with USE_SSE_ASM and
+ * the GALLIUM_NO_CODEGEN environment variable is not set, the SSE code
+ * generator is installed as the codegen hook.
+ *
+ * \return the new object, or NULL if it could not be allocated
+ */
+struct draw_vertex_fetch *draw_vf_create( void )
+{
+   struct draw_vertex_fetch *vf = CALLOC_STRUCT(draw_vertex_fetch);
+   unsigned i;
+
+   /* Nothing to initialize if the allocation failed. */
+   if (!vf)
+      return NULL;
+
+   for (i = 0; i < PIPE_ATTRIB_MAX; i++)
+      vf->attr[i].vf = vf;
+
+   vf->identity[0] = 0.0;
+   vf->identity[1] = 0.0;
+   vf->identity[2] = 0.0;
+   vf->identity[3] = 1.0;
+
+   vf->codegen_emit = NULL;
+
+#ifdef USE_SSE_ASM
+   if (!GETENV("GALLIUM_NO_CODEGEN"))
+      vf->codegen_emit = draw_vf_generate_sse_emit;
+#endif
+
+   return vf;
+}
+
+
+/**
+ * Free a vertex fetch object together with its list of registered
+ * fastpaths.  Passing NULL is allowed and does nothing.
+ *
+ * The emit functions referenced by the fastpaths are not released here.
+ */
+void draw_vf_destroy( struct draw_vertex_fetch *vf )
+{
+   struct draw_vf_fastpath *fp, *tmp;
+
+   if (!vf)
+      return;
+
+   for (fp = vf->fastpath ; fp ; fp = tmp) {
+      /* read the link first; 'fp' is freed at the end of this iteration */
+      tmp = fp->next;
+      FREE(fp->attr);
+
+      /* KW: At the moment, fp->func is constrained to be allocated by
+       * _mesa_exec_alloc(), as the hardwired fastpaths in
+       * t_vertex_generic.c are handled specially.  It would be nice
+       * to unify them, but this probably won't change until this
+       * module gets another overhaul.
+       */
+      //_mesa_exec_free((void *) fp->func);
+      FREE(fp);
+   }
+
+   vf->fastpath = NULL;
+   FREE(vf);
+}
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
new file mode 100644
index 0000000000..e694b98675
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+#ifndef DRAW_VF_H
+#define DRAW_VF_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+
+#include "draw_vertex.h"
+#include "draw_private.h" // for vertex_header
+
+
+/**
+ * Output formats an attribute can be emitted in.
+ *
+ * The values are used as indices into draw_vf_format_info[], which is
+ * declared with DRAW_EMIT_MAX entries, so DRAW_EMIT_MAX must stay last.
+ * DRAW_EMIT_PAD produces no attribute: draw_vf_set_vertex_attributes()
+ * only advances the output offset for it.
+ */
+enum draw_vf_attr_format {
+ DRAW_EMIT_1F,
+ DRAW_EMIT_2F,
+ DRAW_EMIT_3F,
+ DRAW_EMIT_4F,
+ DRAW_EMIT_3F_XYW, /**< for projective texture */
+ DRAW_EMIT_1UB_1F, /**< for fog coordinate */
+ DRAW_EMIT_3UB_3F_RGB, /**< for specular color */
+ DRAW_EMIT_3UB_3F_BGR, /**< for specular color */
+ DRAW_EMIT_4UB_4F_RGBA, /**< for color */
+ DRAW_EMIT_4UB_4F_BGRA, /**< for color */
+ DRAW_EMIT_4UB_4F_ARGB, /**< for color */
+ DRAW_EMIT_4UB_4F_ABGR, /**< for color */
+ DRAW_EMIT_1F_CONST, /**< value comes from draw_vf_attr_map::data */
+ DRAW_EMIT_2F_CONST,
+ DRAW_EMIT_3F_CONST,
+ DRAW_EMIT_4F_CONST,
+ DRAW_EMIT_PAD, /**< leave a hole of 'offset' bytes */
+ DRAW_EMIT_MAX /**< number of formats; not a format itself */
+};
+
+/**
+ * One entry of the attribute list passed to
+ * draw_vf_set_vertex_attributes(): which input attribute to emit and in
+ * which output format.
+ */
+struct draw_vf_attr_map
+{
+ /** Input attribute number */
+ unsigned attrib;
+
+ /** Output format; DRAW_EMIT_PAD entries only skip bytes */
+ enum draw_vf_attr_format format;
+
+ /** For DRAW_EMIT_PAD: number of bytes added to the output offset */
+ unsigned offset;
+
+ /**
+ * Constant data for DRAW_EMIT_*_CONST
+ */
+ union {
+ uint8_t ub[4];
+ float f[4];
+ } data;
+};
+
+struct draw_vertex_fetch;
+
+
+
+#if 0
+unsigned
+draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
+ const struct draw_vf_attr_map *map,
+ unsigned nr,
+ unsigned vertex_stride );
+#endif
+
+void draw_vf_set_vertex_info( struct draw_vertex_fetch *vf,
+ const struct vertex_info *vinfo,
+ float point_size );
+
+#if 0
+void
+draw_vf_set_sources( struct draw_vertex_fetch *vf,
+ GLvector4f * const attrib[],
+ unsigned start );
+#endif
+
+void
+draw_vf_emit_vertex( struct draw_vertex_fetch *vf,
+ struct vertex_header *vertex,
+ void *dest );
+
+struct draw_vertex_fetch *
+draw_vf_create( void );
+
+void
+draw_vf_destroy( struct draw_vertex_fetch *vf );
+
+
+
+/***********************************************************************
+ * Internal functions and structs:
+ */
+
+struct draw_vf_attr;
+
+typedef void (*draw_vf_extract_func)( const struct draw_vf_attr *a,
+ float *out,
+ const uint8_t *v );
+
+typedef void (*draw_vf_insert_func)( const struct draw_vf_attr *a,
+ uint8_t *v,
+ const float *in );
+
+typedef void (*draw_vf_emit_func)( struct draw_vertex_fetch *vf,
+ unsigned count,
+ uint8_t *dest );
+
+
+
+/**
+ * Describes how to convert/move a vertex attribute from a vertex
+ * array to a vertex structure.
+ */
+struct draw_vf_attr
+{
+ struct draw_vertex_fetch *vf; /**< owning fetch object, set in draw_vf_create() */
+
+ unsigned format; /**< enum draw_vf_attr_format value */
+ unsigned inputsize; /**< input component count; selects insert[inputsize - 1] */
+ unsigned inputstride; /**< set to 0 for constant and single-vertex input */
+ unsigned vertoffset; /**< position of the attrib in the vertex struct */
+
+ boolean isconst; /**< read from const data below */
+ uint8_t data[16];
+
+ unsigned attrib; /**< which vertex attrib (0=position, etc) */
+ unsigned vertattrsize; /**< size of the attribute in bytes */
+
+ uint8_t *inputptr; /**< where the attribute's input data is read from */
+ const draw_vf_insert_func *insert; /**< insert table of the format, see draw_vf_format_info */
+ draw_vf_insert_func do_insert; /**< entry of 'insert' chosen for the current inputsize */
+ draw_vf_extract_func extract; /* NOTE(review): not assigned anywhere in draw_vf.c - confirm whether it is used */
+};
+
+/**
+ * State of one vertex fetch/emit object, created by draw_vf_create() and
+ * released by draw_vf_destroy().
+ */
+struct draw_vertex_fetch
+{
+ struct draw_vf_attr attr[PIPE_ATTRIB_MAX];
+ unsigned attr_count; /**< number of valid entries in attr[] */
+ unsigned vertex_stride; /**< size of one emitted vertex, in bytes */
+
+ /* Function used to emit vertices.  Reset to a chooser whenever the
+ * attributes change; the chooser replaces it with the selected
+ * implementation on the next emit.
+ */
+ draw_vf_emit_func emit;
+
+ /* Parameters and constants for codegen:
+ */
+ float identity[4]; /**< initialized to (0, 0, 0, 1) */
+
+ struct draw_vf_fastpath *fastpath; /**< head of the registered fastpath list */
+
+ /* Optional code generator hook, NULL when codegen is unavailable or
+ * disabled.
+ */
+ void (*codegen_emit)( struct draw_vertex_fetch *vf );
+};
+
+
+/**
+ * Per-attribute layout recorded in a fastpath.  Filled from the matching
+ * struct draw_vf_attr fields by draw_vf_register_fastpath().
+ */
+struct draw_vf_attr_type {
+ unsigned format; /**< copy of draw_vf_attr::format */
+ unsigned size; /**< copy of draw_vf_attr::inputsize */
+ unsigned stride; /**< copy of draw_vf_attr::inputstride */
+ unsigned offset; /**< copy of draw_vf_attr::vertoffset */
+};
+
+/**
+ * A remembered (layout, emit function) pair.  Entries form a singly linked
+ * list hanging off draw_vertex_fetch::fastpath.
+ */
+struct draw_vf_fastpath {
+ unsigned vertex_stride; /**< compared only when match_strides is set */
+ unsigned attr_count; /**< number of entries in attr[] */
+ boolean match_strides; /**< also require vertex and input strides to match */
+
+ struct draw_vf_attr_type *attr; /**< heap array of attr_count entries */
+
+ draw_vf_emit_func func; /**< emit function to use on a match; may be NULL */
+ struct draw_vf_fastpath *next;
+};
+
+
+void
+draw_vf_register_fastpath( struct draw_vertex_fetch *vtx,
+ boolean match_strides );
+
+void
+draw_vf_generic_emit( struct draw_vertex_fetch *vf,
+ unsigned count,
+ uint8_t *v );
+
+void
+draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf );
+
+void
+draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf );
+
+
+/**
+ * Static description of one output format; draw_vf_format_info[] holds one
+ * entry per enum draw_vf_attr_format value.
+ */
+struct draw_vf_format_info {
+ const char *name; /**< used in debug output */
+ draw_vf_insert_func insert[4]; /**< indexed by input size - 1 */
+ const unsigned attrsize; /**< bytes the attribute occupies in the output vertex */
+ const boolean isconst; /**< attribute value comes from constant data */
+};
+
+extern const struct draw_vf_format_info
+draw_vf_format_info[DRAW_EMIT_MAX];
+
+
+#endif
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
new file mode 100644
index 0000000000..7a60a9db9c
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -0,0 +1,585 @@
+
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+
+#include "draw_vf.h"
+
+
+
+/* 4-float output from a 4-component source: straight copy. */
+static INLINE void
+insert_4f_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 4; i++) {
+      dst[i] = in[i];
+   }
+}
+
+/* 4-float output from a 3-component source: w defaults to 1. */
+static INLINE void
+insert_4f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 3; i++) {
+      dst[i] = in[i];
+   }
+   dst[3] = 1.0f;
+}
+
+/* 4-float output from a 2-component source: z defaults to 0, w to 1. */
+static INLINE void
+insert_4f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+
+   dst[0] = in[0];
+   dst[1] = in[1];
+   dst[2] = 0.0f;
+   dst[3] = 1.0f;
+}
+
+/* 4-float output from a 1-component source: y and z default to 0, w to 1. */
+static INLINE void
+insert_4f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+
+   dst[0] = in[0];
+   dst[1] = 0.0f;
+   dst[2] = 0.0f;
+   dst[3] = 1.0f;
+}
+
+/* 3-float output holding x, y and w of a 4-component source (z is dropped). */
+static INLINE void
+insert_3f_xyw_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+
+   dst[0] = in[0];
+   dst[1] = in[1];
+   dst[2] = in[3];
+}
+
+/* The xyw format is only valid for 4-component sources; anything else
+ * trips the assert.
+ */
+static INLINE void
+insert_3f_xyw_err( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   (void) v;
+   (void) in;
+   assert(0);
+}
+
+/* 3-float output from a source with three (or more) components. */
+static INLINE void
+insert_3f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 3; i++) {
+      dst[i] = in[i];
+   }
+}
+
+/* 3-float output from a 2-component source: z defaults to 0. */
+static INLINE void
+insert_3f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+
+   dst[0] = in[0];
+   dst[1] = in[1];
+   dst[2] = 0.0f;
+}
+
+/* 3-float output from a 1-component source: y and z default to 0. */
+static INLINE void
+insert_3f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+
+   dst[0] = in[0];
+   dst[1] = 0.0f;
+   dst[2] = 0.0f;
+}
+
+
+/* 2-float output from a source with two (or more) components. */
+static INLINE void
+insert_2f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+
+   dst[0] = in[0];
+   dst[1] = in[1];
+}
+
+/* 2-float output from a 1-component source: y defaults to 0. */
+static INLINE void
+insert_2f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+
+   dst[0] = in[0];
+   dst[1] = 0.0f;
+}
+
+/* 1-float output: copy the first source component. */
+static INLINE void
+insert_1f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *dst = (float *) v;
+   (void) a;
+
+   *dst = *in;
+}
+
+/* Writes nothing; fills the unused slots of the EMITn fastpath macros. */
+static INLINE void
+insert_null( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+   (void) v;
+   (void) in;
+}
+
+/* Four bytes in R,G,B,A order from a 4-component float source. */
+static INLINE void
+insert_4ub_4f_rgba_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 4; i++) {
+      UNCLAMPED_FLOAT_TO_UBYTE(v[i], in[i]);
+   }
+}
+
+/* R,G,B,A bytes from a 3-component source: alpha defaults to 0xff. */
+static INLINE void
+insert_4ub_4f_rgba_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 3; i++) {
+      UNCLAMPED_FLOAT_TO_UBYTE(v[i], in[i]);
+   }
+   v[3] = 0xff;
+}
+
+/* R,G,B,A bytes from a 2-component source: blue is 0, alpha is 0xff. */
+static INLINE void
+insert_4ub_4f_rgba_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[2] = 0x00;
+   v[3] = 0xff;
+}
+
+/* R,G,B,A bytes from a 1-component source: green/blue 0, alpha 0xff. */
+static INLINE void
+insert_4ub_4f_rgba_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   v[1] = 0x00;
+   v[2] = 0x00;
+   v[3] = 0xff;
+}
+
+/* Four bytes in B,G,R,A order from a 4-component (r,g,b,a) float source. */
+static INLINE void
+insert_4ub_4f_bgra_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);   /* red */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);   /* green */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);   /* blue */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);   /* alpha */
+}
+
+/* B,G,R,A bytes from a 3-component source: alpha defaults to 0xff. */
+static INLINE void
+insert_4ub_4f_bgra_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 3; i++) {
+      UNCLAMPED_FLOAT_TO_UBYTE(v[2 - i], in[i]);
+   }
+   v[3] = 0xff;
+}
+
+/* B,G,R,A bytes from a 2-component source: blue is 0, alpha is 0xff. */
+static INLINE void
+insert_4ub_4f_bgra_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[0] = 0x00;
+   v[3] = 0xff;
+}
+
+/* B,G,R,A bytes from a 1-component source: green/blue 0, alpha 0xff. */
+static INLINE void
+insert_4ub_4f_bgra_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   v[1] = 0x00;
+   v[0] = 0x00;
+   v[3] = 0xff;
+}
+
+/* Four bytes in A,R,G,B order from a 4-component (r,g,b,a) float source. */
+static INLINE void
+insert_4ub_4f_argb_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);   /* red */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);   /* green */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);   /* blue */
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);   /* alpha */
+}
+
+/* A,R,G,B bytes from a 3-component source: alpha defaults to 0xff. */
+static INLINE void
+insert_4ub_4f_argb_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 3; i++) {
+      UNCLAMPED_FLOAT_TO_UBYTE(v[i + 1], in[i]);
+   }
+   v[0] = 0xff;
+}
+
+/* A,R,G,B bytes from a 2-component source: blue is 0, alpha is 0xff. */
+static INLINE void
+insert_4ub_4f_argb_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   v[3] = 0;
+   v[0] = 0xff;
+}
+
+/* A,R,G,B bytes from a 1-component source: green/blue 0, alpha 0xff. */
+static INLINE void
+insert_4ub_4f_argb_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   v[2] = 0;
+   v[3] = 0;
+   v[0] = 0xff;
+}
+
+/* Four bytes in A,B,G,R order from a 4-component (r,g,b,a) float source. */
+static INLINE void
+insert_4ub_4f_abgr_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   unsigned i;
+   (void) a;
+
+   /* Source component i lands in byte 3 - i. */
+   for (i = 0; i < 4; i++) {
+      UNCLAMPED_FLOAT_TO_UBYTE(v[3 - i], in[i]);
+   }
+}
+
+/* A,B,G,R bytes from a 3-component source: alpha defaults to 0xff. */
+static INLINE void
+insert_4ub_4f_abgr_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 3; i++) {
+      UNCLAMPED_FLOAT_TO_UBYTE(v[3 - i], in[i]);
+   }
+   v[0] = 0xff;
+}
+
+/* A,B,G,R bytes from a 2-component source: blue is 0, alpha is 0xff. */
+static INLINE void
+insert_4ub_4f_abgr_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   v[1] = 0;
+   v[0] = 0xff;
+}
+
+/* A,B,G,R bytes from a 1-component source: green/blue 0, alpha 0xff. */
+static INLINE void
+insert_4ub_4f_abgr_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   v[2] = 0;
+   v[1] = 0;
+   v[0] = 0xff;
+}
+
+/* Three bytes in R,G,B order from a source with three (or more) components. */
+static INLINE void
+insert_3ub_3f_rgb_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 3; i++) {
+      UNCLAMPED_FLOAT_TO_UBYTE(v[i], in[i]);
+   }
+}
+
+/* R,G,B bytes from a 2-component source: blue defaults to 0. */
+static INLINE void
+insert_3ub_3f_rgb_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[2] = 0x00;
+}
+
+/* R,G,B bytes from a 1-component source: green and blue default to 0. */
+static INLINE void
+insert_3ub_3f_rgb_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   v[1] = 0x00;
+   v[2] = 0x00;
+}
+
+/* Three bytes in B,G,R order from a source with three (or more) components. */
+static INLINE void
+insert_3ub_3f_bgr_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   unsigned i;
+   (void) a;
+
+   /* Source component i lands in byte 2 - i. */
+   for (i = 0; i < 3; i++) {
+      UNCLAMPED_FLOAT_TO_UBYTE(v[2 - i], in[i]);
+   }
+}
+
+/* B,G,R bytes from a 2-component source: blue defaults to 0. */
+static INLINE void
+insert_3ub_3f_bgr_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[0] = 0x00;
+}
+
+/* B,G,R bytes from a 1-component source: green and blue default to 0. */
+static INLINE void
+insert_3ub_3f_bgr_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   v[1] = 0x00;
+   v[0] = 0x00;
+}
+
+
+/* Single byte converted from the first float of the source. */
+static INLINE void
+insert_1ub_1f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(*v, *in);
+}
+
+
+/* Per-format descriptions: name, four insert functions, output size in
+ * bytes, and the isconst flag.  insert[n-1] is the variant for a source
+ * with n components; where a format has fewer outputs than that, the
+ * widest variant is repeated.  Presumably the row order must match the
+ * DRAW_EMIT_x enum -- TODO confirm.
+ */
+const struct draw_vf_format_info draw_vf_format_info[DRAW_EMIT_MAX] =
+{
+ /* Float outputs */
+ { "1f",
+ { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
+ sizeof(float), FALSE },
+
+ { "2f",
+ { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
+ 2 * sizeof(float), FALSE },
+
+ { "3f",
+ { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
+ 3 * sizeof(float), FALSE },
+
+ { "4f",
+ { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
+ 4 * sizeof(float), FALSE },
+
+ /* Only a 4-component source is valid here; the others assert. */
+ { "3f_xyw",
+ { insert_3f_xyw_err, insert_3f_xyw_err, insert_3f_xyw_err,
+ insert_3f_xyw_4 },
+ 3 * sizeof(float), FALSE },
+
+ /* Unsigned-byte outputs converted from floats */
+ { "1ub_1f",
+ { insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1 },
+ sizeof(uint8_t), FALSE },
+
+ { "3ub_3f_rgb",
+ { insert_3ub_3f_rgb_1, insert_3ub_3f_rgb_2, insert_3ub_3f_rgb_3,
+ insert_3ub_3f_rgb_3 },
+ 3 * sizeof(uint8_t), FALSE },
+
+ { "3ub_3f_bgr",
+ { insert_3ub_3f_bgr_1, insert_3ub_3f_bgr_2, insert_3ub_3f_bgr_3,
+ insert_3ub_3f_bgr_3 },
+ 3 * sizeof(uint8_t), FALSE },
+
+ { "4ub_4f_rgba",
+ { insert_4ub_4f_rgba_1, insert_4ub_4f_rgba_2, insert_4ub_4f_rgba_3,
+ insert_4ub_4f_rgba_4 },
+ 4 * sizeof(uint8_t), FALSE },
+
+ { "4ub_4f_bgra",
+ { insert_4ub_4f_bgra_1, insert_4ub_4f_bgra_2, insert_4ub_4f_bgra_3,
+ insert_4ub_4f_bgra_4 },
+ 4 * sizeof(uint8_t), FALSE },
+
+ { "4ub_4f_argb",
+ { insert_4ub_4f_argb_1, insert_4ub_4f_argb_2, insert_4ub_4f_argb_3,
+ insert_4ub_4f_argb_4 },
+ 4 * sizeof(uint8_t), FALSE },
+
+ { "4ub_4f_abgr",
+ { insert_4ub_4f_abgr_1, insert_4ub_4f_abgr_2, insert_4ub_4f_abgr_3,
+ insert_4ub_4f_abgr_4 },
+ 4 * sizeof(uint8_t), FALSE },
+
+ /* Constant variants: same insert functions as the float formats, with
+ * isconst set.
+ */
+ { "1f_const",
+ { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
+ sizeof(float), TRUE },
+
+ { "2f_const",
+ { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
+ 2 * sizeof(float), TRUE },
+
+ { "3f_const",
+ { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
+ 3 * sizeof(float), TRUE },
+
+ { "4f_const",
+ { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
+ 4 * sizeof(float), TRUE },
+
+ /* Padding: zero size and no insert functions (NULL entries). */
+ { "pad",
+ { NULL, NULL, NULL, NULL },
+ 0, FALSE },
+
+};
+
+
+
+
+/***********************************************************************
+ * Hardwired fastpaths for emitting whole vertices or groups of
+ * vertices
+ *
+ * EMIT5 defines a static emit function NAME(vf, count, v) that writes
+ * count vertices starting at v.  For each vertex it calls the first NR
+ * of the insert functions F0..F4 on attributes 0..NR-1 and advances each
+ * attribute's inputptr by its inputstride; v advances by
+ * vf->vertex_stride.  NR is meant to be a literal so the "NR > k" tests
+ * are constant expressions.
+ */
+#define EMIT5(NR, F0, F1, F2, F3, F4, NAME) \
+static void NAME( struct draw_vertex_fetch *vf, \
+ unsigned count, \
+ uint8_t *v ) \
+{ \
+ struct draw_vf_attr *a = vf->attr; \
+ unsigned i; \
+ \
+ for (i = 0 ; i < count ; i++, v += vf->vertex_stride) { \
+ if (NR > 0) { \
+ F0( &a[0], v + a[0].vertoffset, (float *)a[0].inputptr ); \
+ a[0].inputptr += a[0].inputstride; \
+ } \
+ \
+ if (NR > 1) { \
+ F1( &a[1], v + a[1].vertoffset, (float *)a[1].inputptr ); \
+ a[1].inputptr += a[1].inputstride; \
+ } \
+ \
+ if (NR > 2) { \
+ F2( &a[2], v + a[2].vertoffset, (float *)a[2].inputptr ); \
+ a[2].inputptr += a[2].inputstride; \
+ } \
+ \
+ if (NR > 3) { \
+ F3( &a[3], v + a[3].vertoffset, (float *)a[3].inputptr ); \
+ a[3].inputptr += a[3].inputstride; \
+ } \
+ \
+ if (NR > 4) { \
+ F4( &a[4], v + a[4].vertoffset, (float *)a[4].inputptr ); \
+ a[4].inputptr += a[4].inputstride; \
+ } \
+ } \
+}
+
+
+/* Convenience wrappers around EMIT5 for 2, 3 and 4 attributes; the unused
+ * slots are filled with insert_null and never called because NR is
+ * smaller.
+ */
+#define EMIT2(F0, F1, NAME) EMIT5(2, F0, F1, insert_null, \
+ insert_null, insert_null, NAME)
+
+#define EMIT3(F0, F1, F2, NAME) EMIT5(3, F0, F1, F2, insert_null, \
+ insert_null, NAME)
+
+#define EMIT4(F0, F1, F2, F3, NAME) EMIT5(4, F0, F1, F2, F3, \
+ insert_null, NAME)
+
+
+/* The prebuilt fastpaths selected by draw_vf_generate_hardwired_emit():
+ * 3 floats + 4 bytes RGBA.
+ */
+EMIT2(insert_3f_3, insert_4ub_4f_rgba_4, emit_xyz3_rgba4)
+
+/* 4 floats + 4 bytes RGBA + one 2-float attribute. */
+EMIT3(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_xyzw4_rgba4_st2)
+
+/* 4 floats + 4 bytes RGBA + two 2-float attributes. */
+EMIT4(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_xyzw4_rgba4_st2_st2)
+
+
+/* Use the codegen paths to select one of a number of hardwired
+ * fastpaths.
+ *
+ * The attribute count and each attribute's do_insert function must match
+ * a prebuilt emit_* routine exactly.  vf->emit is always written: it
+ * receives the matching routine, or NULL when nothing matches.
+ */
+void draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf )
+{
+   const struct draw_vf_attr *attr = vf->attr;
+   draw_vf_emit_func chosen = NULL;
+
+   /* Does it fit a hardwired fastpath?  Help! this is growing out of
+    * control!
+    */
+   if (vf->attr_count == 2) {
+      if (attr[0].do_insert == insert_3f_3 &&
+          attr[1].do_insert == insert_4ub_4f_rgba_4) {
+         chosen = emit_xyz3_rgba4;
+      }
+   }
+   else if (vf->attr_count == 3) {
+      if (attr[2].do_insert == insert_2f_2 &&
+          attr[1].do_insert == insert_4ub_4f_rgba_4 &&
+          attr[0].do_insert == insert_4f_4) {
+         chosen = emit_xyzw4_rgba4_st2;
+      }
+   }
+   else if (vf->attr_count == 4) {
+      if (attr[2].do_insert == insert_2f_2 &&
+          attr[3].do_insert == insert_2f_2 &&
+          attr[1].do_insert == insert_4ub_4f_rgba_4 &&
+          attr[0].do_insert == insert_4f_4) {
+         chosen = emit_xyzw4_rgba4_st2_st2;
+      }
+   }
+
+   vf->emit = chosen;
+}
+
+/***********************************************************************
+ * Generic (non-codegen) functions for whole vertices or groups of
+ * vertices
+ */
+
+/* Emit count vertices starting at v.  For every vertex each attribute's
+ * do_insert is called with the attribute's current source pointer and its
+ * slot in the vertex (v + vertoffset); the source pointer is advanced by
+ * inputstride before the call.
+ */
+void draw_vf_generic_emit( struct draw_vertex_fetch *vf,
+                           unsigned count,
+                           uint8_t *v )
+{
+   struct draw_vf_attr *const first = vf->attr;
+   struct draw_vf_attr *const end = first + vf->attr_count;
+   const unsigned stride = vf->vertex_stride;
+
+   while (count--) {
+      struct draw_vf_attr *cur;
+
+      for (cur = first; cur != end; cur++) {
+         float *src = (float *) cur->inputptr;
+
+         cur->inputptr += cur->inputstride;
+         cur->do_insert( cur, v + cur->vertoffset, src );
+      }
+
+      v += stride;
+   }
+}
+
+
diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
new file mode 100644
index 0000000000..1ad2ae756d
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -0,0 +1,614 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+
+#include "simple_list.h"
+
+#include "pipe/p_compiler.h"
+
+#include "draw_vf.h"
+
+
+#if defined(USE_SSE_ASM)
+
+#include "x86/rtasm/x86sse.h"
+#include "x86/common_x86_asm.h"
+
+
+#define X 0
+#define Y 1
+#define Z 2
+#define W 3
+
+
+/* State carried while generating one SSE emit function. */
+struct x86_program {
+ struct x86_function func; /* code buffer the instructions are emitted into */
+
+ struct draw_vertex_fetch *vf; /* vertex layout being compiled */
+ boolean inputs_safe; /* if set, a 3-float load may over-read by one dword */
+ boolean outputs_safe; /* if set, a 3-float store may over-write by one dword */
+ boolean have_sse2; /* selects the SSE2 rather than the MMX pack sequence */
+
+ struct x86_reg identity; /* XMM register loaded from vf->identity */
+ struct x86_reg chan0; /* XMM register used as the float->ubyte scale factor */
+};
+
+
+/* Returns the XMM register that holds the identity vector. */
+static struct x86_reg get_identity( struct x86_program *p )
+{
+ return p->identity;
+}
+
+/* emit_loadNf_M: emit instructions that load an M-float source at arg0
+ * into the XMM register dest for an N-float destination format.
+ *
+ * 4 <- 4: one unaligned 16-byte load.
+ */
+static void emit_load4f_4( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg arg0 )
+{
+ sse_movups(&p->func, dest, arg0);
+}
+
+/* 4 <- 3: reads exactly three floats and takes the fourth component from
+ * the identity register.
+ */
+static void emit_load4f_3( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg arg0 )
+{
+ /* Have to jump through some hoops:
+ *
+ * c 0 0 0
+ * c 0 0 1
+ * 0 0 c 1
+ * a b c 1
+ */
+ sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
+ sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
+ sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );
+ sse_movlps(&p->func, dest, arg0);
+}
+
+/* 4 <- 2: upper two components come from the identity register. */
+static void emit_load4f_2( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg arg0 )
+{
+ /* Initialize from identity, then pull in low two words:
+ */
+ sse_movups(&p->func, dest, get_identity(p));
+ sse_movlps(&p->func, dest, arg0);
+}
+
+/* 4 <- 1: reads one float; the shuffle pulls the upper two components
+ * from the identity register.
+ */
+static void emit_load4f_1( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg arg0 )
+{
+ /* Pull in low word, then swizzle in identity */
+ sse_movss(&p->func, dest, arg0);
+ sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
+}
+
+
+
+/* 3 <- 3: the fourth lane of dest is a don't-care.  When inputs_safe is
+ * set a single 16-byte load is used; otherwise exactly three floats are
+ * read.
+ */
+static void emit_load3f_3( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg arg0 )
+{
+ /* Over-reads by 1 dword - potential SEGV if input is a vertex
+ * array.
+ */
+ if (p->inputs_safe) {
+ sse_movups(&p->func, dest, arg0);
+ }
+ else {
+ /* c 0 0 0
+ * c c c c
+ * a b c c
+ */
+ sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
+ sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
+ sse_movlps(&p->func, dest, arg0);
+ }
+}
+
+/* 3 <- 2: same sequence as the 4 <- 2 load. */
+static void emit_load3f_2( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg arg0 )
+{
+ emit_load4f_2(p, dest, arg0);
+}
+
+/* 3 <- 1: same sequence as the 4 <- 1 load. */
+static void emit_load3f_1( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg arg0 )
+{
+ emit_load4f_1(p, dest, arg0);
+}
+
+/* 2 <- 2: loads two floats into the low half of dest (movlps). */
+static void emit_load2f_2( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg arg0 )
+{
+ sse_movlps(&p->func, dest, arg0);
+}
+
+/* 2 <- 1: same sequence as the 4 <- 1 load. */
+static void emit_load2f_1( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg arg0 )
+{
+ emit_load4f_1(p, dest, arg0);
+}
+
+/* 1 <- 1: loads a single float (movss). */
+static void emit_load1f_1( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg arg0 )
+{
+ sse_movss(&p->func, dest, arg0);
+}
+
+/* Loader dispatch table: load[dest components - 1][source components - 1].
+ * Where the source has more components than the destination needs, the
+ * widest loader for that destination is reused.
+ */
+static void (*load[4][4])( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg arg0 ) = {
+ { emit_load1f_1,
+ emit_load1f_1,
+ emit_load1f_1,
+ emit_load1f_1 },
+
+ { emit_load2f_1,
+ emit_load2f_2,
+ emit_load2f_2,
+ emit_load2f_2 },
+
+ { emit_load3f_1,
+ emit_load3f_2,
+ emit_load3f_3,
+ emit_load3f_3 },
+
+ { emit_load4f_1,
+ emit_load4f_2,
+ emit_load4f_3,
+ emit_load4f_4 }
+};
+
+/* Emit a load of src (src_sz floats) into dest for an sz-float format.
+ * Both sz and src_sz must be in 1..4; neither is range-checked before
+ * indexing the table.
+ */
+static void emit_load( struct x86_program *p,
+ struct x86_reg dest,
+ unsigned sz,
+ struct x86_reg src,
+ unsigned src_sz)
+{
+ load[sz-1][src_sz-1](p, dest, src);
+}
+
+/* emit_storeNf: emit instructions that write the low N floats of the XMM
+ * register arg0 to the memory operand dest.
+ */
+static void emit_store4f( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg arg0 )
+{
+ sse_movups(&p->func, dest, arg0);
+}
+
+/* Three floats: either a full 16-byte store (outputs_safe) or a 2+1
+ * split.  The split shuffles arg0 in place, so its contents are not
+ * preserved for the caller.
+ */
+static void emit_store3f( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg arg0 )
+{
+ if (p->outputs_safe) {
+ /* Emit the extra dword anyway. This may hurt writecombining,
+ * may cause other problems.
+ */
+ sse_movups(&p->func, dest, arg0);
+ }
+ else {
+ /* Alternate strategy - emit two, shuffle, emit one.
+ */
+ sse_movlps(&p->func, dest, arg0);
+ sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
+ sse_movss(&p->func, x86_make_disp(dest,8), arg0);
+ }
+}
+
+static void emit_store2f( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg arg0 )
+{
+ sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_store1f( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg arg0 )
+{
+ sse_movss(&p->func, dest, arg0);
+}
+
+
+/* Store dispatch table, indexed by (float count - 1). */
+static void (*store[4])( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg arg0 ) =
+{
+ emit_store1f,
+ emit_store2f,
+ emit_store3f,
+ emit_store4f
+};
+
+/* Emit a store of the low sz floats of temp to dest.  sz must be in 1..4;
+ * it is not range-checked before indexing the table.
+ */
+static void emit_store( struct x86_program *p,
+ struct x86_reg dest,
+ unsigned sz,
+ struct x86_reg temp )
+
+{
+ store[sz-1](p, dest, temp);
+}
+
+/* Emit code that multiplies the four floats in temp by the scale held in
+ * p->chan0, converts them to integers, packs them down to four bytes
+ * (signed-saturating dword->word pack, then unsigned-saturating
+ * word->byte pack) and stores the resulting dword at dest.  temp is
+ * clobbered.  The non-SSE2 path uses MMX registers 0 and 1.
+ *
+ * NOTE(review): no code in this file loads a value into the chan0
+ * register; it is only assigned a register number in
+ * draw_vf_generate_sse_emit().  Confirm where the 255.0 scale is supposed
+ * to come from.
+ */
+static void emit_pack_store_4ub( struct x86_program *p,
+ struct x86_reg dest,
+ struct x86_reg temp )
+{
+ /* Scale by 255.0
+ */
+ sse_mulps(&p->func, temp, p->chan0);
+
+ if (p->have_sse2) {
+ sse2_cvtps2dq(&p->func, temp, temp);
+ sse2_packssdw(&p->func, temp, temp);
+ sse2_packuswb(&p->func, temp, temp);
+ sse_movss(&p->func, dest, temp);
+ }
+ else {
+ struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
+ struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);
+ /* Convert the low and then the high pair of floats separately. */
+ sse_cvtps2pi(&p->func, mmx0, temp);
+ sse_movhlps(&p->func, temp, temp);
+ sse_cvtps2pi(&p->func, mmx1, temp);
+ mmx_packssdw(&p->func, mmx0, mmx1);
+ mmx_packuswb(&p->func, mmx0, mmx0);
+ mmx_movd(&p->func, dest, mmx0);
+ }
+}
+
+/* Byte distance from a to b (that is, b - a), returned as an int. */
+static int get_offset( const void *a, const void *b )
+{
+   const char *from = (const char *) a;
+   const char *to = (const char *) b;
+
+   return to - from;
+}
+
+/* Not much happens here. Eventually use this function to try and
+ * avoid saving/reloading the source pointers each vertex (if some of
+ * them can fit in registers).
+ *
+ * Emits a load of a->inputptr into srcREG.  The field is addressed
+ * relative to vfREG, using a's byte offset inside the draw_vertex_fetch,
+ * so vfREG must hold the vf pointer at run time.
+ */
+static void get_src_ptr( struct x86_program *p,
+ struct x86_reg srcREG,
+ struct x86_reg vfREG,
+ struct draw_vf_attr *a )
+{
+ struct draw_vertex_fetch *vf = p->vf;
+ struct x86_reg ptr_to_src = x86_make_disp(vfREG, get_offset(vf, &a->inputptr));
+
+ /* Load current a[j].inputptr
+ */
+ x86_mov(&p->func, srcREG, ptr_to_src);
+}
+
+/* Emits code that advances srcREG by a->inputstride and writes the result
+ * back to a->inputptr (addressed relative to vfREG).  Nothing is emitted
+ * for a zero stride.  The stride is baked into the generated code as an
+ * immediate, so the code is only valid while the stride is unchanged.
+ */
+static void update_src_ptr( struct x86_program *p,
+ struct x86_reg srcREG,
+ struct x86_reg vfREG,
+ struct draw_vf_attr *a )
+{
+ if (a->inputstride) {
+ struct draw_vertex_fetch *vf = p->vf;
+ struct x86_reg ptr_to_src = x86_make_disp(vfREG, get_offset(vf, &a->inputptr));
+
+ /* add a[j].inputstride (hardcoded value - could just as easily
+ * pull the stride value from memory each time).
+ */
+ x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));
+
+ /* save new value of a[j].inputptr
+ */
+ x86_mov(&p->func, ptr_to_src, srcREG);
+ }
+}
+
+
+/* Generate the SSE emit function for p->vf into p->func.
+ *
+ * The generated function takes (vf, count, v) as arguments 1..3 and
+ * loops over count vertices, converting every attribute from its source
+ * pointer into the output vertex.
+ *
+ * Lots of hardcoding
+ *
+ * EAX -- pointer to current output vertex
+ * ECX -- pointer to the source data of the current attribute
+ * EBP -- remaining vertex count
+ * ESI -- pointer to the draw_vertex_fetch
+ *
+ * Returns TRUE and stores the generated function in vf->emit on success.
+ * Returns FALSE, leaving vf->emit untouched, when an attribute cannot be
+ * emitted; the partially built function is left in p->func for the caller
+ * to release.
+ */
+static boolean build_vertex_emit( struct x86_program *p )
+{
+   struct draw_vertex_fetch *vf = p->vf;
+   unsigned j = 0;
+
+   struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
+   struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
+   struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
+   struct x86_reg vfESI = x86_make_reg(file_REG32, reg_SI);
+   struct x86_reg temp = x86_make_reg(file_XMM, 0);
+   uint8_t *fixup, *label;
+
+   /* Push a few regs?
+    */
+   x86_push(&p->func, countEBP);
+   x86_push(&p->func, vfESI);
+
+
+   /* Get vertex count, compare to zero
+    */
+   x86_xor(&p->func, srcECX, srcECX);
+   x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
+   x86_cmp(&p->func, countEBP, srcECX);
+   fixup = x86_jcc_forward(&p->func, cc_E);
+
+   /* Initialize destination register.
+    */
+   x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));
+
+   /* Move argument 1 (vf) into a reg:
+    */
+   x86_mov(&p->func, vfESI, x86_fn_arg(&p->func, 1));
+
+
+   /* always load, needed or not:
+    *
+    * NOTE(review): only the identity register is loaded here.  The ubyte
+    * cases below multiply by p->chan0, which nothing in this file loads --
+    * confirm where the 255.0 scale is meant to come from.
+    */
+   sse_movups(&p->func, p->identity, x86_make_disp(vfESI, get_offset(vf, &vf->identity[0])));
+
+   /* Note address for loop jump */
+   label = x86_get_label(&p->func);
+
+   /* Emit code for each of the attributes.  Currently routes
+    * everything through SSE registers, even when it might be more
+    * efficient to stick with regular old x86.  No optimization or
+    * other tricks - enough new ground to cover here just getting
+    * things working.
+    */
+   while (j < vf->attr_count) {
+      struct draw_vf_attr *a = &vf->attr[j];
+      struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);
+
+      /* Now, load an XMM reg from src, perhaps transform, then save.
+       * Could be shortcircuited in specific cases:
+       */
+      switch (a->format) {
+      case DRAW_EMIT_1F:
+      case DRAW_EMIT_1F_CONST:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
+         emit_store(p, dest, 1, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case DRAW_EMIT_2F:
+      case DRAW_EMIT_2F_CONST:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+         emit_store(p, dest, 2, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case DRAW_EMIT_3F:
+      case DRAW_EMIT_3F_CONST:
+         /* Potentially the worst case - hardcode 2+1 copying:
+          */
+         if (0) {
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+            emit_store(p, dest, 3, temp);
+            update_src_ptr(p, srcECX, vfESI, a);
+         }
+         else {
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+            emit_store(p, dest, 2, temp);
+            if (a->inputsize > 2) {
+               emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
+               emit_store(p, x86_make_disp(dest,8), 1, temp);
+            }
+            else {
+               sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
+            }
+            update_src_ptr(p, srcECX, vfESI, a);
+         }
+         break;
+      case DRAW_EMIT_4F:
+      case DRAW_EMIT_4F_CONST:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         emit_store(p, dest, 4, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case DRAW_EMIT_3F_XYW:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
+         emit_store(p, dest, 3, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+
+      case DRAW_EMIT_1UB_1F:
+         /* Test for PAD3 + 1UB:
+          *
+          * The store below writes four bytes starting at dest - 3, so the
+          * attribute must sit at least 3 bytes into the vertex and the
+          * previous attribute must end at least 3 bytes before it.  The
+          * test is written as an addition: vertoffset is unsigned, so
+          * "vertoffset - 3" would wrap for offsets below 3 and wrongly
+          * pass.
+          */
+         if (j > 0 &&
+             a->vertoffset >= 3 &&
+             a[-1].vertoffset + a[-1].vertattrsize + 3 <= a->vertoffset)
+         {
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
+            sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
+            emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
+            update_src_ptr(p, srcECX, vfESI, a);
+         }
+         else {
+            /* There is no previous attribute when j == 0; report zeros
+             * rather than reading a[-1] out of bounds.
+             */
+            const unsigned prev_offset = (j > 0) ? a[-1].vertoffset : 0;
+            const unsigned prev_size = (j > 0) ? a[-1].vertattrsize : 0;
+
+            debug_printf("Can't emit 1ub %x %x %d\n",
+                         a->vertoffset, prev_offset, prev_size );
+            return FALSE;
+         }
+         break;
+      case DRAW_EMIT_3UB_3F_RGB:
+      case DRAW_EMIT_3UB_3F_BGR:
+         /* Test for 3UB + PAD1:
+          */
+         if (j == vf->attr_count - 1 ||
+             a[1].vertoffset >= a->vertoffset + 4) {
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+            if (a->format == DRAW_EMIT_3UB_3F_BGR)
+               sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
+            emit_pack_store_4ub(p, dest, temp);
+            update_src_ptr(p, srcECX, vfESI, a);
+         }
+         /* Test for 3UB + 1UB:
+          */
+         else if (j < vf->attr_count - 1 &&
+                  a[1].format == DRAW_EMIT_1UB_1F &&
+                  a[1].vertoffset == a->vertoffset + 3) {
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+            update_src_ptr(p, srcECX, vfESI, a);
+
+            /* Make room for incoming value:
+             */
+            sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
+
+            get_src_ptr(p, srcECX, vfESI, &a[1]);
+            emit_load(p, temp, 1, x86_deref(srcECX), a[1].inputsize);
+            update_src_ptr(p, srcECX, vfESI, &a[1]);
+
+            /* Rearrange and possibly do BGR conversion:
+             */
+            if (a->format == DRAW_EMIT_3UB_3F_BGR)
+               sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
+            else
+               sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));
+
+            emit_pack_store_4ub(p, dest, temp);
+            j++; /* NOTE: two attrs consumed */
+         }
+         else {
+            debug_printf("Can't emit 3ub\n");
+         }
+         /* The 3UB formats are deliberately rejected for now, even when
+          * one of the branches above emitted code for them.
+          */
+         return FALSE; /* add this later */
+         break;
+
+      case DRAW_EMIT_4UB_4F_RGBA:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         emit_pack_store_4ub(p, dest, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case DRAW_EMIT_4UB_4F_BGRA:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
+         emit_pack_store_4ub(p, dest, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case DRAW_EMIT_4UB_4F_ARGB:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
+         emit_pack_store_4ub(p, dest, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case DRAW_EMIT_4UB_4F_ABGR:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
+         emit_pack_store_4ub(p, dest, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      default:
+         debug_printf("unknown a[%d].format %d\n", j, a->format);
+         return FALSE; /* catch any new opcodes */
+      }
+
+      /* Increment j by at least 1 - may have been incremented above also:
+       */
+      j++;
+   }
+
+   /* Next vertex:
+    */
+   x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vf->vertex_stride));
+
+   /* decr count, loop if not zero
+    */
+   x86_dec(&p->func, countEBP);
+   x86_test(&p->func, countEBP, countEBP);
+   x86_jcc(&p->func, cc_NZ, label);
+
+   /* Exit mmx state?
+    */
+   if (p->func.need_emms)
+      mmx_emms(&p->func);
+
+   /* Land forward jump here:
+    */
+   x86_fixup_fwd_jump(&p->func, fixup);
+
+   /* Pop regs and return
+    */
+   x86_pop(&p->func, x86_get_base_reg(vfESI));
+   x86_pop(&p->func, countEBP);
+   x86_ret(&p->func);
+
+   vf->emit = (draw_vf_emit_func)x86_get_func(&p->func);
+   return TRUE;
+}
+
+
+
+/* Try to generate an SSE emit function for vf's current layout.
+ *
+ * Without SSE support (cpu_has_xmm false) this only clears
+ * vf->codegen_emit.  Otherwise the result of build_vertex_emit() is
+ * reported through draw_vf_register_fastpath(): TRUE after a successful
+ * build (vf->emit then points at the generated code), FALSE after a
+ * failure, in which case the code buffer is released.
+ */
+void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
+{
+ struct x86_program p;
+
+ if (!cpu_has_xmm) {
+ vf->codegen_emit = NULL;
+ return;
+ }
+
+ memset(&p, 0, sizeof(p));
+
+ p.vf = vf;
+ p.inputs_safe = 0; /* for now */
+ p.outputs_safe = 1; /* for now */
+ p.have_sse2 = cpu_has_xmm2;
+ /* Fixed XMM register assignments; build_vertex_emit uses XMM0 as its temp. */
+ p.identity = x86_make_reg(file_XMM, 6);
+ p.chan0 = x86_make_reg(file_XMM, 7);
+
+ x86_init_func(&p.func);
+
+ if (build_vertex_emit(&p)) {
+ draw_vf_register_fastpath( vf, TRUE );
+ }
+ else {
+ /* Note the failure so that we don't keep trying to codegen an
+ * impossible state:
+ */
+ draw_vf_register_fastpath( vf, FALSE );
+ x86_release_func(&p.func);
+ }
+}
+
+#else
+
+/* Dummy version for when USE_SSE_ASM not defined: generates nothing and
+ * leaves vf untouched.
+ */
+void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
+{
+   (void) vf;   /* unused; avoids an unused-parameter warning */
+}
+
+#endif