From cd3643698eafa0869a8317b002e5b066de0172e7 Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 23 Jan 2008 12:48:41 -0700 Subject: gallium: overhaul usage of vertex_info in draw module. Remove all dependencies on vertex_info, except for draw_vbuf. Drawing stages now strictly operate on post-transformed vertices and don't know anything about hw vertices. Use vertex program output info for two-side/flat/etc stages. Temporarily disable vbuf module in softpipe driver. --- src/mesa/pipe/draw/draw_vertex_shader.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'src/mesa/pipe/draw/draw_vertex_shader.c') diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c index c2e038453e..5ca659dbf5 100644 --- a/src/mesa/pipe/draw/draw_vertex_shader.c +++ b/src/mesa/pipe/draw/draw_vertex_shader.c @@ -38,7 +38,6 @@ #endif #include "draw_private.h" #include "draw_context.h" -#include "draw_vertex.h" #include "x86/rtasm/x86sse.h" #include "pipe/llvm/gallivm.h" @@ -176,7 +175,7 @@ run_vertex_program(struct draw_context *draw, /* Remaining attributes are packed into sequential post-transform * vertex attrib slots. 
*/ - for (slot = 1; slot < draw->vertex_info.num_attribs; slot++) { + for (slot = 1; slot < draw->num_vs_outputs; slot++) { vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j]; vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j]; vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j]; @@ -275,6 +274,8 @@ draw_bind_vertex_shader(struct draw_context *draw, draw_flush(draw); draw->vertex_shader = dvs; + draw->num_vs_outputs = dvs->state->num_outputs; + /* specify the fragment program to interpret/execute */ tgsi_exec_machine_init(&draw->machine, draw->vertex_shader->state->tokens, -- cgit v1.2.3 From 1603a33fb276d7e78a2e872dfa05aa0093d1329a Mon Sep 17 00:00:00 2001 From: Brian Date: Fri, 25 Jan 2008 17:21:05 -0700 Subject: gallium: better flush logic in draw module This is the other half of Keith's draw/flush patch. There are now 5 flush flags to control what's flushed (post-xform vertex cache, prim cache, vbuf, etc). The gears slow-down in this part of the patch was due to the cull stage not getting invoked. It was unconditional before, but is now gated by 'need_det'. But it also needs to be gated by draw->rasterizer->cull_mode. Gears uses back-face culling. 
--- src/mesa/pipe/draw/draw_context.c | 29 ++++++----- src/mesa/pipe/draw/draw_prim.c | 85 +++++++++++++-------------------- src/mesa/pipe/draw/draw_private.h | 16 +++---- src/mesa/pipe/draw/draw_validate.c | 32 ++++++++----- src/mesa/pipe/draw/draw_vbuf.c | 17 +++---- src/mesa/pipe/draw/draw_vertex_cache.c | 9 ++-- src/mesa/pipe/draw/draw_vertex_shader.c | 4 +- 7 files changed, 89 insertions(+), 103 deletions(-) (limited to 'src/mesa/pipe/draw/draw_vertex_shader.c') diff --git a/src/mesa/pipe/draw/draw_context.c b/src/mesa/pipe/draw/draw_context.c index ff23288fa8..e8ca1f035b 100644 --- a/src/mesa/pipe/draw/draw_context.c +++ b/src/mesa/pipe/draw/draw_context.c @@ -80,7 +80,7 @@ struct draw_context *draw_create( void ) draw->convert_wide_points = TRUE; draw->convert_wide_lines = TRUE; - draw->prim = ~0; /* != any of PIPE_PRIM_x */ + draw->reduced_prim = ~0; /* != any of PIPE_PRIM_x */ draw_vertex_cache_invalidate( draw ); draw_set_mapped_element_buffer( draw, 0, NULL ); @@ -111,8 +111,7 @@ void draw_destroy( struct draw_context *draw ) void draw_flush( struct draw_context *draw ) { - if (draw->drawing) - draw_do_flush( draw, DRAW_FLUSH_DRAW ); + draw_do_flush( draw, DRAW_FLUSH_BACKEND ); } @@ -124,7 +123,8 @@ void draw_flush( struct draw_context *draw ) void draw_set_rasterizer_state( struct draw_context *draw, const struct pipe_rasterizer_state *raster ) { - draw_flush( draw ); + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + draw->rasterizer = raster; } @@ -137,7 +137,8 @@ void draw_set_rasterizer_state( struct draw_context *draw, void draw_set_rasterize_stage( struct draw_context *draw, struct draw_stage *stage ) { - draw_flush( draw ); + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + draw->pipeline.rasterize = stage; } @@ -148,7 +149,7 @@ void draw_set_rasterize_stage( struct draw_context *draw, void draw_set_clip_state( struct draw_context *draw, const struct pipe_clip_state *clip ) { - draw_flush( draw ); + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE 
); assert(clip->nr <= PIPE_MAX_CLIP_PLANES); memcpy(&draw->plane[6], clip->ucp, clip->nr * sizeof(clip->ucp[0])); @@ -162,7 +163,7 @@ void draw_set_clip_state( struct draw_context *draw, void draw_set_viewport_state( struct draw_context *draw, const struct pipe_viewport_state *viewport ) { - draw_flush( draw ); + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); draw->viewport = *viewport; /* struct copy */ } @@ -173,8 +174,7 @@ draw_set_vertex_buffer(struct draw_context *draw, unsigned attr, const struct pipe_vertex_buffer *buffer) { - draw_flush( draw ); - + draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ ); assert(attr < PIPE_ATTRIB_MAX); draw->vertex_buffer[attr] = *buffer; } @@ -185,8 +185,7 @@ draw_set_vertex_element(struct draw_context *draw, unsigned attr, const struct pipe_vertex_element *element) { - draw_flush( draw ); - + draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ ); assert(attr < PIPE_ATTRIB_MAX); draw->vertex_element[attr] = *element; } @@ -199,8 +198,7 @@ void draw_set_mapped_vertex_buffer(struct draw_context *draw, unsigned attr, const void *buffer) { - draw_flush( draw ); - + draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ ); draw->user.vbuffer[attr] = buffer; } @@ -209,8 +207,7 @@ void draw_set_mapped_constant_buffer(struct draw_context *draw, const void *buffer) { - draw_flush( draw ); - + draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ ); draw->user.constants = buffer; } @@ -222,6 +219,7 @@ draw_set_mapped_constant_buffer(struct draw_context *draw, void draw_convert_wide_points(struct draw_context *draw, boolean enable) { + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); draw->convert_wide_points = enable; } @@ -233,6 +231,7 @@ draw_convert_wide_points(struct draw_context *draw, boolean enable) void draw_convert_wide_lines(struct draw_context *draw, boolean enable) { + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); draw->convert_wide_lines = enable; } diff --git 
a/src/mesa/pipe/draw/draw_prim.c b/src/mesa/pipe/draw/draw_prim.c index 5703f5f0b0..243381aec0 100644 --- a/src/mesa/pipe/draw/draw_prim.c +++ b/src/mesa/pipe/draw/draw_prim.c @@ -57,17 +57,14 @@ static unsigned reduced_prim[PIPE_PRIM_POLYGON + 1] = { static void draw_prim_queue_flush( struct draw_context *draw ) { - // struct draw_stage *first = draw->pipeline.first; unsigned i; if (0) fprintf(stdout,"Flushing with %d prims, %d verts\n", draw->pq.queue_nr, draw->vs.queue_nr); - /* Make sure all vertices are available/shaded: - */ - if (draw->vs.queue_nr) - draw_vertex_shader_queue_flush(draw); + if (draw->pq.queue_nr == 0) + return; /* NOTE: we cannot save draw->pipeline->first in a local var because * draw->pipeline->first is often changed by the first call to tri(), @@ -102,33 +99,32 @@ static void draw_prim_queue_flush( struct draw_context *draw ) } -void draw_do_flush( struct draw_context *draw, - unsigned flush ) + +void draw_do_flush( struct draw_context *draw, unsigned flags ) { - if ((flush & (DRAW_FLUSH_PRIM_QUEUE | - DRAW_FLUSH_VERTEX_CACHE_INVALIDATE | - DRAW_FLUSH_DRAW)) && - draw->pq.queue_nr) - { - draw_prim_queue_flush(draw); - } + if (0) + fprintf(stdout,"Flushing with %d verts, %d prims\n", + draw->vs.queue_nr, + draw->pq.queue_nr ); - if ((flush & (DRAW_FLUSH_VERTEX_CACHE_INVALIDATE | - DRAW_FLUSH_DRAW)) && - draw->drawing) - { - draw_vertex_cache_invalidate(draw); - } - if ((flush & DRAW_FLUSH_DRAW) && - draw->drawing) - { - draw->pipeline.first->flush( draw->pipeline.first, ~0 ); - draw->drawing = FALSE; - draw->prim = ~0; - draw->pipeline.first = draw->pipeline.validate; - } + if (flags >= DRAW_FLUSH_SHADER_QUEUE) { + draw_vertex_shader_queue_flush(draw); + + if (flags >= DRAW_FLUSH_PRIM_QUEUE) { + draw_prim_queue_flush(draw); + if (flags >= DRAW_FLUSH_VERTEX_CACHE) { + draw_vertex_cache_invalidate(draw); + + if (flags >= DRAW_FLUSH_STATE_CHANGE) { + draw->pipeline.first->flush( draw->pipeline.first, flags ); + draw->pipeline.first = 
draw->pipeline.validate; + draw->reduced_prim = ~0; + } + } + } + } } @@ -143,7 +139,7 @@ static struct prim_header *get_queued_prim( struct draw_context *draw, { if (!draw_vertex_cache_check_space( draw, nr_verts )) { // fprintf(stderr, "v"); - draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE_INVALIDATE ); + draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE ); } else if (draw->pq.queue_nr == PRIM_QUEUE_LENGTH) { // fprintf(stderr, "p"); @@ -251,13 +247,14 @@ static void do_quad( struct draw_context *draw, * Main entrypoint to draw some number of points/lines/triangles */ static void -draw_prim( struct draw_context *draw, unsigned start, unsigned count ) +draw_prim( struct draw_context *draw, + unsigned prim, unsigned start, unsigned count ) { unsigned i; // _mesa_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count ); - switch (draw->prim) { + switch (prim) { case PIPE_PRIM_POINTS: for (i = 0; i < count; i ++) { do_point( draw, @@ -389,21 +386,6 @@ draw_prim( struct draw_context *draw, unsigned start, unsigned count ) } -static void -draw_set_prim( struct draw_context *draw, unsigned prim ) -{ - assert(prim >= PIPE_PRIM_POINTS); - assert(prim <= PIPE_PRIM_POLYGON); - - if (reduced_prim[prim] != draw->reduced_prim) { - draw_do_flush( draw, DRAW_FLUSH_PRIM_QUEUE ); - draw->reduced_prim = reduced_prim[prim]; - } - - draw->prim = prim; -} - - /** @@ -417,16 +399,13 @@ void draw_arrays(struct draw_context *draw, unsigned prim, unsigned start, unsigned count) { - if (!draw->drawing) { - draw->drawing = TRUE; - } - - if (draw->prim != prim) { - draw_set_prim( draw, prim ); + if (reduced_prim[prim] != draw->reduced_prim) { + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + draw->reduced_prim = reduced_prim[prim]; } /* drawing done here: */ - draw_prim(draw, start, count); + draw_prim(draw, prim, start, count); } diff --git a/src/mesa/pipe/draw/draw_private.h b/src/mesa/pipe/draw/draw_private.h index e393fa5fe2..1c2e88264f 100644 --- a/src/mesa/pipe/draw/draw_private.h 
+++ b/src/mesa/pipe/draw/draw_private.h @@ -111,7 +111,7 @@ struct draw_stage struct prim_header * ); void (*flush)( struct draw_stage *, - unsigned flags ); + unsigned flags ); void (*reset_stipple_counter)( struct draw_stage * ); @@ -191,8 +191,6 @@ struct draw_context boolean convert_wide_points; /**< convert wide points to tris? */ boolean convert_wide_lines; /**< convert side lines to tris? */ - boolean drawing; /**< do we presently have something queued for drawing? */ - unsigned prim; /**< current prim type: PIPE_PRIM_x */ unsigned reduced_prim; /** TGSI program interpreter runtime state */ @@ -278,14 +276,14 @@ extern void draw_vertex_fetch( struct draw_context *draw, unsigned count ); -#define DRAW_FLUSH_PRIM_QUEUE 0x1 -#define DRAW_FLUSH_VERTEX_CACHE_INVALIDATE 0x2 -#define DRAW_FLUSH_DRAW 0x4 +#define DRAW_FLUSH_SHADER_QUEUE 0x1 /* sized not to overflow, never raised */ +#define DRAW_FLUSH_PRIM_QUEUE 0x2 +#define DRAW_FLUSH_VERTEX_CACHE 0x4 +#define DRAW_FLUSH_STATE_CHANGE 0x8 +#define DRAW_FLUSH_BACKEND 0x10 -void draw_do_flush( struct draw_context *draw, - unsigned flags ); - +void draw_do_flush( struct draw_context *draw, unsigned flags ); diff --git a/src/mesa/pipe/draw/draw_validate.c b/src/mesa/pipe/draw/draw_validate.c index a626fb1fba..86d5a5f814 100644 --- a/src/mesa/pipe/draw/draw_validate.c +++ b/src/mesa/pipe/draw/draw_validate.c @@ -43,6 +43,13 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage ) { struct draw_context *draw = stage->draw; struct draw_stage *next = draw->pipeline.rasterize; + int need_det = 0; + int precalc_flat = 0; + + /* Set the validate's next stage to the rasterize stage, so that it + * can be found later if needed for flushing. + */ + stage->next = next; /* * NOTE: we build up the pipeline in end-to-start order. 
@@ -61,29 +68,38 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage ) if (draw->rasterizer->line_stipple_enable) { draw->pipeline.stipple->next = next; next = draw->pipeline.stipple; + precalc_flat = 1; /* only needed for lines really */ } if (draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL || draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL) { draw->pipeline.unfilled->next = next; next = draw->pipeline.unfilled; + precalc_flat = 1; /* only needed for triangles really */ + need_det = 1; } if (draw->rasterizer->offset_cw || draw->rasterizer->offset_ccw) { draw->pipeline.offset->next = next; next = draw->pipeline.offset; + need_det = 1; } if (draw->rasterizer->light_twoside) { draw->pipeline.twoside->next = next; next = draw->pipeline.twoside; + need_det = 1; } /* Always run the cull stage as we calculate determinant there - * also. Fix this.. + * also. + * + * This can actually be a win as culling out the triangles can lead + * to less work emitting vertices, smaller vertex buffers, etc. + * It's difficult to say whether this will be true in general. */ - { + if (need_det || draw->rasterizer->cull_mode) { draw->pipeline.cull->next = next; next = draw->pipeline.cull; } @@ -94,23 +110,18 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage ) { draw->pipeline.clip->next = next; next = draw->pipeline.clip; + precalc_flat = 1; /* XXX: FIX ME! Only needed for clipped prims */ } - /* Do software flatshading prior to clipping. XXX: should only do - * this for clipped primitives, ie it is a part of the clip - * routine. 
- */ - if (draw->rasterizer->flatshade) { + if (draw->rasterizer->flatshade && precalc_flat) { draw->pipeline.flatshade->next = next; next = draw->pipeline.flatshade; } - + draw->pipeline.first = next; - //BP draw->pipeline.first->begin( draw->pipeline.first ); return next; } - static void validate_tri( struct draw_stage *stage, struct prim_header *header ) { @@ -162,7 +173,6 @@ struct draw_stage *draw_validate_stage( struct draw_context *draw ) struct draw_stage *stage = CALLOC_STRUCT(draw_stage); stage->draw = draw; - stage->next = NULL; stage->point = validate_point; stage->line = validate_line; diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c index d827f51d56..cd0b4fbbb9 100644 --- a/src/mesa/pipe/draw/draw_vbuf.c +++ b/src/mesa/pipe/draw/draw_vbuf.c @@ -387,29 +387,26 @@ vbuf_alloc_vertices( struct draw_stage *stage, } -static void -vbuf_begin( struct draw_stage *stage ) -{ - /* no-op, vbuffer allocated by first point/line/tri */ -} - static void vbuf_flush( struct draw_stage *stage, unsigned flags ) { -// vbuf_flush_indices( stage ); - /* XXX: Overkill */ - vbuf_flush_vertices( stage ); - + vbuf_flush_indices( stage ); + stage->point = vbuf_first_point; stage->line = vbuf_first_line; stage->tri = vbuf_first_tri; + + if (flags & DRAW_FLUSH_BACKEND) + vbuf_flush_vertices( stage ); } static void vbuf_reset_stipple_counter( struct draw_stage *stage ) { + /* XXX: Need to do something here for hardware with linestipple. 
+ */ (void) stage; } diff --git a/src/mesa/pipe/draw/draw_vertex_cache.c b/src/mesa/pipe/draw/draw_vertex_cache.c index 97a40b876e..b4b4906d70 100644 --- a/src/mesa/pipe/draw/draw_vertex_cache.c +++ b/src/mesa/pipe/draw/draw_vertex_cache.c @@ -42,10 +42,13 @@ void draw_vertex_cache_invalidate( struct draw_context *draw ) assert(draw->pq.queue_nr == 0); assert(draw->vs.queue_nr == 0); assert(draw->vcache.referenced == 0); - + /* XXX memset() here */ +#if 0 for (i = 0; i < Elements( draw->vcache.idx ); i++) draw->vcache.idx[i] = ~0; - +#else + memset(draw->vcache.idx, ~0, sizeof(draw->vcache.idx)); +#endif // fprintf(stderr, "x\n"); } @@ -148,7 +151,7 @@ void draw_vertex_cache_unreference( struct draw_context *draw ) int draw_vertex_cache_check_space( struct draw_context *draw, - unsigned nr_verts ) + unsigned nr_verts ) { if (draw->vcache.overflow + nr_verts < VCACHE_OVERFLOW) { /* The vs queue is sized so that this can never happen: diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c index 5ca659dbf5..d19b60198d 100644 --- a/src/mesa/pipe/draw/draw_vertex_shader.c +++ b/src/mesa/pipe/draw/draw_vertex_shader.c @@ -271,9 +271,9 @@ void draw_bind_vertex_shader(struct draw_context *draw, struct draw_vertex_shader *dvs) { - draw_flush(draw); - draw->vertex_shader = dvs; + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + draw->vertex_shader = dvs; draw->num_vs_outputs = dvs->state->num_outputs; /* specify the fragment program to interpret/execute */ -- cgit v1.2.3 From 027983f5850afea753381be454122166c6d56777 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Thu, 24 Jan 2008 11:19:06 +0000 Subject: gallium: restructure vertex fetch code slightly --- src/mesa/pipe/draw/draw_private.h | 17 +++ src/mesa/pipe/draw/draw_vertex_fetch.c | 193 +++++++++++++++++++++----------- src/mesa/pipe/draw/draw_vertex_shader.c | 4 + 3 files changed, 149 insertions(+), 65 deletions(-) (limited to 'src/mesa/pipe/draw/draw_vertex_shader.c') diff 
--git a/src/mesa/pipe/draw/draw_private.h b/src/mesa/pipe/draw/draw_private.h index 1c2e88264f..1e59f5bd8d 100644 --- a/src/mesa/pipe/draw/draw_private.h +++ b/src/mesa/pipe/draw/draw_private.h @@ -137,6 +137,13 @@ struct draw_vertex_shader { #endif }; + +/* Internal function for vertex fetch. + */ +typedef void (*fetch_func)(const void *ptr, float *attrib); + + + /** * Private context for the drawing module. */ @@ -196,6 +203,15 @@ struct draw_context /** TGSI program interpreter runtime state */ struct tgsi_exec_machine machine; + /* Vertex fetch internal state + */ + struct { + const ubyte *src_ptr[PIPE_ATTRIB_MAX]; + unsigned pitch[PIPE_ATTRIB_MAX]; + fetch_func fetch[PIPE_ATTRIB_MAX]; + unsigned nr_attrs; + } vertex_fetch; + /* Post-tnl vertex cache: */ struct { @@ -270,6 +286,7 @@ extern void draw_vertex_shader_queue_flush_llvm( struct draw_context *draw ); struct tgsi_exec_machine; +extern void draw_update_vertex_fetch( struct draw_context *draw ); extern void draw_vertex_fetch( struct draw_context *draw, struct tgsi_exec_machine *machine, const unsigned *elts, diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c index 4d64d3d4f2..23e8187899 100644 --- a/src/mesa/pipe/draw/draw_vertex_fetch.c +++ b/src/mesa/pipe/draw/draw_vertex_fetch.c @@ -42,53 +42,108 @@ /** * Fetch a float[4] vertex attribute from memory, doing format/type * conversion as needed. - * XXX this might be a temporary thing. + * + * This is probably needed/duplicated elsewhere, eg format + * conversion, texture sampling etc. 
*/ -static void -fetch_attrib4(const void *ptr, enum pipe_format format, float attrib[4]) +#define FETCH_ATTRIB( NAME, SZ, CVT ) \ +static void \ +fetch_##NAME(const void *ptr, float *attrib) \ +{ \ + static const float defaults[4] = { 0,0,0,1 }; \ + int i; \ + \ + for (i = 0; i < SZ; i++) { \ + attrib[i] = CVT; \ + } \ + \ + for (; i < 4; i++) { \ + attrib[i] = defaults[i]; \ + } \ +} + +#define CVT_32_FLOAT ((float *) ptr)[i] +#define CVT_32_SSCALED (float) ((int *) ptr)[i] +#define CVT_8_UNORM (float) ((unsigned char *) ptr)[i] / 255.0f + +FETCH_ATTRIB( R32G32B32A32_FLOAT, 4, CVT_32_FLOAT ) +FETCH_ATTRIB( R32G32B32_FLOAT, 3, CVT_32_FLOAT ) +FETCH_ATTRIB( R32G32_FLOAT, 2, CVT_32_FLOAT ) +FETCH_ATTRIB( R32_FLOAT, 1, CVT_32_FLOAT ) +FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED ) +FETCH_ATTRIB( R32G32B32_SSCALED, 3, CVT_32_SSCALED ) +FETCH_ATTRIB( R32G32_SSCALED, 2, CVT_32_SSCALED ) +FETCH_ATTRIB( R32_SSCALED, 1, CVT_32_SSCALED ) +FETCH_ATTRIB( A8R8G8B8_UNORM, 4, CVT_8_UNORM ) +FETCH_ATTRIB( R8G8B8A8_UNORM, 4, CVT_8_UNORM ) + + + +static fetch_func get_fetch_func( unsigned format ) { - /* defaults */ - attrib[1] = 0.0; - attrib[2] = 0.0; - attrib[3] = 1.0; switch (format) { case PIPE_FORMAT_R32G32B32A32_FLOAT: - attrib[3] = ((float *) ptr)[3]; - /* fall-through */ + return fetch_R32G32B32A32_FLOAT; case PIPE_FORMAT_R32G32B32_FLOAT: - attrib[2] = ((float *) ptr)[2]; - /* fall-through */ + return fetch_R32G32B32_FLOAT; case PIPE_FORMAT_R32G32_FLOAT: - attrib[1] = ((float *) ptr)[1]; - /* fall-through */ + return fetch_R32G32_FLOAT; case PIPE_FORMAT_R32_FLOAT: - attrib[0] = ((float *) ptr)[0]; - break; - + return fetch_R32_FLOAT; case PIPE_FORMAT_R32G32B32A32_SSCALED: - attrib[3] = (float) ((int *) ptr)[3]; - /* fall-through */ + return fetch_R32G32B32A32_SSCALED; case PIPE_FORMAT_R32G32B32_SSCALED: - attrib[2] = (float) ((int *) ptr)[2]; - /* fall-through */ + return fetch_R32G32B32_SSCALED; case PIPE_FORMAT_R32G32_SSCALED: - attrib[1] = (float) ((int *) 
ptr)[1]; - /* fall-through */ + return fetch_R32G32_SSCALED; case PIPE_FORMAT_R32_SSCALED: - attrib[0] = (float) ((int *) ptr)[0]; - break; - + return fetch_R32_SSCALED; case PIPE_FORMAT_A8R8G8B8_UNORM: + return fetch_A8R8G8B8_UNORM; case PIPE_FORMAT_R8G8B8A8_UNORM: - attrib[0] = (float) ((unsigned char *) ptr)[2] / 255.0f; - attrib[1] = (float) ((unsigned char *) ptr)[1] / 255.0f; - attrib[2] = (float) ((unsigned char *) ptr)[0] / 255.0f; - attrib[3] = (float) ((unsigned char *) ptr)[3] / 255.0f; - break; - + return fetch_R8G8B8A8_UNORM; default: + /* Lots of missing cases! */ assert(0); + return NULL; + } +} + + +static void +transpose_4x4( float *out, const float *in ) +{ + /* This can be achieved in 12 sse instructions, plus the final + * stores I guess. This is probably a bit more than that - maybe + * 32 or so? + */ + out[0] = in[0]; out[1] = in[4]; out[2] = in[8]; out[3] = in[12]; + out[4] = in[1]; out[5] = in[5]; out[6] = in[9]; out[7] = in[13]; + out[8] = in[2]; out[9] = in[6]; out[10] = in[10]; out[11] = in[14]; + out[12] = in[3]; out[13] = in[7]; out[14] = in[11]; out[15] = in[15]; +} + + + +void draw_update_vertex_fetch( struct draw_context *draw ) +{ + //unsigned nr_attrs = draw->vertex_element_count; + unsigned nr_attrs = draw->vertex_shader->state->num_inputs; + unsigned i; + + for (i = 0; i < nr_attrs; i++) { + unsigned buf = draw->vertex_element[i].vertex_buffer_index; + unsigned format = draw->vertex_element[i].src_format; + + draw->vertex_fetch.src_ptr[i] = (const ubyte *) (draw->user.vbuffer[buf] + + draw->vertex_buffer[buf].buffer_offset + + draw->vertex_element[i].src_offset ); + + draw->vertex_fetch.pitch[i] = draw->vertex_buffer[buf].pitch; + draw->vertex_fetch.fetch[i] = get_fetch_func( format ); } + + draw->vertex_fetch.nr_attrs = nr_attrs; } @@ -100,40 +155,48 @@ void draw_vertex_fetch( struct draw_context *draw, const unsigned *elts, unsigned count ) { - unsigned j; - - /* loop over vertices */ - for (j = 0; j < count; j++) { - uint 
attr; - -#if DRAW_DBG - printf("fetch vertex %u: \n", j); -#endif - - /* loop over vertex attributes (vertex shader inputs) */ - for (attr = 0; attr < draw->vertex_shader->state->num_inputs; attr++) { - - unsigned buf = draw->vertex_element[attr].vertex_buffer_index; - const void *src - = (const void *) ((const ubyte *) draw->user.vbuffer[buf] - + draw->vertex_buffer[buf].buffer_offset - + draw->vertex_element[attr].src_offset - + elts[j] * draw->vertex_buffer[buf].pitch); - float p[4]; - - fetch_attrib4(src, draw->vertex_element[attr].src_format, p); - -#if DRAW_DBG - printf(" %u: %f %f %f %f\n", attr, p[0], p[1], p[2], p[3]); -#endif - - /* Transform to AoS xxxx/yyyy/zzzz/wwww representation: - */ - machine->Inputs[attr].xyzw[0].f[j] = p[0]; /*X*/ - machine->Inputs[attr].xyzw[1].f[j] = p[1]; /*Y*/ - machine->Inputs[attr].xyzw[2].f[j] = p[2]; /*Z*/ - machine->Inputs[attr].xyzw[3].f[j] = p[3]; /*W*/ - } + unsigned nr_attrs = draw->vertex_fetch.nr_attrs; + unsigned attr; + + assert(count <= 4); + +// _mesa_printf("%s %d\n", __FUNCTION__, count); + + /* loop over vertex attributes (vertex shader inputs) + */ + for (attr = 0; attr < nr_attrs; attr++) { + + const unsigned pitch = draw->vertex_fetch.pitch[attr]; + const ubyte *src = draw->vertex_fetch.src_ptr[attr]; + const fetch_func fetch = draw->vertex_fetch.fetch[attr]; + unsigned i; + float p[4][4]; + + + /* Fetch four attributes for four vertices. + * + * Could fetch directly into AOS format, but this is meant to be + * a prototype for an sse implementation, which would have + * difficulties doing that. + */ + for (i = 0; i < count; i++) + fetch( src + elts[i] * pitch, p[i] ); + + /* Be nice and zero out any missing vertices: + */ + for ( ; i < 4; i++) + p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0; + + /* Transpose/swizzle into sse-friendly format. Currently + * assuming that all vertex shader inputs are float[4], but this + * isn't true -- if the vertex shader only wants tex0.xy, we + * could optimize for that. 
+ * + * To do so fully without codegen would probably require an + * excessive number of fetch functions, but we could at least + * minimize the transpose step: + */ + transpose_4x4( (float *)&machine->Inputs[attr].xyzw[0].f[0], (float *)p ); } } diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c index d19b60198d..3041974b9a 100644 --- a/src/mesa/pipe/draw/draw_vertex_shader.c +++ b/src/mesa/pipe/draw/draw_vertex_shader.c @@ -201,6 +201,10 @@ draw_vertex_shader_queue_flush(struct draw_context *draw) { unsigned i, j; + /* XXX: do this on statechange: + */ + draw_update_vertex_fetch( draw ); + // fprintf(stderr, " q(%d) ", draw->vs.queue_nr ); #ifdef MESA_LLVM if (draw->vertex_shader->llvm_prog) { -- cgit v1.2.3