From 8b25b5256fad23e8ea11c6931ecac658ca60c0b0 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Tue, 13 May 2008 09:46:53 +0100 Subject: draw: remove disabled non-sse swizzle code --- src/gallium/auxiliary/draw/draw_vs_sse.c | 50 +------------------------------- 1 file changed, 1 insertion(+), 49 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_sse.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c index e3f4e67472..edf235cddc 100644 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ b/src/gallium/auxiliary/draw/draw_vs_sse.c @@ -47,9 +47,7 @@ #include "tgsi/util/tgsi_parse.h" #define SSE_MAX_VERTICES 4 -#define SSE_SWIZZLES 1 -#if SSE_SWIZZLES typedef void (XSTDCALL *codegen_function) ( const struct tgsi_exec_vector *input, /* 1 */ struct tgsi_exec_vector *output, /* 2 */ @@ -62,14 +60,6 @@ typedef void (XSTDCALL *codegen_function) ( float (*aos_output)[4], /* 9 */ uint num_outputs, /* 10 */ uint output_stride ); /* 11 */ -#else -typedef void (XSTDCALL *codegen_function) ( - const struct tgsi_exec_vector *input, - struct tgsi_exec_vector *output, - float (*constant)[4], - struct tgsi_exec_vector *temporary, - float (*immediates)[4] ); -#endif struct draw_sse_vertex_shader { struct draw_vertex_shader base; @@ -111,7 +101,6 @@ vs_sse_run_linear( struct draw_vertex_shader *base, for (i = 0; i < count; i += MAX_TGSI_VERTICES) { unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i); -#if SSE_SWIZZLES /* run compiled shader */ shader->func(machine->Inputs, @@ -128,43 +117,6 @@ vs_sse_run_linear( struct draw_vertex_shader *base, input = (const float (*)[4])((const char *)input + input_stride * max_vertices); output = (float (*)[4])((char *)output + output_stride * max_vertices); -#else - unsigned int j, slot; - - /* Swizzle inputs. - */ - for (j = 0; j < max_vertices; j++) { - for (slot = 0; slot < base->info.num_inputs; slot++) { - machine->Inputs[slot].xyzw[0].f[j] = input[slot][0]; - machine->Inputs[slot].xyzw[1].f[j] = input[slot][1]; - machine->Inputs[slot].xyzw[2].f[j] = input[slot][2]; - machine->Inputs[slot].xyzw[3].f[j] = input[slot][3]; - } - - input = (const float (*)[4])((const char *)input + input_stride); - } - - /* run compiled shader - */ - shader->func(machine->Inputs, - machine->Outputs, - (float (*)[4])constants, - machine->Temps, - shader->immediates); - - /* Unswizzle all output results. - */ - for (j = 0; j < max_vertices; j++) { - for (slot = 0; slot < base->info.num_outputs; slot++) { - output[slot][0] = machine->Outputs[slot].xyzw[0].f[j]; - output[slot][1] = machine->Outputs[slot].xyzw[1].f[j]; - output[slot][2] = machine->Outputs[slot].xyzw[2].f[j]; - output[slot][3] = machine->Outputs[slot].xyzw[3].f[j]; - } - - output = (float (*)[4])((char *)output + output_stride); - } -#endif } } @@ -211,7 +163,7 @@ draw_create_vs_sse(struct draw_context *draw, x86_init_func( &vs->sse2_program ); if (!tgsi_emit_sse2( (struct tgsi_token *) vs->base.state.tokens, - &vs->sse2_program, vs->immediates, SSE_SWIZZLES )) + &vs->sse2_program, vs->immediates, TRUE )) goto fail; vs->func = (codegen_function) x86_get_func( &vs->sse2_program ); -- cgit v1.2.3 From 2f0d1396e4c1626b3b1ac799bd29e86a9530369e Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Tue, 13 May 2008 13:40:22 +0100 Subject: draw: move some state into a new 'vs' area --- src/gallium/auxiliary/draw/draw_context.c | 21 ++++--------- src/gallium/auxiliary/draw/draw_pipe.h | 2 +- src/gallium/auxiliary/draw/draw_pipe_aaline.c | 2 +- src/gallium/auxiliary/draw/draw_pipe_aapoint.c | 4 +-- src/gallium/auxiliary/draw/draw_pipe_clip.c | 6 ++-- src/gallium/auxiliary/draw/draw_pipe_flatshade.c | 2 +- src/gallium/auxiliary/draw/draw_pipe_stipple.c | 2 +- src/gallium/auxiliary/draw/draw_pipe_twoside.c | 2 +- src/gallium/auxiliary/draw/draw_pipe_wide_point.c | 4 +-- src/gallium/auxiliary/draw/draw_private.h | 28 +++++++++++------ .../auxiliary/draw/draw_pt_fetch_shade_pipeline.c | 6 ++-- src/gallium/auxiliary/draw/draw_pt_middle_fse.c | 4 +-- src/gallium/auxiliary/draw/draw_vs.c | 35 +++++++++++++++++++--- src/gallium/auxiliary/draw/draw_vs_exec.c | 2 +- src/gallium/auxiliary/draw/draw_vs_sse.c | 2 +- 15 files changed, 75 insertions(+), 47 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_sse.c') diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c index 98e23fa830..2242074965 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -56,12 +56,6 @@ struct draw_context *draw_create( void ) draw->reduced_prim = ~0; /* != any of PIPE_PRIM_x */ - tgsi_exec_machine_init(&draw->machine); - - /* FIXME: give this machine thing a proper constructor: - */ - draw->machine.Inputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16); - draw->machine.Outputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16); if (!draw_pipeline_init( draw )) goto fail; @@ -69,6 +63,9 @@ struct draw_context *draw_create( void ) if (!draw_pt_init( draw )) goto fail; + if (!draw_vs_init( draw )) + goto fail; + return draw; fail: @@ -83,13 +80,6 @@ void draw_destroy( struct draw_context *draw ) return; - if (draw->machine.Inputs) - align_free(draw->machine.Inputs); - - if (draw->machine.Outputs) - align_free(draw->machine.Outputs); - - tgsi_exec_machine_free_data(&draw->machine); /* Not so fast -- we're just borrowing this at the moment. * @@ -99,6 +89,7 @@ void draw_destroy( struct draw_context *draw ) draw_pipeline_destroy( draw ); draw_pt_destroy( draw ); + draw_vs_destroy( draw ); FREE( draw ); } @@ -295,7 +286,7 @@ int draw_find_vs_output(struct draw_context *draw, uint semantic_name, uint semantic_index) { - const struct draw_vertex_shader *vs = draw->vertex_shader; + const struct draw_vertex_shader *vs = draw->vs.vertex_shader; uint i; for (i = 0; i < vs->info.num_outputs; i++) { if (vs->info.output_semantic_name[i] == semantic_name && @@ -320,7 +311,7 @@ draw_find_vs_output(struct draw_context *draw, uint draw_num_vs_outputs(struct draw_context *draw) { - uint count = draw->vertex_shader->info.num_outputs; + uint count = draw->vs.vertex_shader->info.num_outputs; if (draw->extra_vp_outputs.slot > 0) count++; return count; diff --git a/src/gallium/auxiliary/draw/draw_pipe.h b/src/gallium/auxiliary/draw/draw_pipe.h index f1cb0891ca..dbad8f98ac 100644 --- a/src/gallium/auxiliary/draw/draw_pipe.h +++ b/src/gallium/auxiliary/draw/draw_pipe.h @@ -116,7 +116,7 @@ dup_vert( struct draw_stage *stage, { struct vertex_header *tmp = stage->tmp[idx]; const uint vsize = sizeof(struct vertex_header) - + stage->draw->num_vs_outputs * 4 * sizeof(float); + + stage->draw->vs.num_vs_outputs * 4 * sizeof(float); memcpy(tmp, vert, vsize); tmp->vertex_id = UNDEFINED_VERTEX_ID; return tmp; diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c index f501b2aed4..d93708ad3c 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c +++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c @@ -651,7 +651,7 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header) } /* update vertex attrib info */ - aaline->tex_slot = draw->num_vs_outputs; + aaline->tex_slot = draw->vs.num_vs_outputs; assert(aaline->tex_slot > 0); /* output[0] is vertex pos */ /* advertise the extra post-transformed vertex attribute */ diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c index 122a48660a..97d74ad693 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c +++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c @@ -681,7 +681,7 @@ aapoint_first_point(struct draw_stage *stage, struct prim_header *header) bind_aapoint_fragment_shader(aapoint); /* update vertex attrib info */ - aapoint->tex_slot = draw->num_vs_outputs; + aapoint->tex_slot = draw->vs.num_vs_outputs; assert(aapoint->tex_slot > 0); /* output[0] is vertex pos */ draw->extra_vp_outputs.semantic_name = TGSI_SEMANTIC_GENERIC; @@ -692,7 +692,7 @@ aapoint_first_point(struct draw_stage *stage, struct prim_header *header) aapoint->psize_slot = -1; if (draw->rasterizer->point_size_per_vertex) { /* find PSIZ vertex output */ - const struct draw_vertex_shader *vs = draw->vertex_shader; + const struct draw_vertex_shader *vs = draw->vs.vertex_shader; uint i; for (i = 0; i < vs->info.num_outputs; i++) { if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_PSIZE) { diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c index ce80c94163..c11ed934a4 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_clip.c +++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c @@ -112,7 +112,7 @@ static void interp( const struct clipper *clip, const struct vertex_header *out, const struct vertex_header *in ) { - const unsigned nr_attrs = clip->stage.draw->num_vs_outputs; + const unsigned nr_attrs = clip->stage.draw->vs.num_vs_outputs; unsigned j; /* Vertex header. @@ -180,7 +180,7 @@ static void emit_poly( struct draw_stage *stage, header.flags |= edge_last; if (0) { - const struct draw_vertex_shader *vs = stage->draw->vertex_shader; + const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader; uint j, k; debug_printf("Clipped tri:\n"); for (j = 0; j < 3; j++) { @@ -425,7 +425,7 @@ clip_init_state( struct draw_stage *stage ) clipper->flat = stage->draw->rasterizer->flatshade ? TRUE : FALSE; if (clipper->flat) { - const struct draw_vertex_shader *vs = stage->draw->vertex_shader; + const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader; uint i; clipper->num_color_attribs = 0; diff --git a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c index 09b68c4559..21a9c3b77f 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c +++ b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c @@ -159,7 +159,7 @@ static void flatshade_line_1( struct draw_stage *stage, static void flatshade_init_state( struct draw_stage *stage ) { struct flat_stage *flat = flat_stage(stage); - const struct draw_vertex_shader *vs = stage->draw->vertex_shader; + const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader; uint i; /* Find which vertex shader outputs are colors, make a list */ diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c index 3cbced362e..4673d5dcba 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c +++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c @@ -71,7 +71,7 @@ screen_interp( struct draw_context *draw, const struct vertex_header *v1 ) { uint attr; - for (attr = 0; attr < draw->num_vs_outputs; attr++) { + for (attr = 0; attr < draw->vs.num_vs_outputs; attr++) { const float *val0 = v0->data[attr]; const float *val1 = v1->data[attr]; float *newv = dst->data[attr]; diff --git a/src/gallium/auxiliary/draw/draw_pipe_twoside.c b/src/gallium/auxiliary/draw/draw_pipe_twoside.c index 50872fdbe9..3ac825f565 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_twoside.c +++ b/src/gallium/auxiliary/draw/draw_pipe_twoside.c @@ -105,7 +105,7 @@ static void twoside_first_tri( struct draw_stage *stage, struct prim_header *header ) { struct twoside_stage *twoside = twoside_stage(stage); - const struct draw_vertex_shader *vs = stage->draw->vertex_shader; + const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader; uint i; twoside->attrib_front0 = 0; diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c index ed08573382..df92e3f2d0 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c +++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c @@ -197,7 +197,7 @@ static void widepoint_first_point( struct draw_stage *stage, if (draw->rasterizer->point_sprite) { /* find vertex shader texcoord outputs */ - const struct draw_vertex_shader *vs = draw->vertex_shader; + const struct draw_vertex_shader *vs = draw->vs.vertex_shader; uint i, j = 0; for (i = 0; i < vs->info.num_outputs; i++) { if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_GENERIC) { @@ -212,7 +212,7 @@ static void widepoint_first_point( struct draw_stage *stage, wide->psize_slot = -1; if (draw->rasterizer->point_size_per_vertex) { /* find PSIZ vertex output */ - const struct draw_vertex_shader *vs = draw->vertex_shader; + const struct draw_vertex_shader *vs = draw->vs.vertex_shader; uint i; for (i = 0; i < vs->info.num_outputs; i++) { if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_PSIZE) { diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index fd51a57781..3418ee2b88 100644 --- a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -169,13 +169,24 @@ struct draw_context /* pipe state that we need: */ const struct pipe_rasterizer_state *rasterizer; struct pipe_viewport_state viewport; + boolean identity_viewport; - struct draw_vertex_shader *vertex_shader; + struct { + struct draw_vertex_shader *vertex_shader; + uint num_vs_outputs; /**< convenience, from vertex_shader */ - boolean identity_viewport; - uint num_vs_outputs; /**< convenience, from vertex_shader */ + /** TGSI program interpreter runtime state */ + struct tgsi_exec_machine machine; + + /* This (and the tgsi_exec_machine struct) probably need to be moved somewhere private. + */ + struct gallivm_cpu_engine *engine; + + struct translate_cache *fetch_cache; + struct translate_cache *emit_cache; + } vs; /* Clip derived state: */ @@ -192,16 +203,15 @@ struct draw_context unsigned reduced_prim; - /** TGSI program interpreter runtime state */ - struct tgsi_exec_machine machine; - - /* This (and the tgsi_exec_machine struct) probably need to be moved somewhere private. - */ - struct gallivm_cpu_engine *engine; void *driver_private; }; +/******************************************************************************* + * Vertex shader code: + */ +boolean draw_vs_init( struct draw_context *draw ); +void draw_vs_destroy( struct draw_context *draw ); diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c index dad54690a5..06718779a5 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c @@ -55,7 +55,7 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle, { struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; struct draw_context *draw = fpme->draw; - struct draw_vertex_shader *vs = draw->vertex_shader; + struct draw_vertex_shader *vs = draw->vs.vertex_shader; /* Add one to num_outputs because the pipeline occasionally tags on * an additional texcoord, eg for AA lines. @@ -107,7 +107,7 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle, { struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; struct draw_context *draw = fpme->draw; - struct draw_vertex_shader *shader = draw->vertex_shader; + struct draw_vertex_shader *shader = draw->vs.vertex_shader; unsigned opt = fpme->opt; unsigned alloc_count = align_int( fetch_count, 4 ); @@ -183,7 +183,7 @@ static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle, { struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; struct draw_context *draw = fpme->draw; - struct draw_vertex_shader *shader = draw->vertex_shader; + struct draw_vertex_shader *shader = draw->vs.vertex_shader; unsigned opt = fpme->opt; unsigned alloc_count = align_int( count, 4 ); diff --git a/src/gallium/auxiliary/draw/draw_pt_middle_fse.c b/src/gallium/auxiliary/draw/draw_pt_middle_fse.c index cdb7d260da..643ea151c1 100644 --- a/src/gallium/auxiliary/draw/draw_pt_middle_fse.c +++ b/src/gallium/auxiliary/draw/draw_pt_middle_fse.c @@ -368,8 +368,8 @@ static void fse_prepare( struct draw_pt_middle_end *middle, { struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; struct draw_context *draw = fse->draw; - unsigned num_vs_inputs = draw->vertex_shader->info.num_inputs; - unsigned num_vs_outputs = draw->vertex_shader->info.num_outputs; + unsigned num_vs_inputs = draw->vs.vertex_shader->info.num_inputs; + unsigned num_vs_outputs = draw->vs.vertex_shader->info.num_outputs; const struct vertex_info *vinfo; unsigned i; boolean need_psize = 0; diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c index 03fe00a951..4142dd9589 100644 --- a/src/gallium/auxiliary/draw/draw_vs.c +++ b/src/gallium/auxiliary/draw/draw_vs.c @@ -66,13 +66,13 @@ draw_bind_vertex_shader(struct draw_context *draw, if (dvs) { - draw->vertex_shader = dvs; - draw->num_vs_outputs = dvs->info.num_outputs; + draw->vs.vertex_shader = dvs; + draw->vs.num_vs_outputs = dvs->info.num_outputs; dvs->prepare( dvs, draw ); } else { - draw->vertex_shader = NULL; - draw->num_vs_outputs = 0; + draw->vs.vertex_shader = NULL; + draw->vs.num_vs_outputs = 0; } } @@ -83,3 +83,30 @@ draw_delete_vertex_shader(struct draw_context *draw, { dvs->delete( dvs ); } + + + +boolean +draw_vs_init( struct draw_context *draw ) +{ + tgsi_exec_machine_init(&draw->vs.machine); + /* FIXME: give this machine thing a proper constructor: + */ + draw->vs.machine.Inputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16); + draw->vs.machine.Outputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16); + + return TRUE; +} + +void +draw_vs_destroy( struct draw_context *draw ) +{ + if (draw->vs.machine.Inputs) + align_free(draw->vs.machine.Inputs); + + if (draw->vs.machine.Outputs) + align_free(draw->vs.machine.Outputs); + + tgsi_exec_machine_free_data(&draw->vs.machine); + +} diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c index 7a02f6334b..cb80d008cd 100644 --- a/src/gallium/auxiliary/draw/draw_vs_exec.c +++ b/src/gallium/auxiliary/draw/draw_vs_exec.c @@ -182,7 +182,7 @@ draw_create_vs_exec(struct draw_context *draw, vs->base.prepare = vs_exec_prepare; vs->base.run_linear = vs_exec_run_linear; vs->base.delete = vs_exec_delete; - vs->machine = &draw->machine; + vs->machine = &draw->vs.machine; return &vs->base; } diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c index edf235cddc..13ad032bd3 100644 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ b/src/gallium/auxiliary/draw/draw_vs_sse.c @@ -158,7 +158,7 @@ draw_create_vs_sse(struct draw_context *draw, vs->base.prepare = vs_sse_prepare; vs->base.run_linear = vs_sse_run_linear; vs->base.delete = vs_sse_delete; - vs->machine = &draw->machine; + vs->machine = &draw->vs.machine; x86_init_func( &vs->sse2_program ); -- cgit v1.2.3 From 7c99d7fe60e7bb0b7cf103a851aeef4614278ca6 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Thu, 15 May 2008 12:39:08 +0100 Subject: draw: create specialized vs varients incorporating fetch & emit --- src/gallium/auxiliary/draw/Makefile | 3 +- src/gallium/auxiliary/draw/draw_private.h | 2 + .../auxiliary/draw/draw_pt_fetch_shade_emit.c | 338 +++++++++++++++++++++ src/gallium/auxiliary/draw/draw_vs.c | 83 ++++- src/gallium/auxiliary/draw/draw_vs.h | 105 +++++++ src/gallium/auxiliary/draw/draw_vs_exec.c | 2 + src/gallium/auxiliary/draw/draw_vs_llvm.c | 2 + src/gallium/auxiliary/draw/draw_vs_sse.c | 2 + src/gallium/auxiliary/draw/draw_vs_varient.c | 229 ++++++++++++++ 9 files changed, 764 insertions(+), 2 deletions(-) create mode 100644 src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c create mode 100644 src/gallium/auxiliary/draw/draw_vs_varient.c (limited to 'src/gallium/auxiliary/draw/draw_vs_sse.c') diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile index 3053682da8..84877994fb 100644 --- a/src/gallium/auxiliary/draw/Makefile +++ b/src/gallium/auxiliary/draw/Makefile @@ -26,7 +26,7 @@ C_SOURCES = \ draw_pt_emit.c \ draw_pt_fetch.c \ draw_pt_fetch_emit.c \ - draw_pt_middle_fse.c \ + draw_pt_fetch_shade_emit.c \ draw_pt_fetch_shade_pipeline.c \ draw_pt_post_vs.c \ draw_pt_util.c \ @@ -34,6 +34,7 @@ C_SOURCES = \ draw_pt_vcache.c \ draw_vertex.c \ draw_vs.c \ + draw_vs_varient.c \ draw_vs_exec.c \ draw_vs_llvm.c \ draw_vs_sse.c diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index 3418ee2b88..c095bf3d7b 100644 --- a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -184,7 +184,9 @@ struct draw_context struct gallivm_cpu_engine *engine; + struct translate *fetch; struct translate_cache *fetch_cache; + struct translate *emit; struct translate_cache *emit_cache; } vs; diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c new file mode 100644 index 0000000000..74945dcfe9 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c @@ -0,0 +1,338 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell + */ + + +#include "pipe/p_util.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_vbuf.h" +#include "draw/draw_vertex.h" +#include "draw/draw_pt.h" +#include "draw/draw_vs.h" + +#include "translate/translate.h" + +struct fetch_shade_emit; + + +/* Prototype fetch, shade, emit-hw-verts all in one go. + */ +struct fetch_shade_emit { + struct draw_pt_middle_end base; + struct draw_context *draw; + + + /* Temporaries: + */ + const float *constants; + unsigned pitch[PIPE_MAX_ATTRIBS]; + const ubyte *src[PIPE_MAX_ATTRIBS]; + unsigned prim; + + struct draw_vs_varient_key key; + struct draw_vs_varient *active; +}; + + + + +static void fse_prepare( struct draw_pt_middle_end *middle, + unsigned prim, + unsigned opt ) +{ + struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; + struct draw_context *draw = fse->draw; + unsigned num_vs_inputs = draw->vs.vertex_shader->info.num_inputs; + unsigned num_vs_outputs = draw->vs.vertex_shader->info.num_outputs; + const struct vertex_info *vinfo; + unsigned i; + boolean need_psize = 0; + + + if (draw->pt.user.elts) { + assert(0); + return ; + } + + if (!draw->render->set_primitive( draw->render, + prim )) { + assert(0); + return; + } + + /* Must do this after set_primitive() above: + */ + vinfo = draw->render->get_vertex_info(draw->render); + + + + fse->key.nr_elements = MAX2(num_vs_outputs, /* outputs - translate to hw format */ + num_vs_inputs); /* inputs - fetch from api format */ + + fse->key.output_stride = vinfo->size * 4; + memset(fse->key.element, 0, + fse->key.nr_elements * sizeof(fse->key.element[0])); + + for (i = 0; i < num_vs_inputs; i++) { + const struct pipe_vertex_element *src = &draw->pt.vertex_element[i]; + fse->key.element[i].in.format = src->src_format; + + /* Consider ignoring these, ie make generated programs + * independent of this state: + */ + fse->key.element[i].in.buffer = src->vertex_buffer_index; + fse->key.element[i].in.offset = src->src_offset; + } + + + { + unsigned dst_offset = 0; + + for (i = 0; i < vinfo->num_attribs; i++) { + unsigned emit_sz = 0; + unsigned output_format = PIPE_FORMAT_NONE; + unsigned vs_output = vinfo->src_index[i]; + + switch (vinfo->emit[i]) { + case EMIT_4F: + output_format = PIPE_FORMAT_R32G32B32A32_FLOAT; + emit_sz = 4 * sizeof(float); + break; + case EMIT_3F: + output_format = PIPE_FORMAT_R32G32B32_FLOAT; + emit_sz = 3 * sizeof(float); + break; + case EMIT_2F: + output_format = PIPE_FORMAT_R32G32_FLOAT; + emit_sz = 2 * sizeof(float); + break; + case EMIT_1F: + output_format = PIPE_FORMAT_R32_FLOAT; + emit_sz = 1 * sizeof(float); + break; + case EMIT_1F_PSIZE: + need_psize = 1; + output_format = PIPE_FORMAT_R32_FLOAT; + emit_sz = 1 * sizeof(float); + vs_output = num_vs_outputs + 1; + + break; + default: + assert(0); + break; + } + + /* The elements in the key correspond to vertex shader output + * numbers, not to positions in the hw vertex description -- + * that's handled by the output_offset field. + */ + fse->key.element[vs_output].out.format = output_format; + fse->key.element[vs_output].out.offset = dst_offset; + + dst_offset += emit_sz; + assert(fse->key.output_stride >= dst_offset); + } + } + + /* To make psize work, really need to tell the vertex shader to + * copy that value from input->output. For 'translate' this was + * implicit for all elements. + */ +#if 0 + if (need_psize) { + unsigned input = num_vs_inputs + 1; + const struct pipe_vertex_element *src = &draw->pt.vertex_element[i]; + fse->key.element[i].input_format = PIPE_FORMAT_R32_FLOAT; + fse->key.element[i].input_buffer = 0; //nr_buffers + 1; + fse->key.element[i].input_offset = 0; + + fse->key.nr_elements += 1; + + } +#endif + + /* Would normally look up a vertex shader and peruse its list of + * varients somehow. We omitted that step and put all the + * hardcoded "shaders" into an array. We're just making the + * assumption that this happens to be a matching shader... ie + * you're running isosurf, aren't you? + */ + fse->active = draw_vs_lookup_varient( draw->vs.vertex_shader, + &fse->key ); + + if (!fse->active) { + assert(0); + return ; + } + + /* Now set buffer pointers: + */ + for (i = 0; i < num_vs_inputs; i++) { + unsigned buf = draw->pt.vertex_element[i].vertex_buffer_index; + + fse->active->set_input( fse->active, + i, + + ((const ubyte *) draw->pt.user.vbuffer[buf] + + draw->pt.vertex_buffer[buf].buffer_offset), + + draw->pt.vertex_buffer[buf].pitch ); + } + + fse->active->set_constants( fse->active, + (const float (*)[4])draw->pt.user.constants ); + + //return TRUE; +} + + + + + + + +static void fse_run_linear( struct draw_pt_middle_end *middle, + unsigned start, + unsigned count ) +{ + struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; + struct draw_context *draw = fse->draw; + unsigned alloc_count = align(count, 4); + char *hw_verts; + + /* XXX: need to flush to get prim_vbuf.c to release its allocation?? + */ + draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + + hw_verts = draw->render->allocate_vertices( draw->render, + (ushort)fse->key.output_stride, + (ushort)alloc_count ); + + if (!hw_verts) { + assert(0); + return; + } + + /* Single routine to fetch vertices, run shader and emit HW verts. + * Clipping and viewport transformation are done elsewhere -- + * either by the API or on hardware, or for some other reason not + * required... + */ + fse->active->run_linear( fse->active, + start, count, + hw_verts ); + + /* Draw arrays path to avoid re-emitting index list again and + * again. + */ + draw->render->draw_arrays( draw->render, + 0, + count ); + + + draw->render->release_vertices( draw->render, + hw_verts, + fse->key.output_stride, + count ); +} + + +static void +fse_run(struct draw_pt_middle_end *middle, + const unsigned *fetch_elts, + unsigned fetch_count, + const ushort *draw_elts, + unsigned draw_count ) +{ + struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; + struct draw_context *draw = fse->draw; + unsigned alloc_count = align(fetch_count, 4); + void *hw_verts; + + /* XXX: need to flush to get prim_vbuf.c to release its allocation?? + */ + draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + + hw_verts = draw->render->allocate_vertices( draw->render, + (ushort)fse->key.output_stride, + (ushort)alloc_count ); + if (!hw_verts) { + assert(0); + return; + } + + + /* Single routine to fetch vertices, run shader and emit HW verts. + */ + fse->active->run_elts( fse->active, + fetch_elts, + fetch_count, + hw_verts ); + + draw->render->draw( draw->render, + draw_elts, + draw_count ); + + draw->render->release_vertices( draw->render, + hw_verts, + fse->key.output_stride, + fetch_count ); + +} + + +static void fse_finish( struct draw_pt_middle_end *middle ) +{ +} + + +static void +fse_destroy( struct draw_pt_middle_end *middle ) +{ + FREE(middle); +} + +struct draw_pt_middle_end *draw_pt_middle_fse( struct draw_context *draw ) +{ + struct fetch_shade_emit *fse = CALLOC_STRUCT(fetch_shade_emit); + if (!fse) + return NULL; + + fse->base.prepare = fse_prepare; + fse->base.run = fse_run; + fse->base.run_linear = fse_run_linear; + fse->base.finish = fse_finish; + fse->base.destroy = fse_destroy; + fse->draw = draw; + + return &fse->base; +} diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c index 4142dd9589..9b899d404e 100644 --- a/src/gallium/auxiliary/draw/draw_vs.c +++ b/src/gallium/auxiliary/draw/draw_vs.c @@ -36,6 +36,8 @@ #include "draw_private.h" #include "draw_context.h" #include "draw_vs.h" +#include "translate/translate.h" +#include "translate/translate_cache.h" @@ -90,11 +92,25 @@ boolean draw_vs_init( struct draw_context *draw ) { tgsi_exec_machine_init(&draw->vs.machine); + /* FIXME: give this machine thing a proper constructor: */ draw->vs.machine.Inputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16); - draw->vs.machine.Outputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16); + if (!draw->vs.machine.Inputs) + return FALSE; + draw->vs.machine.Outputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16); + if (!draw->vs.machine.Outputs) + return FALSE; + + draw->vs.emit_cache = translate_cache_create(); + if (!draw->vs.emit_cache) + return FALSE; + + draw->vs.fetch_cache = translate_cache_create(); + if (!draw->vs.fetch_cache) + return FALSE; + return TRUE; } @@ -107,6 +123,71 @@ draw_vs_destroy( struct draw_context *draw ) if (draw->vs.machine.Outputs) align_free(draw->vs.machine.Outputs); + if (draw->vs.fetch_cache) + translate_cache_destroy(draw->vs.fetch_cache); + + if (draw->vs.emit_cache) + translate_cache_destroy(draw->vs.emit_cache); + tgsi_exec_machine_free_data(&draw->vs.machine); } + + +struct draw_vs_varient * +draw_vs_lookup_varient( struct draw_vertex_shader *vs, + const struct draw_vs_varient_key *key ) +{ + struct draw_vs_varient *varient; + unsigned i; + + /* Lookup existing varient: + */ + for (i = 0; i < vs->nr_varients; i++) + if (draw_vs_varient_key_compare(key, &vs->varient[i]->key) == 0) + return vs->varient[i]; + + /* Else have to create a new one: + */ + varient = vs->create_varient( vs, key ); + if (varient == NULL) + return NULL; + + /* Add it to our list: + */ + assert(vs->nr_varients < Elements(vs->varient)); + vs->varient[vs->nr_varients++] = varient; + + /* Done + */ + return varient; +} + + +struct translate * +draw_vs_get_fetch( struct draw_context *draw, + struct translate_key *key ) +{ + if (!draw->vs.fetch || + translate_key_compare(&draw->vs.fetch->key, key) != 0) + { + translate_key_sanitize(key); + draw->vs.fetch = translate_cache_find(draw->vs.fetch_cache, key); + } + + return draw->vs.fetch; +} + +struct translate * +draw_vs_get_emit( struct draw_context *draw, + struct translate_key *key ) +{ + if (!draw->vs.emit || + translate_key_compare(&draw->vs.emit->key, key) != 0) + { + translate_key_sanitize(key); + draw->vs.emit = translate_cache_find(draw->vs.emit_cache, key); + } + + return draw->vs.emit; +} diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h index f9772b83b8..677be0d28d 100644 --- a/src/gallium/auxiliary/draw/draw_vs.h +++ b/src/gallium/auxiliary/draw/draw_vs.h @@ -38,10 +38,63 @@ struct draw_context; struct pipe_shader_state; +struct draw_vs_input +{ + enum pipe_format format; + unsigned buffer; + unsigned offset; +}; + +struct draw_vs_output +{ + enum pipe_format format; + unsigned offset; +}; + +struct draw_vs_element { + struct draw_vs_input in; + struct draw_vs_output out; +}; + +struct draw_vs_varient_key { + unsigned output_stride; + unsigned nr_elements; + struct draw_vs_element element[PIPE_MAX_ATTRIBS]; +}; + +struct draw_vs_varient { + struct draw_vs_varient_key key; + + struct draw_vertex_shader *vs; + + void (*set_input)( struct draw_vs_varient *, + unsigned i, + const void *ptr, + unsigned stride ); + + void (*set_constants)( struct draw_vs_varient *, + const float (*constants)[4] ); + + + void (*run_linear)( struct draw_vs_varient *shader, + unsigned start, + unsigned count, + void *output_buffer ); + + void (*run_elts)( struct draw_vs_varient *shader, + const unsigned *elts, + unsigned count, + void *output_buffer ); + + void (*destroy)( struct draw_vs_varient * ); +}; + + /** * Private version of the compiled vertex_shader */ struct draw_vertex_shader { + struct draw_context *draw; /* This member will disappear shortly: */ @@ -49,6 +102,14 @@ struct draw_vertex_shader { struct tgsi_shader_info info; + /* + */ + struct draw_vs_varient *varient[16]; + unsigned nr_varients; + struct draw_vs_varient *(*create_varient)( struct draw_vertex_shader *shader, + const struct draw_vs_varient_key *key ); + + void (*prepare)( struct draw_vertex_shader *shader, struct draw_context *draw ); @@ -68,6 +129,15 @@ struct draw_vertex_shader { }; +struct draw_vs_varient * +draw_vs_lookup_varient( struct draw_vertex_shader *base, + const struct draw_vs_varient_key *key ); + + +/******************************************************************************** + * Internal functions: + */ + struct draw_vertex_shader * draw_create_vs_exec(struct draw_context *draw, const struct pipe_shader_state *templ); @@ -80,8 +150,43 @@ struct draw_vertex_shader * draw_create_vs_llvm(struct draw_context *draw, const struct pipe_shader_state *templ); +/******************************************************************************** + * Helpers for vs implementations that don't do their own fetch/emit varients. + * Means these can be shared between shaders. + */ +struct translate; +struct translate_key; + +struct translate *draw_vs_get_fetch( struct draw_context *draw, + struct translate_key *key ); + + +struct translate *draw_vs_get_emit( struct draw_context *draw, + struct translate_key *key ); + +struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs, + const struct draw_vs_varient_key *key ); + + + +static INLINE int draw_vs_varient_keysize( const struct draw_vs_varient_key *key ) +{ + return 2 * sizeof(int) + key->nr_elements * sizeof(struct draw_vs_element); +} + +static INLINE int draw_vs_varient_key_compare( const struct draw_vs_varient_key *a, + const struct draw_vs_varient_key *b ) +{ + int keysize = draw_vs_varient_keysize(a); + return memcmp(a, b, keysize); +} + + + + #define MAX_TGSI_VERTICES 4 + #endif diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c index cb80d008cd..4501877efc 100644 --- a/src/gallium/auxiliary/draw/draw_vs_exec.c +++ b/src/gallium/auxiliary/draw/draw_vs_exec.c @@ -179,9 +179,11 @@ draw_create_vs_exec(struct draw_context *draw, tgsi_scan_shader(state->tokens, &vs->base.info); + vs->base.draw = draw; vs->base.prepare = vs_exec_prepare; vs->base.run_linear = vs_exec_run_linear; vs->base.delete = vs_exec_delete; + vs->base.create_varient = draw_vs_varient_generic; vs->machine = &draw->vs.machine; return &vs->base; diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c index 171da51dd5..621472ec7c 100644 --- a/src/gallium/auxiliary/draw/draw_vs_llvm.c +++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c @@ -114,7 +114,9 @@ draw_create_vs_llvm(struct draw_context *draw, tgsi_scan_shader(vs->base.state.tokens, &vs->base.info); + vs->base.draw = draw; vs->base.prepare = vs_llvm_prepare; + vs->base.create_varient = draw_vs_varient_generic; vs->base.run_linear = vs_llvm_run_linear; vs->base.delete = vs_llvm_delete; vs->machine = &draw->machine; diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c index 13ad032bd3..df94a7e0c7 100644 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ b/src/gallium/auxiliary/draw/draw_vs_sse.c @@ -155,6 +155,8 @@ draw_create_vs_sse(struct draw_context *draw, tgsi_scan_shader(templ->tokens, &vs->base.info); + vs->base.draw = draw; + vs->base.create_varient = draw_vs_varient_generic; vs->base.prepare = vs_sse_prepare; vs->base.run_linear = vs_sse_run_linear; vs->base.delete = vs_sse_delete; diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c new file mode 100644 index 0000000000..d27b0f6187 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_varient.c @@ -0,0 +1,229 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell + */ + +#include "pipe/p_util.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_vbuf.h" +#include "draw/draw_vertex.h" +#include "draw/draw_vs.h" +#include "translate/translate.h" +#include "translate/translate_cache.h" + +/* A first pass at incorporating vertex fetch/emit functionality into + */ +struct draw_vs_varient_generic { + struct draw_vs_varient base; + + + + struct draw_vertex_shader *shader; + struct draw_context *draw; + + /* Basic plan is to run these two translate functions before/after + * the vertex shader's existing run_linear() routine to simulate + * the inclusion of this functionality into the shader... + * + * Next will look at actually including it. + */ + struct translate *fetch; + struct translate *emit; + + const float (*constants)[4]; +}; + + + + +static void vsvg_set_constants( struct draw_vs_varient *varient, + const float (*constants)[4] ) +{ + struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient; + + vsvg->constants = constants; +} + + +static void vsvg_set_input( struct draw_vs_varient *varient, + unsigned buffer, + const void *ptr, + unsigned stride ) +{ + struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient; + + vsvg->fetch->set_buffer(vsvg->fetch, + buffer, + ptr, + stride); +} + + +static void vsvg_run_elts( struct draw_vs_varient *varient, + const unsigned *elts, + unsigned count, + void *output_buffer) +{ + struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient; + + /* Want to do this in small batches for cache locality? + */ + + vsvg->fetch->run_elts( vsvg->fetch, + elts, + count, + output_buffer ); + + //if (!vsvg->base.vs->is_passthrough) + { + vsvg->base.vs->run_linear( vsvg->base.vs, + output_buffer, + output_buffer, + vsvg->constants, + count, + vsvg->base.key.output_stride, + vsvg->base.key.output_stride); + + //if (!vsvg->already_in_emit_format) + + vsvg->emit->set_buffer( vsvg->emit, + 0, + output_buffer, + vsvg->base.key.output_stride ); + + + vsvg->emit->run( vsvg->emit, + 0, count, + output_buffer ); + } +} + + +static void vsvg_run_linear( struct draw_vs_varient *varient, + unsigned start, + unsigned count, + void *output_buffer ) +{ + struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient; + + //debug_printf("%s %d %d\n", __FUNCTION__, start, count); + + + vsvg->fetch->run( vsvg->fetch, + start, + count, + output_buffer ); + + //if (!vsvg->base.vs->is_passthrough) + { + vsvg->base.vs->run_linear( vsvg->base.vs, + output_buffer, + output_buffer, + vsvg->constants, + count, + vsvg->base.key.output_stride, + vsvg->base.key.output_stride); + + //if (!vsvg->already_in_emit_format) + vsvg->emit->set_buffer( vsvg->emit, + 0, + output_buffer, + vsvg->base.key.output_stride ); + + + vsvg->emit->run( vsvg->emit, + 0, count, + output_buffer ); + } +} + + + +static void vsvg_destroy( struct draw_vs_varient *varient ) +{ + FREE(varient); +} + + +struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs, + const struct draw_vs_varient_key *key ) +{ + unsigned i; + struct translate_key fetch, emit; + + struct draw_vs_varient_generic *vsvg = CALLOC_STRUCT( draw_vs_varient_generic ); + if (vsvg == NULL) + return NULL; + + vsvg->base.key = *key; + vsvg->base.vs = vs; + vsvg->base.set_input = vsvg_set_input; + vsvg->base.set_constants = vsvg_set_constants; + vsvg->base.run_elts = vsvg_run_elts; + vsvg->base.run_linear = vsvg_run_linear; + vsvg->base.destroy = vsvg_destroy; + + + + /* OK, have to build a new one: + */ + fetch.nr_elements = vs->info.num_inputs; + fetch.output_stride = 0; + for (i = 0; i < vs->info.num_inputs; i++) { + fetch.element[i].input_format = key->element[i].in.format; + fetch.element[i].input_buffer = key->element[i].in.buffer; + fetch.element[i].input_offset = key->element[i].in.offset; + fetch.element[i].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT; + fetch.element[i].output_offset = fetch.output_stride; + fetch.output_stride += 4 * sizeof(float); + } + + + emit.nr_elements = vs->info.num_outputs; + emit.output_stride = key->output_stride; + for (i = 0; i < vs->info.num_outputs; i++) { + emit.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT; + emit.element[i].input_buffer = 0; + emit.element[i].input_offset = i * 4 * sizeof(float); + emit.element[i].output_format = key->element[i].out.format; + emit.element[i].output_offset = key->element[i].out.offset; + } + + vsvg->fetch = draw_vs_get_fetch( vs->draw, &fetch ); + vsvg->emit = draw_vs_get_emit( vs->draw, &emit ); + + return &vsvg->base; +} + + + + + -- cgit v1.2.3 From 1ba10e5ccf5cd0c990922e982e1e9bc6be48a5e4 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Wed, 21 May 2008 09:44:16 +0100 Subject: draw: add aos vertex shader varient --- src/gallium/auxiliary/draw/Makefile | 2 + src/gallium/auxiliary/draw/draw_vs.h | 10 + src/gallium/auxiliary/draw/draw_vs_aos.c | 1739 +++++++++++++++++++++++++++ src/gallium/auxiliary/draw/draw_vs_aos.h | 181 +++ src/gallium/auxiliary/draw/draw_vs_aos_io.c | 314 +++++ src/gallium/auxiliary/draw/draw_vs_sse.c | 1 + 6 files changed, 2247 insertions(+) create mode 100644 src/gallium/auxiliary/draw/draw_vs_aos.c create mode 100644 src/gallium/auxiliary/draw/draw_vs_aos.h create mode 100644 src/gallium/auxiliary/draw/draw_vs_aos_io.c (limited to 'src/gallium/auxiliary/draw/draw_vs_sse.c') diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile index 84877994fb..9a88ecc070 100644 --- a/src/gallium/auxiliary/draw/Makefile +++ b/src/gallium/auxiliary/draw/Makefile @@ -35,6 +35,8 @@ C_SOURCES = \ draw_vertex.c \ draw_vs.c \ draw_vs_varient.c \ + draw_vs_aos.c \ + draw_vs_aos_io.c \ draw_vs_exec.c \ draw_vs_llvm.c \ draw_vs_sse.c diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h index 6bfc2c8d75..5a8d0da06d 100644 --- a/src/gallium/auxiliary/draw/draw_vs.h +++ b/src/gallium/auxiliary/draw/draw_vs.h @@ -162,6 +162,16 @@ struct draw_vertex_shader * draw_create_vs_llvm(struct draw_context *draw, const struct pipe_shader_state *templ); + + +struct draw_vs_varient_key; +struct draw_vertex_shader; + +struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs, + const struct draw_vs_varient_key *key ); + + + /******************************************************************************** * Helpers for vs implementations that don't do their own fetch/emit varients. * Means these can be shared between shaders. diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c new file mode 100644 index 0000000000..620f5e3592 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -0,0 +1,1739 @@ +/* + * Mesa 3-D graphics library + * Version: 6.3 + * + * Copyright (C) 1999-2004 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code + * using the rtasm runtime assembler. Based on the old + * t_vb_arb_program_sse.c + */ + + +#include "pipe/p_util.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/util/tgsi_parse.h" +#include "tgsi/util/tgsi_util.h" +#include "tgsi/exec/tgsi_exec.h" +#include "tgsi/util/tgsi_dump.h" + +#include "draw_vs.h" +#include "draw_vs_aos.h" + +#include "rtasm/rtasm_x86sse.h" + +#ifdef PIPE_ARCH_X86 + + +#define DISASSEM 0 + + + + + +static INLINE boolean eq( struct x86_reg a, + struct x86_reg b ) +{ + return (a.file == b.file && + a.idx == b.idx && + a.mod == b.mod && + a.disp == b.disp); +} + + +static struct x86_reg get_reg_ptr(struct aos_compilation *cp, + unsigned file, + unsigned idx ) +{ + struct x86_reg ptr = cp->machine_EDX; + + switch (file) { + case TGSI_FILE_INPUT: + return x86_make_disp(ptr, Offset(struct aos_machine, input[idx])); + + case TGSI_FILE_OUTPUT: + return x86_make_disp(ptr, Offset(struct aos_machine, output[idx])); + + case TGSI_FILE_TEMPORARY: + return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx])); + + case TGSI_FILE_IMMEDIATE: + return x86_make_disp(ptr, Offset(struct aos_machine, immediate[idx])); + + case TGSI_FILE_CONSTANT: + return x86_make_disp(ptr, Offset(struct aos_machine, constant[idx])); + + case AOS_FILE_INTERNAL: + return x86_make_disp(ptr, Offset(struct aos_machine, immediate[idx])); + + default: + ERROR(cp, "unknown reg file"); + return x86_make_reg(0,0); + } +} + + +struct x86_reg aos_get_internal( struct aos_compilation *cp, + unsigned imm ) +{ + return get_reg_ptr( cp, + AOS_FILE_INTERNAL, + imm + 1 ); +} + +static void spill( struct aos_compilation *cp, unsigned idx ) +{ + if (!cp->xmm[idx].dirty || + (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */ + cp->xmm[idx].file != TGSI_FILE_OUTPUT && + cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) { + ERROR(cp, "invalid spill"); + return; + } + else { + struct x86_reg oldval = get_reg_ptr(cp, + cp->xmm[idx].file, + cp->xmm[idx].idx); + + assert(cp->xmm[idx].dirty); + sse_movups(cp->func, oldval, x86_make_reg(file_XMM, idx)); + cp->xmm[idx].dirty = 0; + } +} + +struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ) +{ + unsigned i; + unsigned oldest = 0; + + for (i = 0; i < 8; i++) + if (cp->xmm[i].last_used < cp->xmm[oldest].last_used) + oldest = i; + + /* Need to write out the old value? + */ + if (cp->xmm[oldest].dirty) + spill(cp, oldest); + + assert(cp->xmm[oldest].last_used != cp->insn_counter); + + cp->xmm[oldest].file = TGSI_FILE_NULL; + cp->xmm[oldest].idx = 0; + cp->xmm[oldest].last_used = cp->insn_counter; + return x86_make_reg(file_XMM, oldest); +} + +void aos_release_xmm_reg( struct aos_compilation *cp, + unsigned idx ) +{ + cp->xmm[idx].file = TGSI_FILE_NULL; + cp->xmm[idx].idx = 0; + cp->xmm[idx].dirty = 0; + cp->xmm[idx].last_used = 0; +} + +static void invalidate_xmm( struct aos_compilation *cp, + unsigned file, unsigned idx ) +{ + unsigned i; + + /* Invalidate any old copy of this register in XMM0-7. + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) { + + if (cp->xmm[i].dirty) + spill(cp, i); + + aos_release_xmm_reg(cp, i); + break; + } + } + + for (; i < 8; i++) { + if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) { + assert(0); + } + } +} + + +void aos_adopt_xmm_reg( struct aos_compilation *cp, + struct x86_reg reg, + unsigned file, + unsigned idx, + unsigned dirty ) +{ + if (reg.file != file_XMM) { + assert(0); + return; + } + + invalidate_xmm(cp, file, idx); + cp->xmm[reg.idx].file = file; + cp->xmm[reg.idx].idx = idx; + cp->xmm[reg.idx].dirty = dirty; +} + + + +static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp, + unsigned file, + unsigned idx ) +{ + invalidate_xmm( cp, file, idx ); + return get_reg_ptr( cp, file, idx ); +} + + +/* As above, but return a pointer. Note - this pointer may alias + * those returned by get_arg_ptr(). + */ +static struct x86_reg get_dst_ptr( struct aos_compilation *cp, + const struct tgsi_full_dst_register *dst ) +{ + return aos_get_shader_reg_ptr( cp, dst->DstRegister.File, dst->DstRegister.Index ); +} + + + + + +/* Return an XMM reg if the argument is resident, otherwise return a + * base+offset pointer to the saved value. + */ +struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, + unsigned file, + unsigned idx ) +{ + unsigned i; + + for (i = 0; i < 8; i++) { + if (cp->xmm[i].file == file && + cp->xmm[i].idx == idx) + { + cp->xmm[i].last_used = cp->insn_counter; + return x86_make_reg(file_XMM, i); + } + } + + /* If not found in the XMM register file, return an indirect + * reference to the in-memory copy: + */ + return get_reg_ptr( cp, file, idx ); +} + + + + + +/* Emulate pshufd insn in regular SSE, if necessary: + */ +static void emit_pshufd( struct aos_compilation *cp, + struct x86_reg dst, + struct x86_reg arg0, + ubyte shuf ) +{ + if (cp->have_sse2) { + sse2_pshufd(cp->func, dst, arg0, shuf); + } + else { + if (!eq(dst, arg0)) + sse_movups(cp->func, dst, arg0); + + sse_shufps(cp->func, dst, dst, shuf); + } +} + + + + +/* Helper for writemask: + */ +static boolean emit_shuf_copy1( struct aos_compilation *cp, + struct x86_reg dst, + struct x86_reg arg0, + struct x86_reg arg1, + ubyte shuf ) +{ + struct x86_reg tmp = aos_get_xmm_reg(cp); + sse_movups(cp->func, dst, arg1); + emit_pshufd(cp, dst, dst, shuf); + emit_pshufd(cp, tmp, arg0, shuf); + + sse_movss(cp->func, dst, tmp); + + emit_pshufd(cp, dst, dst, shuf); + + aos_release_xmm_reg(cp, tmp.idx); + return TRUE; +} + + +/* Helper for writemask: + */ +static boolean emit_shuf_copy2( struct aos_compilation *cp, + struct x86_reg dst, + struct x86_reg arg0, + struct x86_reg arg1, + ubyte shuf ) +{ + struct x86_reg tmp = aos_get_xmm_reg(cp); + emit_pshufd(cp, dst, arg1, shuf); + emit_pshufd(cp, tmp, arg0, shuf); + + sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W)); + + emit_pshufd(cp, dst, dst, shuf); + + aos_release_xmm_reg(cp, tmp.idx); + return TRUE; +} + +#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6)) + + +/* Locate a source register and perform any required (simple) swizzle. + * + * Just fail on complex swizzles at this point. + */ +static struct x86_reg fetch_src( struct aos_compilation *cp, + const struct tgsi_full_src_register *src ) +{ + struct x86_reg arg0 = aos_get_shader_reg(cp, + src->SrcRegister.File, + src->SrcRegister.Index); + unsigned i; + unsigned swz = 0; + unsigned negs = 0; + unsigned abs = 0; + + for (i = 0; i < 4; i++) { + unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, i ); + unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i ); + + switch (swizzle) { + case TGSI_EXTSWIZZLE_ZERO: + case TGSI_EXTSWIZZLE_ONE: + ERROR(cp, "not supporting full swizzles yet in tgsi_aos_sse2"); + break; + + default: + swz |= (swizzle & 0x3) << (i * 2); + break; + } + + switch (neg) { + case TGSI_UTIL_SIGN_TOGGLE: + negs |= (1<func, dst, arg0); + + aos_release_xmm_reg(cp, tmp.idx); + arg0 = dst; + } + + if (abs && abs != 0xf) { + ERROR(cp, "unsupported partial abs"); + } + + if (abs) { + struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); + struct x86_reg tmp = aos_get_xmm_reg(cp); + + sse_movups(cp->func, tmp, arg0); + sse_mulps(cp->func, tmp, neg); + sse_maxps(cp->func, dst, arg0); + + aos_release_xmm_reg(cp, tmp.idx); + arg0 = dst; + } + } + + return arg0; +} + +static void x87_fld_src( struct aos_compilation *cp, + const struct tgsi_full_src_register *src, + unsigned channel ) +{ + struct x86_reg arg0 = aos_get_shader_reg_ptr(cp, + src->SrcRegister.File, + src->SrcRegister.Index); + + unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, channel ); + unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel ); + + switch (swizzle) { + case TGSI_EXTSWIZZLE_ZERO: + x87_fldz( cp->func ); + break; + + case TGSI_EXTSWIZZLE_ONE: + x87_fld1( cp->func ); + break; + + default: + x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) ); + break; + } + + + switch (neg) { + case TGSI_UTIL_SIGN_TOGGLE: + /* Flip the sign: + */ + x87_fchs( cp->func ); + break; + + case TGSI_UTIL_SIGN_KEEP: + break; + + case TGSI_UTIL_SIGN_CLEAR: + x87_fabs( cp->func ); + break; + + case TGSI_UTIL_SIGN_SET: + x87_fabs( cp->func ); + x87_fchs( cp->func ); + break; + + default: + ERROR(cp, "unsupported sign-mode"); + break; + } +} + + + + + + +/* Used to implement write masking. This and most of the other instructions + * here would be easier to implement if there had been a translation + * to a 2 argument format (dst/arg0, arg1) at the shader level before + * attempting to translate to x86/sse code. + */ +static void store_dest( struct aos_compilation *cp, + const struct tgsi_full_dst_register *reg, + struct x86_reg result ) +{ + if (reg->DstRegister.WriteMask == 0) + { + return; + } + else if (reg->DstRegister.WriteMask == TGSI_WRITEMASK_XYZW) + { + if (result.file == file_XMM) { + aos_adopt_xmm_reg(cp, + result, + reg->DstRegister.File, + reg->DstRegister.Index, + TRUE); + } + else { + struct x86_reg dst = aos_get_xmm_reg(cp); + aos_adopt_xmm_reg(cp, + dst, + reg->DstRegister.File, + reg->DstRegister.Index, + TRUE); + sse_movups(cp->func, dst, result); + } + } + else + { + /* Previous value of the dest register: + */ + struct x86_reg old_dst = aos_get_shader_reg(cp, + reg->DstRegister.File, + reg->DstRegister.Index); + + + /* Alloc an xmm reg to hold the new value of the dest register: + */ + struct x86_reg dst = aos_get_xmm_reg(cp); + + aos_adopt_xmm_reg(cp, + dst, + reg->DstRegister.File, + reg->DstRegister.Index, + TRUE ); + + switch (reg->DstRegister.WriteMask) { + case TGSI_WRITEMASK_X: + if (result.file == file_XMM) { + sse_movups(cp->func, dst, old_dst); + sse_movss(cp->func, dst, result); + } + else { + struct x86_reg tmp = aos_get_xmm_reg(cp); + sse_movups(cp->func, dst, old_dst); + sse_movss(cp->func, tmp, result); + sse_movss(cp->func, dst, tmp); + aos_release_xmm_reg(cp, tmp.idx); + } + break; + + case TGSI_WRITEMASK_XY: + sse_movups(cp->func, dst, old_dst); + sse_shufps(cp->func, dst, result, SHUF(X, Y, Z, W)); + break; + + case TGSI_WRITEMASK_ZW: + sse_movups(cp->func, dst, result); + sse_shufps(cp->func, dst, old_dst, SHUF(X, Y, Z, W)); + break; + + case TGSI_WRITEMASK_YZW: + if (old_dst.file == file_XMM) { + sse_movups(cp->func, dst, result); + sse_movss(cp->func, dst, old_dst); + } + else { + struct x86_reg tmp = aos_get_xmm_reg(cp); + sse_movups(cp->func, dst, result); + sse_movss(cp->func, tmp, old_dst); + sse_movss(cp->func, dst, tmp); + aos_release_xmm_reg(cp, tmp.idx); + } + break; + + case TGSI_WRITEMASK_Y: + emit_shuf_copy1(cp, dst, result, old_dst, SHUF(Y,X,Z,W)); + break; + + case TGSI_WRITEMASK_Z: + emit_shuf_copy1(cp, dst, result, old_dst, SHUF(Z,Y,X,W)); + break; + + case TGSI_WRITEMASK_W: + emit_shuf_copy1(cp, dst, result, old_dst, SHUF(W,Y,Z,X)); + break; + + case TGSI_WRITEMASK_XZ: + emit_shuf_copy2(cp, dst, result, old_dst, SHUF(X,Z,Y,W)); + break; + + case TGSI_WRITEMASK_XW: + emit_shuf_copy2(cp, dst, result, old_dst, SHUF(X,W,Z,Y)); + + case TGSI_WRITEMASK_YZ: + emit_shuf_copy2(cp, dst, result, old_dst, SHUF(Z,Y,X,W)); + break; + + case TGSI_WRITEMASK_YW: + emit_shuf_copy2(cp, dst, result, old_dst, SHUF(W,Y,Z,X)); + break; + + case TGSI_WRITEMASK_XZW: + emit_shuf_copy1(cp, dst, old_dst, result, SHUF(Y,X,Z,W)); + break; + + case TGSI_WRITEMASK_XYW: + emit_shuf_copy1(cp, dst, old_dst, result, SHUF(Z,Y,X,W)); + break; + + case TGSI_WRITEMASK_XYZ: + emit_shuf_copy1(cp, dst, old_dst, result, SHUF(W,Y,Z,X)); + break; + + default: + assert(0); /* not possible */ + break; + } + } +} + + +static void x87_fst_or_nop( struct x86_function *func, + unsigned writemask, + unsigned channel, + struct x86_reg ptr ) +{ + if (writemask & (1<DstRegister.WriteMask; + + x87_fst_or_nop(cp->func, writemask, 0, ptr); + x87_fst_or_nop(cp->func, writemask, 1, ptr); + x87_fst_or_nop(cp->func, writemask, 2, ptr); + x87_fstp_or_pop(cp->func, writemask, 3, ptr); +} + +/* Save current x87 state and put it into single precision mode. + */ +static void save_fpu_state( struct aos_compilation *cp ) +{ +#if 0 + x87_fnstcw( cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_restore))); + x87_fldcw( cp->func, ); +#endif +} + +static void restore_fpu_state( struct aos_compilation *cp ) +{ +#if 0 + x87_fnclex(cp->func); + x87_fldcw(cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_restore))); +#endif +} + +static void set_fpu_round_neg_inf( struct aos_compilation *cp ) +{ +#if 0 + if (cp->fpucntl != RND_NEG_FPU) { + struct x86_reg regEDX = x86_make_reg(file_REG32, reg_DX); + struct arb_vp_machine *m = NULL; + + cp->fpucntl = RND_NEG_FPU; + x87_fnclex(cp->func); + x87_fldcw(cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_rnd_neg))); + } +#endif +} + +static void set_fpu_round_nearest( struct aos_compilation *cp ) +{ +#if 0 +#endif +} + + +static void emit_x87_ex2( struct aos_compilation *cp ) +{ + struct x86_reg st0 = x86_make_reg(file_x87, 0); + struct x86_reg st1 = x86_make_reg(file_x87, 1); + struct x86_reg st3 = x86_make_reg(file_x87, 3); + + set_fpu_round_neg_inf( cp ); + + x87_fld(cp->func, st0); /* a a */ + x87_fprndint( cp->func ); /* int(a) a */ + x87_fld(cp->func, st0); /* int(a) int(a) a */ + x87_fstp(cp->func, st3); /* int(a) a int(a)*/ + x87_fsubp(cp->func, st1); /* frac(a) int(a) */ + x87_f2xm1(cp->func); /* (2^frac(a))-1 int(a)*/ + x87_fld1(cp->func); /* 1 (2^frac(a))-1 int(a)*/ + x87_faddp(cp->func, st1); /* 2^frac(a) int(a) */ + x87_fscale(cp->func); /* 2^a */ +} + + + +/** + * The traditional instructions. All operate on internal registers + * and ignore write masks and swizzling issues. + */ + +static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); + + sse_movups(cp->func, dst, arg0); + sse_mulps(cp->func, dst, neg); + sse_maxps(cp->func, dst, arg0); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_addps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); + x87_fcos(cp->func); + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + + +/* The dotproduct instructions don't really do that well in sse: + */ +static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg tmp = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_mulps(cp->func, dst, arg1); + + /* Now the hard bit: sum the first 3 values: + */ + sse_movhlps(cp->func, tmp, dst); + sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */ + emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); + sse_addss(cp->func, dst, tmp); + sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + + aos_release_xmm_reg(cp, tmp.idx); + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + + +static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg tmp = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_mulps(cp->func, dst, arg1); + + /* Now the hard bit: sum the values: + */ + sse_movhlps(cp->func, tmp, dst); + sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */ + emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); + sse_addss(cp->func, dst, tmp); + sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + + aos_release_xmm_reg(cp, tmp.idx); + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg tmp = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_mulps(cp->func, dst, arg1); + + /* Now the hard bit: sum the values (from DP3): + */ + sse_movhlps(cp->func, tmp, dst); + sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */ + emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); + sse_addss(cp->func, dst, tmp); + emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W)); + sse_addss(cp->func, dst, tmp); + sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + + aos_release_xmm_reg(cp, tmp.idx); + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg tmp = aos_get_xmm_reg(cp); + struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + +/* dst[0] = 1.0 * 1.0F; */ +/* dst[1] = arg0[1] * arg1[1]; */ +/* dst[2] = arg0[2] * 1.0; */ +/* dst[3] = 1.0 * arg1[3]; */ + + emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y)); + emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W)); + sse_mulps(cp->func, dst, tmp); + + aos_release_xmm_reg(cp, tmp.idx); + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld1(cp->func); /* 1 */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 1 */ + x87_fyl2x(cp->func); /* log2(a0) */ + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + + +static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); + + emit_x87_ex2(cp); + + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + +static boolean emit_EXP( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + struct x86_reg st0 = x86_make_reg(file_x87, 0); + struct x86_reg st1 = x86_make_reg(file_x87, 1); + struct x86_reg st3 = x86_make_reg(file_x87, 3); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + + /* CAUTION: dst may alias arg0! + */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* arg0.x */ + x87_fld(cp->func, st0); /* arg arg */ + + /* by default, fpu is setup to round-to-nearest. We want to + * change this now, and track the state through to the end of the + * generated function so that it isn't repeated unnecessarily. + * Alternately, could subtract .5 to get round to -inf behaviour. + */ + set_fpu_round_neg_inf( cp ); + x87_fprndint( cp->func ); /* flr(a) a */ + x87_fld(cp->func, st0); /* flr(a) flr(a) a */ + x87_fld1(cp->func); /* 1 floor(a) floor(a) a */ + x87_fst_or_nop(cp->func, writemask, 3, dst); /* stack unchanged */ + + x87_fscale(cp->func); /* 2^floor(a) floor(a) a */ + x87_fst(cp->func, st3); /* 2^floor(a) floor(a) a 2^floor(a)*/ + + x87_fstp_or_pop(cp->func, writemask, 0, dst); /* flr(a) a 2^flr(a) */ + + x87_fsubrp(cp->func, st1); /* frac(a) 2^flr(a) */ + + x87_fst_or_nop(cp->func, writemask, 1, dst); /* frac(a) 2^flr(a) */ + + x87_f2xm1(cp->func); /* (2^frac(a))-1 2^flr(a)*/ + x87_fld1(cp->func); /* 1 (2^frac(a))-1 2^flr(a)*/ + x87_faddp(cp->func, st1); /* 2^frac(a) 2^flr(a) */ + x87_fmulp(cp->func, st1); /* 2^a */ + + x87_fstp_or_pop(cp->func, writemask, 2, dst); + +/* dst[0] = 2^floor(tmp); */ +/* dst[1] = frac(tmp); */ +/* dst[2] = 2^floor(tmp) * 2^frac(tmp); */ +/* dst[3] = 1.0F; */ + return TRUE; +} + +static boolean emit_LOG( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + struct x86_reg st0 = x86_make_reg(file_x87, 0); + struct x86_reg st1 = x86_make_reg(file_x87, 1); + struct x86_reg st2 = x86_make_reg(file_x87, 2); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + + /* CAUTION: dst may alias arg0! + */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* arg0.x */ + x87_fabs(cp->func); /* |arg0.x| */ + x87_fxtract(cp->func); /* mantissa(arg0.x), exponent(arg0.x) */ + x87_fst(cp->func, st2); /* mantissa, exponent, mantissa */ + x87_fld1(cp->func); /* 1, mantissa, exponent, mantissa */ + x87_fyl2x(cp->func); /* log2(mantissa), exponent, mantissa */ + x87_fadd(cp->func, st0, st1); /* e+l2(m), e, m */ + + x87_fstp_or_pop(cp->func, writemask, 2, dst); /* e, m */ + + x87_fld1(cp->func); /* 1, e, m */ + x87_fsub(cp->func, st1, st0); /* 1, e-1, m */ + + x87_fstp_or_pop(cp->func, writemask, 3, dst); /* e-1,m */ + x87_fstp_or_pop(cp->func, writemask, 0, dst); /* m */ + + x87_fadd(cp->func, st0, st0); /* 2m */ + + x87_fstp_or_pop( cp->func, writemask, 1, dst ); + + return TRUE; +} + +static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + int i; + + set_fpu_round_neg_inf( cp ); + + /* Load all sources first to avoid aliasing + */ + for (i = 0; i < 4; i++) { + if (writemask & (1<FullSrcRegisters[0], i); + } + } + + for (i = 0; i < 4; i++) { + if (writemask & (1<func ); + x87_fstp(cp->func, x86_make_disp(dst, i*4)); + } + } + + return TRUE; +} + + +static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + int i; + + set_fpu_round_nearest( cp ); + + /* Load all sources first to avoid aliasing + */ + for (i = 0; i < 4; i++) { + if (writemask & (1<FullSrcRegisters[0], i); + } + } + + for (i = 0; i < 4; i++) { + if (writemask & (1<func ); + x87_fstp(cp->func, x86_make_disp(dst, i*4)); + } + } + + return TRUE; +} + + +static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + struct x86_reg st0 = x86_make_reg(file_x87, 0); + struct x86_reg st1 = x86_make_reg(file_x87, 1); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + int i; + + set_fpu_round_neg_inf( cp ); + + /* suck all the source values onto the stack before writing out any + * dst, which may alias... + */ + for (i = 0; i < 4; i++) { + if (writemask & (1<FullSrcRegisters[0], i); + } + } + + for (i = 0; i < 4; i++) { + if (writemask & (1<func, st0); /* a a */ + x87_fprndint( cp->func ); /* flr(a) a */ + x87_fsubrp(cp->func, st1); /* frc(a) */ + x87_fstp(cp->func, x86_make_disp(dst, i*4)); + } + } + + return TRUE; +} + + + +static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + struct x86_reg st1 = x86_make_reg(file_x87, 1); + unsigned fixup1, fixup2; + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + + + /* Load the interesting parts of arg0: + */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 3); + x87_fld_src(cp, &op->FullSrcRegisters[0], 1); + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); + + + if (writemask & TGSI_WRITEMASK_XW) { + x87_fld1(cp->func); + x87_fst_or_nop(cp->func, writemask, 0, dst); + x87_fstp_or_pop(cp->func, writemask, 3, dst); + } + + if (writemask & TGSI_WRITEMASK_YZ) { + + /* Pre-zero destinations, may be overwritten later... fixme. + */ + x87_fldz(cp->func); + x87_fst_or_nop(cp->func, writemask, 1, dst); + x87_fstp_or_pop(cp->func, writemask, 2, dst); + + + /* Check arg0[0]: + */ + x87_fldz(cp->func); /* 0 a0 a1 a3 */ + x87_fucomp(cp->func, st1); /* a0 a1 a3 */ + x87_fnstsw(cp->func, cp->tmp_EAX); + x86_sahf(cp->func); + fixup1 = x86_jcc_forward(cp->func, cc_AE); + + x87_fstp_or_pop(cp->func, writemask, 1, dst); /* a1 a3 */ + + /* Check arg0[1]: + */ + x87_fldz(cp->func); /* 0 a1 a3 */ + x87_fucomp(cp->func, st1); /* a1 a3 */ + x87_fnstsw(cp->func, cp->tmp_EAX); + x86_sahf(cp->func); + fixup2 = x86_jcc_forward(cp->func, cc_AE); + + /* Compute pow(a1, a3) + */ + x87_fyl2x(cp->func); /* a3*log2(a1) */ + + emit_x87_ex2( cp ); /* 2^(a3*log2(a1)) */ + + x87_fstp_or_pop(cp->func, writemask, 2, dst); + + /* Land jumps: + */ + x86_fixup_fwd_jump(cp->func, fixup1); + x86_fixup_fwd_jump(cp->func, fixup2); + } + + return TRUE; +} + + + +static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_maxps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + +static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_minps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_mulps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + +static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg arg2 = fetch_src(cp, &op->FullSrcRegisters[2]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_mulps(cp->func, dst, arg1); + sse_addps(cp->func, dst, arg2); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + +static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld_src(cp, &op->FullSrcRegisters[1], 0); /* a1.x */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0.x a1.x */ + x87_fyl2x(cp->func); /* a1*log2(a0) */ + + emit_x87_ex2( cp ); /* 2^(a1*log2(a0)) */ + + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + + +static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + if (cp->have_sse2) { + sse2_rcpss(cp->func, dst, arg0); + /* extend precision here... + */ + } + else { + struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + sse_movss(cp->func, dst, ones); + sse_divss(cp->func, dst, arg0); + } + + sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + sse_rsqrtss(cp->func, dst, arg0); + + /* Extend precision here... + */ + + sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + +static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + + sse_movups(cp->func, dst, arg0); + sse_cmpps(cp->func, dst, arg1, cc_NotLessThan); + sse_andps(cp->func, dst, ones); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); + x87_fsin(cp->func); + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + + + +static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + + sse_movups(cp->func, dst, arg0); + sse_cmpps(cp->func, dst, arg1, cc_LessThan); + sse_andps(cp->func, dst, ones); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_subps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + +static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg tmp0 = aos_get_xmm_reg(cp); + struct x86_reg tmp1 = aos_get_xmm_reg(cp); + + /* Could avoid tmp0, tmp1 if we overwrote arg0, arg1. Need a way + * to invalidate registers. This will come with better analysis + * (liveness analysis) of the incoming program. + */ + emit_pshufd(cp, dst, arg0, SHUF(Y, Z, X, W)); + emit_pshufd(cp, tmp1, arg1, SHUF(Z, X, Y, W)); + sse_mulps(cp->func, dst, tmp1); + emit_pshufd(cp, tmp0, arg0, SHUF(Z, X, Y, W)); + emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W)); + sse_mulps(cp->func, tmp0, tmp1); + sse_subps(cp->func, dst, tmp0); + +/* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */ +/* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */ +/* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */ +/* dst[3] is undef */ + + + aos_release_xmm_reg(cp, tmp0.idx); + aos_release_xmm_reg(cp, tmp1.idx); + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + + +static boolean +emit_instruction( struct aos_compilation *cp, + struct tgsi_full_instruction *inst ) +{ + switch( inst->Instruction.Opcode ) { + case TGSI_OPCODE_MOV: + return emit_MOV( cp, inst ); + + case TGSI_OPCODE_LIT: + return emit_LIT(cp, inst); + + case TGSI_OPCODE_RCP: + return emit_RCP(cp, inst); + + case TGSI_OPCODE_RSQ: + return emit_RSQ(cp, inst); + + case TGSI_OPCODE_EXP: + return emit_EXP(cp, inst); + + case TGSI_OPCODE_LOG: + return emit_LOG(cp, inst); + + case TGSI_OPCODE_MUL: + return emit_MUL(cp, inst); + + case TGSI_OPCODE_ADD: + return emit_ADD(cp, inst); + + case TGSI_OPCODE_DP3: + return emit_DP3(cp, inst); + + case TGSI_OPCODE_DP4: + return emit_DP4(cp, inst); + + case TGSI_OPCODE_DST: + return emit_DST(cp, inst); + + case TGSI_OPCODE_MIN: + return emit_MIN(cp, inst); + + case TGSI_OPCODE_MAX: + return emit_MAX(cp, inst); + + case TGSI_OPCODE_SLT: + return emit_SLT(cp, inst); + + case TGSI_OPCODE_SGE: + return emit_SGE(cp, inst); + + case TGSI_OPCODE_MAD: + return emit_MAD(cp, inst); + + case TGSI_OPCODE_SUB: + return emit_SUB(cp, inst); + + case TGSI_OPCODE_LERP: +// return emit_LERP(cp, inst); + return FALSE; + + case TGSI_OPCODE_FRAC: + return emit_FRC(cp, inst); + + case TGSI_OPCODE_CLAMP: +// return emit_CLAMP(cp, inst); + return FALSE; + + case TGSI_OPCODE_FLOOR: + return emit_FLR(cp, inst); + + case TGSI_OPCODE_ROUND: + return emit_RND(cp, inst); + + case TGSI_OPCODE_EXPBASE2: + return emit_EX2(cp, inst); + + case TGSI_OPCODE_LOGBASE2: + return emit_LG2(cp, inst); + + case TGSI_OPCODE_POWER: + return emit_POW(cp, inst); + + case TGSI_OPCODE_CROSSPRODUCT: + return emit_XPD(cp, inst); + + case TGSI_OPCODE_ABS: + return emit_ABS(cp, inst); + + case TGSI_OPCODE_DPH: + return emit_DPH(cp, inst); + + case TGSI_OPCODE_COS: + return emit_COS(cp, inst); + + case TGSI_OPCODE_SIN: + return emit_SIN(cp, inst); + + case TGSI_OPCODE_END: + return TRUE; + + default: + return FALSE; + } +} + +static boolean note_immediate( struct aos_compilation *cp, + struct tgsi_full_immediate *imm ) +{ + unsigned pos = cp->num_immediates++; + unsigned j; + + for (j = 0; j < imm->Immediate.Size; j++) { + cp->vaos->machine->immediate[pos][j] = imm->u.ImmediateFloat32[j].Float; + } + + return TRUE; +} + + + + +static void find_last_write_outputs( struct aos_compilation *cp ) +{ + struct tgsi_parse_context parse; + unsigned this_instruction = 0; + unsigned i; + + tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens ); + + while (!tgsi_parse_end_of_tokens( &parse )) { + + tgsi_parse_token( &parse ); + + if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION) + continue; + + for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) { + if (parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.File == + TGSI_FILE_OUTPUT) + { + unsigned idx = parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.Index; + cp->output_last_write[idx] = this_instruction; + } + } + + this_instruction++; + } + + tgsi_parse_free( &parse ); +} + + +#define ARG_VARIENT 1 +#define ARG_START_ELTS 2 +#define ARG_COUNT 3 +#define ARG_OUTBUF 4 + + +static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, + boolean linear ) +{ + struct tgsi_parse_context parse; + struct aos_compilation cp; + unsigned fixup, label; + + tgsi_parse_init( &parse, varient->base.vs->state.tokens ); + + memset(&cp, 0, sizeof(cp)); + + cp.insn_counter = 1; + cp.vaos = varient; + cp.have_sse2 = 1; + cp.func = &varient->func[ linear ? 0 : 1 ]; + + cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX); + cp.idx_EBX = x86_make_reg(file_REG32, reg_BX); + cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX); + cp.machine_EDX = x86_make_reg(file_REG32, reg_DX); + cp.count_ESI = x86_make_reg(file_REG32, reg_SI); + + x86_init_func(cp.func); + + find_last_write_outputs(&cp); + + x86_push(cp.func, cp.idx_EBX); + x86_push(cp.func, cp.count_ESI); + + + /* Load arguments into regs: + */ + x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_VARIENT)); + x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS)); + x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT)); + x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF)); + + + /* Compare count to zero and possibly bail. + */ + x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX); + x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX); + fixup = x86_jcc_forward(cp.func, cc_E); + + /* Dig out the machine pointer from inside the varient arg + */ + x86_mov(cp.func, cp.machine_EDX, + x86_make_disp(cp.machine_EDX, + Offset( struct draw_vs_varient_aos_sse, machine ))); + + save_fpu_state( &cp ); + + /* Note address for loop jump + */ + label = x86_get_label(cp.func); + { + /* Fetch inputs... TODO: fetch lazily... + */ + if (!aos_fetch_inputs( &cp, linear )) + goto fail; + + /* Emit the shader: + */ + while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error ) + { + tgsi_parse_token( &parse ); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + if (!note_immediate( &cp, &parse.FullToken.FullImmediate )) + goto fail; + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + if (!emit_instruction( &cp, &parse.FullToken.FullInstruction )) + goto fail; + break; + } + + cp.insn_counter++; + debug_printf("\n"); + } + + if (cp.error) + goto fail; + + /* Emit output... TODO: do this eagerly after the last write to a + * given output. + */ + if (!aos_emit_outputs( &cp )) + goto fail; + + + /* Next vertex: + */ + x86_lea(cp.func, + cp.outbuf_ECX, + x86_make_disp(cp.outbuf_ECX, + cp.vaos->base.key.output_stride)); + + /* Incr index + */ + if (linear) { + x86_inc(cp.func, cp.idx_EBX); + } + else { + x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4)); + } + + } + /* decr count, loop if not zero + */ + x86_dec(cp.func, cp.count_ESI); +/* x86_test(cp.func, cp.count_ESI, cp.count_ESI); */ + x86_jcc(cp.func, cc_NZ, label); + + restore_fpu_state(&cp); + + /* Land forward jump here: + */ + x86_fixup_fwd_jump(cp.func, fixup); + + /* Exit mmx state? + */ + if (cp.func->need_emms) + mmx_emms(cp.func); + + x86_pop(cp.func, cp.count_ESI); + x86_pop(cp.func, cp.idx_EBX); + + x86_ret(cp.func); + + tgsi_parse_free( &parse ); + return !cp.error; + + fail: + tgsi_parse_free( &parse ); + return FALSE; +} + + + +static void vaos_set_buffer( struct draw_vs_varient *varient, + unsigned buf, + const void *ptr, + unsigned stride ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + unsigned i; + + for (i = 0; i < vaos->base.vs->info.num_inputs; i++) { + if (vaos->base.key.element[i].in.buffer == buf) { + vaos->machine->attrib[i].input_ptr = ((char *)ptr + + vaos->base.key.element[i].in.offset); + vaos->machine->attrib[i].input_stride = stride; + } + } +} + + +static void vaos_destroy( struct draw_vs_varient *varient ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + + if (vaos->machine) + align_free( vaos->machine ); + + x86_release_func( &vaos->func[0] ); + x86_release_func( &vaos->func[1] ); + + FREE(vaos); +} + +static void vaos_run_elts( struct draw_vs_varient *varient, + const unsigned *elts, + unsigned count, + void *output_buffer ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + + vaos->gen_run_elts( varient, + elts, + count, + output_buffer ); +} + +static void vaos_run_linear( struct draw_vs_varient *varient, + unsigned start, + unsigned count, + void *output_buffer ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + + vaos->gen_run_linear( varient, + start, + count, + output_buffer ); +} + + +static void vaos_set_constants( struct draw_vs_varient *varient, + const float (*constants)[4] ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + + memcpy(vaos->machine->constant, + constants, + (vaos->base.vs->info.file_max[TGSI_FILE_CONSTANT] + 1) * 4 * sizeof(float)); +} + + +static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, + const struct draw_vs_varient_key *key ) +{ + struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse); + + if (!vaos) + goto fail; + + vaos->base.key = *key; + vaos->base.vs = vs; + vaos->base.set_input = vaos_set_buffer; + vaos->base.set_constants = vaos_set_constants; + vaos->base.destroy = vaos_destroy; + vaos->base.run_linear = vaos_run_linear; + vaos->base.run_elts = vaos_run_elts; + + vaos->machine = align_malloc( sizeof(struct aos_machine), 16 ); + if (!vaos->machine) + goto fail; + + memset(vaos->machine, 0, sizeof(struct aos_machine)); + + tgsi_dump(vs->state.tokens, 0); + + if (!build_vertex_program( vaos, TRUE )) + goto fail; + + if (!build_vertex_program( vaos, FALSE )) + goto fail; + + vaos->gen_run_linear = (vsv_run_linear_func)x86_get_func(&vaos->func[0]); + if (!vaos->gen_run_linear) + goto fail; + + vaos->gen_run_elts = (vsv_run_elts_func)x86_get_func(&vaos->func[1]); + if (!vaos->gen_run_elts) + goto fail; + + return &vaos->base; + + fail: + if (vaos->machine) + align_free( vaos->machine ); + + if (vaos) + x86_release_func( &vaos->func[0] ); + + if (vaos) + x86_release_func( &vaos->func[1] ); + + FREE(vaos); + + return NULL; +} + + +struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs, + const struct draw_vs_varient_key *key ) +{ + struct draw_vs_varient *varient = varient_aos_sse( vs, key ); + + if (varient == NULL) { + assert(0); + varient = draw_vs_varient_generic( vs, key ); + } + + return varient; +} + + + +#endif diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h new file mode 100644 index 0000000000..1d8a055a90 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_aos.h @@ -0,0 +1,181 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell + */ + +#ifndef DRAW_VS_AOS_H +#define DRAW_VS_AOS_H + + +struct tgsi_token; +struct x86_function; + +#include "pipe/p_state.h" +#include "rtasm/rtasm_x86sse.h" + + + + + +#define X 0 +#define Y 1 +#define Z 2 +#define W 3 + +#define MAX_INPUTS PIPE_MAX_ATTRIBS +#define MAX_OUTPUTS PIPE_MAX_ATTRIBS +#define MAX_TEMPS PIPE_MAX_ATTRIBS /* say */ +#define MAX_CONSTANTS PIPE_MAX_ATTRIBS /* say */ +#define MAX_IMMEDIATES PIPE_MAX_ATTRIBS /* say */ +#define MAX_INTERNALS 4 + +#define AOS_FILE_INTERNAL TGSI_FILE_COUNT + +/* This is the temporary storage used by all the aos_sse vs varients. + * Create one per context and reuse by passing a pointer in at + * vs_varient creation?? + */ +struct aos_machine { + float input [MAX_INPUTS ][4]; + float output [MAX_OUTPUTS ][4]; + float temp [MAX_TEMPS ][4]; + float constant [MAX_CONSTANTS ][4]; /* fixme -- should just be a pointer */ + float immediate[MAX_IMMEDIATES][4]; /* fixme -- should just be a pointer */ + float internal [MAX_INTERNALS ][4]; + + unsigned fpu_round_nearest; + unsigned fpu_round_neg_inf; + + struct { + const void *input_ptr; + unsigned input_stride; + + unsigned output_offset; + } attrib[PIPE_MAX_ATTRIBS]; +}; + + + + +struct aos_compilation { + struct x86_function *func; + struct draw_vs_varient_aos_sse *vaos; + + unsigned insn_counter; + unsigned num_immediates; + + struct { + unsigned idx:16; + unsigned file:8; + unsigned dirty:8; + unsigned last_used; + } xmm[8]; + + + boolean input_fetched[PIPE_MAX_ATTRIBS]; + unsigned output_last_write[PIPE_MAX_ATTRIBS]; + + boolean have_sse2; + boolean error; + short fpucntl; + + /* these are actually known values, but putting them in a struct + * like this is helpful to keep them in sync across the file. + */ + struct x86_reg tmp_EAX; + struct x86_reg idx_EBX; /* either start+i or &elt[i] */ + struct x86_reg outbuf_ECX; + struct x86_reg machine_EDX; + struct x86_reg count_ESI; /* decrements to zero */ +}; + +struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ); +void aos_release_xmm_reg( struct aos_compilation *cp, unsigned idx ); + +void aos_adopt_xmm_reg( struct aos_compilation *cp, + struct x86_reg reg, + unsigned file, + unsigned idx, + unsigned dirty ); + +struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, + unsigned file, + unsigned idx ); + +boolean aos_fetch_inputs( struct aos_compilation *cp, + boolean linear ); + +boolean aos_emit_outputs( struct aos_compilation *cp ); + + +#define IMM_ONES 0 /* 1, 1,1,1 */ +#define IMM_NEGS 1 /* 1,-1,0,0 */ +#define IMM_IDENTITY 2 /* 0, 0,0,1 */ +#define IMM_INV_255 3 /* 1/255, 1/255, 1/255, 1/255 */ +#define IMM_255 4 /* 255, 255, 255, 255 */ + +struct x86_reg aos_get_internal( struct aos_compilation *cp, + unsigned imm ); + + +#define ERROR(cp, msg) \ +do { \ + debug_printf("%s: x86 translation failed: %s\n", __FUNCTION__, msg); \ + cp->error = 1; \ + assert(0); \ +} while (0) + + + + + + +struct draw_vs_varient_aos_sse { + struct draw_vs_varient base; + struct draw_context *draw; + +#if 0 + struct { + const void *ptr; + unsigned stride; + } attrib[PIPE_MAX_ATTRIBS]; +#endif + + struct aos_machine *machine; /* XXX: temporarily unshared */ + + vsv_run_linear_func gen_run_linear; + vsv_run_elts_func gen_run_elts; + + + struct x86_function func[2]; +}; + + + +#endif + diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c new file mode 100644 index 0000000000..72b2b3d11d --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c @@ -0,0 +1,314 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "pipe/p_util.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/util/tgsi_parse.h" +#include "tgsi/util/tgsi_util.h" +#include "tgsi/exec/tgsi_exec.h" +#include "draw_vs.h" +#include "draw_vs_aos.h" + +#include "rtasm/rtasm_x86sse.h" + +#ifdef PIPE_ARCH_X86 + +/* Note - don't yet have to worry about interacting with the code in + * draw_vs_aos.c as there is no intermingling of generated code... + * That may have to change, we'll see. + */ +static void emit_load_R32G32B32A32( struct aos_compilation *cp, + struct x86_reg data, + struct x86_reg src_ptr ) +{ + sse_movups(cp->func, data, src_ptr); +} + +static void emit_load_R32G32B32( struct aos_compilation *cp, + struct x86_reg data, + struct x86_reg src_ptr ) +{ + sse_movss(cp->func, data, x86_make_disp(src_ptr, 8)); + sse_shufps(cp->func, data, aos_get_internal( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) ); + sse_shufps(cp->func, data, data, SHUF(Y,Z,X,W) ); + sse_movlps(cp->func, data, src_ptr); +} + +static void emit_load_R32G32( struct aos_compilation *cp, + struct x86_reg data, + struct x86_reg src_ptr ) +{ + sse_movups(cp->func, data, aos_get_internal( cp, IMM_IDENTITY ) ); + sse_movlps(cp->func, data, src_ptr); +} + + +static void emit_load_R32( struct aos_compilation *cp, + struct x86_reg data, + struct x86_reg src_ptr ) +{ + sse_movss(cp->func, data, src_ptr); + sse_orps(cp->func, data, aos_get_internal( cp, IMM_IDENTITY ) ); +} + + +static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp, + struct x86_reg data, + struct x86_reg src_ptr ) +{ + sse_movss(cp->func, data, src_ptr); + sse2_punpcklbw(cp->func, data, aos_get_internal( cp, IMM_IDENTITY )); + sse2_punpcklbw(cp->func, data, aos_get_internal( cp, IMM_IDENTITY )); + sse2_cvtdq2ps(cp->func, data, data); + sse_mulps(cp->func, data, aos_get_internal(cp, IMM_INV_255)); +} + + + +static void get_src_ptr( struct x86_function *func, + struct x86_reg src, + struct x86_reg machine, + struct x86_reg elt, + unsigned a ) +{ + struct x86_reg input_ptr = + x86_make_disp(machine, + Offset(struct aos_machine, attrib[a].input_ptr)); + + struct x86_reg input_stride = + x86_make_disp(machine, + Offset(struct aos_machine, attrib[a].input_stride)); + + /* Calculate pointer to current attrib: + */ + x86_mov(func, src, input_stride); + x86_imul(func, src, elt); + x86_add(func, src, input_ptr); +} + + +/* Extended swizzles? Maybe later. + */ +static void emit_swizzle( struct aos_compilation *cp, + struct x86_reg dest, + struct x86_reg src, + unsigned shuffle ) +{ + sse_shufps(cp->func, dest, src, shuffle); +} + + +static boolean load_input( struct aos_compilation *cp, + unsigned idx, + boolean linear ) +{ + unsigned format = cp->vaos->base.key.element[idx].in.format; + struct x86_reg src = cp->tmp_EAX; + struct x86_reg dataXMM = aos_get_xmm_reg(cp); + + /* Figure out source pointer address: + */ + get_src_ptr(cp->func, + src, + cp->machine_EDX, + linear ? cp->idx_EBX : x86_deref(cp->idx_EBX), + idx); + + src = x86_deref(src); + + aos_adopt_xmm_reg( cp, + dataXMM, + TGSI_FILE_INPUT, + idx, + TRUE ); + + switch (format) { + case PIPE_FORMAT_R32_FLOAT: + emit_load_R32(cp, dataXMM, src); + break; + case PIPE_FORMAT_R32G32_FLOAT: + emit_load_R32G32(cp, dataXMM, src); + break; + case PIPE_FORMAT_R32G32B32_FLOAT: + emit_load_R32G32B32(cp, dataXMM, src); + break; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + emit_load_R32G32B32A32(cp, dataXMM, src); + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + emit_load_R8G8B8A8_UNORM(cp, dataXMM, src); + emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W)); + break; + case PIPE_FORMAT_R8G8B8A8_UNORM: + emit_load_R8G8B8A8_UNORM(cp, dataXMM, src); + break; + default: + ERROR(cp, "unhandled input format"); + return FALSE; + } + + return TRUE; +} + + +boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear ) +{ + unsigned i; + + for (i = 0; i < cp->vaos->base.vs->info.num_inputs; i++) { + if (!load_input( cp, i, linear )) + return FALSE; + cp->insn_counter++; + debug_printf("\n"); + } + + return TRUE; +} + + + + + + + +static void emit_store_R32G32B32A32( struct aos_compilation *cp, + struct x86_reg dst_ptr, + struct x86_reg dataXMM ) +{ + sse_movups(cp->func, dst_ptr, dataXMM); +} + +static void emit_store_R32G32B32( struct aos_compilation *cp, + struct x86_reg dst_ptr, + struct x86_reg dataXMM ) +{ + sse_movlps(cp->func, dst_ptr, dataXMM); + sse_shufps(cp->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */ + sse_movss(cp->func, x86_make_disp(dst_ptr,8), dataXMM); +} + +static void emit_store_R32G32( struct aos_compilation *cp, + struct x86_reg dst_ptr, + struct x86_reg dataXMM ) +{ + sse_movlps(cp->func, dst_ptr, dataXMM); +} + +static void emit_store_R32( struct aos_compilation *cp, + struct x86_reg dst_ptr, + struct x86_reg dataXMM ) +{ + sse_movss(cp->func, dst_ptr, dataXMM); +} + + + +static void emit_store_R8G8B8A8_UNORM( struct aos_compilation *cp, + struct x86_reg dst_ptr, + struct x86_reg dataXMM ) +{ + sse_mulps(cp->func, dataXMM, aos_get_internal(cp, IMM_255)); + sse2_cvtps2dq(cp->func, dataXMM, dataXMM); + sse2_packssdw(cp->func, dataXMM, dataXMM); + sse2_packuswb(cp->func, dataXMM, dataXMM); + sse_movss(cp->func, dst_ptr, dataXMM); +} + + + + + +static boolean emit_output( struct aos_compilation *cp, + struct x86_reg ptr, + struct x86_reg dataXMM, + unsigned format ) +{ + switch (format) { + case PIPE_FORMAT_R32_FLOAT: + emit_store_R32(cp, ptr, dataXMM); + break; + case PIPE_FORMAT_R32G32_FLOAT: + emit_store_R32G32(cp, ptr, dataXMM); + break; + case PIPE_FORMAT_R32G32B32_FLOAT: + emit_store_R32G32B32(cp, ptr, dataXMM); + break; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + emit_store_R32G32B32A32(cp, ptr, dataXMM); + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W)); + emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM); + break; + case PIPE_FORMAT_R8G8B8A8_UNORM: + emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM); + break; + default: + ERROR(cp, "unhandled output format"); + return FALSE; + } + + return TRUE; +} + + + +boolean aos_emit_outputs( struct aos_compilation *cp ) +{ + unsigned i; + + for (i = 0; i < cp->vaos->base.vs->info.num_inputs; i++) { + unsigned format = cp->vaos->base.key.element[i].out.format; + unsigned offset = cp->vaos->base.key.element[i].out.offset; + + struct x86_reg data = aos_get_shader_reg( cp, + TGSI_FILE_OUTPUT, + i ); + + if (data.file != file_XMM) { + struct x86_reg tmp = aos_get_xmm_reg( cp ); + sse_movups(cp->func, tmp, data); + data = tmp; + } + + if (!emit_output( cp, + x86_make_disp( cp->outbuf_ECX, offset ), + data, + format )) + return FALSE; + + aos_release_xmm_reg( cp, data.idx ); + + cp->insn_counter++; + debug_printf("\n"); + } + + return TRUE; +} + +#endif diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c index df94a7e0c7..0581c3042f 100644 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ b/src/gallium/auxiliary/draw/draw_vs_sse.c @@ -157,6 +157,7 @@ draw_create_vs_sse(struct draw_context *draw, vs->base.draw = draw; vs->base.create_varient = draw_vs_varient_generic; +// vs->base.create_varient = draw_vs_varient_aos_sse; vs->base.prepare = vs_sse_prepare; vs->base.run_linear = vs_sse_run_linear; vs->base.delete = vs_sse_delete; -- cgit v1.2.3 From 889473b3f5a216bd753c357974d6bae29fe3c41d Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Wed, 21 May 2008 20:28:56 +0100 Subject: draw: add viewport to varient state --- .../auxiliary/draw/draw_pt_fetch_shade_emit.c | 9 +++- src/gallium/auxiliary/draw/draw_vs.h | 8 +++- src/gallium/auxiliary/draw/draw_vs_aos.c | 50 ++++++++++++++++++++++ src/gallium/auxiliary/draw/draw_vs_aos.h | 9 +++- src/gallium/auxiliary/draw/draw_vs_sse.c | 4 +- src/gallium/auxiliary/draw/draw_vs_varient.c | 10 +++++ 6 files changed, 84 insertions(+), 6 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_sse.c') diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c index 74945dcfe9..984fbb6767 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c @@ -95,10 +95,14 @@ static void fse_prepare( struct draw_pt_middle_end *middle, + fse->key.output_stride = vinfo->size * 4; fse->key.nr_elements = MAX2(num_vs_outputs, /* outputs - translate to hw format */ num_vs_inputs); /* inputs - fetch from api format */ - fse->key.output_stride = vinfo->size * 4; + fse->key.viewport = 1; + fse->key.clip = 0; + fse->key.pad = 0; + memset(fse->key.element, 0, fse->key.nr_elements * sizeof(fse->key.element[0])); @@ -211,6 +215,9 @@ static void fse_prepare( struct draw_pt_middle_end *middle, fse->active->set_constants( fse->active, (const float (*)[4])draw->pt.user.constants ); + fse->active->set_viewport( fse->active, + &draw->viewport ); + //return TRUE; } diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h index 5a8d0da06d..ff3e19b2a8 100644 --- a/src/gallium/auxiliary/draw/draw_vs.h +++ b/src/gallium/auxiliary/draw/draw_vs.h @@ -58,7 +58,10 @@ struct draw_vs_element { struct draw_vs_varient_key { unsigned output_stride; - unsigned nr_elements; + unsigned nr_elements:16; + unsigned viewport:1; + unsigned clip:1; + unsigned pad:14; struct draw_vs_element element[PIPE_MAX_ATTRIBS]; }; @@ -88,6 +91,9 @@ struct draw_vs_varient { void (*set_constants)( struct draw_vs_varient *, const float (*constants)[4] ); + void (*set_viewport)( struct draw_vs_varient *, + const struct pipe_viewport_state * ); + void (PIPE_CDECL *run_linear)( struct draw_vs_varient *shader, unsigned start, unsigned count, diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 620f5e3592..b8e66e8b78 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -1401,6 +1401,37 @@ emit_instruction( struct aos_compilation *cp, } } + +static boolean emit_viewport( struct aos_compilation *cp ) +{ + struct x86_reg pos = aos_get_shader_reg(cp, + TGSI_FILE_OUTPUT, + 0); + + struct x86_reg scale = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, scale)); + + struct x86_reg translate = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, translate)); + + if (pos.file != file_XMM) { + struct x86_reg dst = aos_get_xmm_reg(cp); + sse_movups(cp->func, dst, pos); + pos = dst; + } + + sse_mulps(cp->func, pos, scale); + sse_addps(cp->func, pos, translate); + + aos_adopt_xmm_reg( cp, + pos, + TGSI_FILE_OUTPUT, + 0, + TRUE ); + return TRUE; +} + + static boolean note_immediate( struct aos_compilation *cp, struct tgsi_full_immediate *imm ) { @@ -1540,6 +1571,10 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, if (cp.error) goto fail; + if (cp.vaos->base.key.viewport) { + emit_viewport(&cp); + } + /* Emit output... TODO: do this eagerly after the last write to a * given output. */ @@ -1665,11 +1700,25 @@ static void vaos_set_constants( struct draw_vs_varient *varient, } +static void vaos_set_viewport( struct draw_vs_varient *varient, + const struct pipe_viewport_state *viewport ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + + memcpy(vaos->machine->scale, viewport->scale, 4 * sizeof(float)); + memcpy(vaos->machine->translate, viewport->translate, 4 * sizeof(float)); +} + + + static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, const struct draw_vs_varient_key *key ) { struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse); + if (key->clip) + return NULL; + if (!vaos) goto fail; @@ -1677,6 +1726,7 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, vaos->base.vs = vs; vaos->base.set_input = vaos_set_buffer; vaos->base.set_constants = vaos_set_constants; + vaos->base.set_viewport = vaos_set_viewport; vaos->base.destroy = vaos_destroy; vaos->base.run_linear = vaos_run_linear; vaos->base.run_elts = vaos_run_elts; diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h index 1d8a055a90..16fef6451c 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.h +++ b/src/gallium/auxiliary/draw/draw_vs_aos.h @@ -68,8 +68,13 @@ struct aos_machine { float immediate[MAX_IMMEDIATES][4]; /* fixme -- should just be a pointer */ float internal [MAX_INTERNALS ][4]; - unsigned fpu_round_nearest; - unsigned fpu_round_neg_inf; + float scale[4]; /* viewport */ + float translate[4]; /* viewport */ + + ushort fpu_round_nearest; + ushort fpu_round_neg_inf; + ushort fpu_restore; + ushort fpucntl; /* one of FPU_* above */ struct { const void *input_ptr; diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c index 0581c3042f..7781782ae8 100644 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ b/src/gallium/auxiliary/draw/draw_vs_sse.c @@ -156,8 +156,8 @@ draw_create_vs_sse(struct draw_context *draw, tgsi_scan_shader(templ->tokens, &vs->base.info); vs->base.draw = draw; - vs->base.create_varient = draw_vs_varient_generic; -// vs->base.create_varient = draw_vs_varient_aos_sse; + vs->base.create_varient = draw_vs_varient_aos_sse; +// vs->base.create_varient = draw_vs_varient_generic; vs->base.prepare = vs_sse_prepare; vs->base.run_linear = vs_sse_run_linear; vs->base.delete = vs_sse_delete; diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c index d27b0f6187..f6f621a748 100644 --- a/src/gallium/auxiliary/draw/draw_vs_varient.c +++ b/src/gallium/auxiliary/draw/draw_vs_varient.c @@ -167,6 +167,12 @@ static void vsvg_run_linear( struct draw_vs_varient *varient, + +static void vsvg_set_viewport( struct draw_vs_varient *varient, + const struct pipe_viewport_state *viewport ) +{ +} + static void vsvg_destroy( struct draw_vs_varient *varient ) { FREE(varient); @@ -179,6 +185,9 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs, unsigned i; struct translate_key fetch, emit; + if (key->viewport || key->clip) + return NULL; + struct draw_vs_varient_generic *vsvg = CALLOC_STRUCT( draw_vs_varient_generic ); if (vsvg == NULL) return NULL; @@ -187,6 +196,7 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs, vsvg->base.vs = vs; vsvg->base.set_input = vsvg_set_input; vsvg->base.set_constants = vsvg_set_constants; + vsvg->base.set_viewport = vsvg_set_viewport; vsvg->base.run_elts = vsvg_run_elts; vsvg->base.run_linear = vsvg_run_linear; vsvg->base.destroy = vsvg_destroy; -- cgit v1.2.3