diff options
Diffstat (limited to 'src/gallium/auxiliary')
42 files changed, 5026 insertions, 357 deletions
diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile index da7eded21f..9a88ecc070 100644 --- a/src/gallium/auxiliary/draw/Makefile +++ b/src/gallium/auxiliary/draw/Makefile @@ -26,12 +26,17 @@ C_SOURCES = \  	draw_pt_emit.c \  	draw_pt_fetch.c \  	draw_pt_fetch_emit.c \ +	draw_pt_fetch_shade_emit.c \  	draw_pt_fetch_shade_pipeline.c \  	draw_pt_post_vs.c \ +        draw_pt_util.c \          draw_pt_varray.c \  	draw_pt_vcache.c \  	draw_vertex.c \  	draw_vs.c \ +	draw_vs_varient.c \ +	draw_vs_aos.c \ +	draw_vs_aos_io.c \  	draw_vs_exec.c \  	draw_vs_llvm.c \  	draw_vs_sse.c  diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c index 98e23fa830..2242074965 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -56,12 +56,6 @@ struct draw_context *draw_create( void )     draw->reduced_prim = ~0; /* != any of PIPE_PRIM_x */ -   tgsi_exec_machine_init(&draw->machine); - -   /* FIXME: give this machine thing a proper constructor: -    */ -   draw->machine.Inputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16); -   draw->machine.Outputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16);     if (!draw_pipeline_init( draw ))        goto fail; @@ -69,6 +63,9 @@ struct draw_context *draw_create( void )     if (!draw_pt_init( draw ))        goto fail; +   if (!draw_vs_init( draw )) +      goto fail; +     return draw;  fail: @@ -83,13 +80,6 @@ void draw_destroy( struct draw_context *draw )        return; -   if (draw->machine.Inputs) -      align_free(draw->machine.Inputs); - -   if (draw->machine.Outputs) -      align_free(draw->machine.Outputs); - -   tgsi_exec_machine_free_data(&draw->machine);     /* Not so fast -- we're just borrowing this at the moment.      
*  @@ -99,6 +89,7 @@ void draw_destroy( struct draw_context *draw )     draw_pipeline_destroy( draw );     draw_pt_destroy( draw ); +   draw_vs_destroy( draw );     FREE( draw );  } @@ -295,7 +286,7 @@ int  draw_find_vs_output(struct draw_context *draw,                      uint semantic_name, uint semantic_index)  { -   const struct draw_vertex_shader *vs = draw->vertex_shader; +   const struct draw_vertex_shader *vs = draw->vs.vertex_shader;     uint i;     for (i = 0; i < vs->info.num_outputs; i++) {        if (vs->info.output_semantic_name[i] == semantic_name && @@ -320,7 +311,7 @@ draw_find_vs_output(struct draw_context *draw,  uint  draw_num_vs_outputs(struct draw_context *draw)  { -   uint count = draw->vertex_shader->info.num_outputs; +   uint count = draw->vs.vertex_shader->info.num_outputs;     if (draw->extra_vp_outputs.slot > 0)        count++;     return count; diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c index 46afb0f41f..1d26706dee 100644 --- a/src/gallium/auxiliary/draw/draw_pipe.c +++ b/src/gallium/auxiliary/draw/draw_pipe.c @@ -212,6 +212,71 @@ void draw_pipeline_run( struct draw_context *draw,     draw->pipeline.vertex_count = 0;  } +#define QUAD(i0,i1,i2,i3)                                        \ +   do_triangle( draw,                                            \ +                ( DRAW_PIPE_RESET_STIPPLE |                      \ +                  DRAW_PIPE_EDGE_FLAG_0 |                        \ +                  DRAW_PIPE_EDGE_FLAG_2 ),                       \ +                 verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK), \ +                 verts + stride * (i1),                          \ +                verts + stride * (i3));                          \ +      do_triangle( draw,                                         \ +                   ( DRAW_PIPE_EDGE_FLAG_0 |                     \ +                     DRAW_PIPE_EDGE_FLAG_1 ),                    \ +                 verts + stride 
* ((i1) & ~DRAW_PIPE_FLAG_MASK), \ +                 verts + stride * (i2),                          \ +                 verts + stride * (i3)) + +#define TRIANGLE(flags,i0,i1,i2)                                 \ +   do_triangle( draw,                                            \ +                flags,  /* flags */                              \ +                 verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK), \ +                 verts + stride * (i1),                          \ +                 verts + stride * (i2)) + +#define LINE(flags,i0,i1) /* i0: flag bits masked off; i1: used unmasked */ \ +   do_line( draw,                                           \ +            flags,                                          \ +            verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK), \ +            verts + stride * (i1)) + +#define POINT(i0)                               \ +   do_point( draw,                              \ +             verts + stride * (i0) ) + +#define FUNC pipe_run_linear +#define ARGS                                    \ +    struct draw_context *draw,                  \ +    unsigned prim,                              \ +    struct vertex_header *vertices,             \ +    unsigned stride + +#define LOCAL_VARS                                           \ +   char *verts = (char *)vertices;                           \ +   boolean flatfirst = (draw->rasterizer->flatshade &&       \ +                        draw->rasterizer->flatshade_first);  \ +   unsigned i, flags + +#define FLUSH + +#include "draw_pt_decompose.h" + +void draw_pipeline_run_linear( struct draw_context *draw, +                               unsigned prim, +                               struct vertex_header *vertices, +                               unsigned count, +                               unsigned stride ) +{ +   char *verts = (char *)vertices; +   draw->pipeline.verts = verts; +   draw->pipeline.vertex_stride = stride; +   draw->pipeline.vertex_count = count; + +   
pipe_run_linear(draw, prim, vertices, stride, count); + +   draw->pipeline.verts = NULL; +   draw->pipeline.vertex_count = 0; +}  void draw_pipeline_flush( struct draw_context *draw,  diff --git a/src/gallium/auxiliary/draw/draw_pipe.h b/src/gallium/auxiliary/draw/draw_pipe.h index f1cb0891ca..dbad8f98ac 100644 --- a/src/gallium/auxiliary/draw/draw_pipe.h +++ b/src/gallium/auxiliary/draw/draw_pipe.h @@ -116,7 +116,7 @@ dup_vert( struct draw_stage *stage,  {        struct vertex_header *tmp = stage->tmp[idx];     const uint vsize = sizeof(struct vertex_header) -      + stage->draw->num_vs_outputs * 4 * sizeof(float); +      + stage->draw->vs.num_vs_outputs * 4 * sizeof(float);     memcpy(tmp, vert, vsize);     tmp->vertex_id = UNDEFINED_VERTEX_ID;     return tmp; diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c index b1ed8aa24e..fd48b224b4 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c +++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c @@ -653,7 +653,7 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header)     }     /* update vertex attrib info */ -   aaline->tex_slot = draw->num_vs_outputs; +   aaline->tex_slot = draw->vs.num_vs_outputs;     assert(aaline->tex_slot > 0); /* output[0] is vertex pos */     /* advertise the extra post-transformed vertex attribute */ diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c index 122a48660a..97d74ad693 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c +++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c @@ -681,7 +681,7 @@ aapoint_first_point(struct draw_stage *stage, struct prim_header *header)     bind_aapoint_fragment_shader(aapoint);     /* update vertex attrib info */ -   aapoint->tex_slot = draw->num_vs_outputs; +   aapoint->tex_slot = draw->vs.num_vs_outputs;     assert(aapoint->tex_slot > 0); /* output[0] is vertex pos */     draw->extra_vp_outputs.semantic_name 
= TGSI_SEMANTIC_GENERIC; @@ -692,7 +692,7 @@ aapoint_first_point(struct draw_stage *stage, struct prim_header *header)     aapoint->psize_slot = -1;     if (draw->rasterizer->point_size_per_vertex) {        /* find PSIZ vertex output */ -      const struct draw_vertex_shader *vs = draw->vertex_shader; +      const struct draw_vertex_shader *vs = draw->vs.vertex_shader;        uint i;        for (i = 0; i < vs->info.num_outputs; i++) {           if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_PSIZE) { diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c index ce80c94163..c11ed934a4 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_clip.c +++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c @@ -112,7 +112,7 @@ static void interp( const struct clipper *clip,  		    const struct vertex_header *out,   		    const struct vertex_header *in )  { -   const unsigned nr_attrs = clip->stage.draw->num_vs_outputs; +   const unsigned nr_attrs = clip->stage.draw->vs.num_vs_outputs;     unsigned j;     /* Vertex header. @@ -180,7 +180,7 @@ static void emit_poly( struct draw_stage *stage,          header.flags |= edge_last;        if (0) { -         const struct draw_vertex_shader *vs = stage->draw->vertex_shader; +         const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader;           uint j, k;           debug_printf("Clipped tri:\n");           for (j = 0; j < 3; j++) { @@ -425,7 +425,7 @@ clip_init_state( struct draw_stage *stage )     clipper->flat = stage->draw->rasterizer->flatshade ? 
TRUE : FALSE;     if (clipper->flat) { -      const struct draw_vertex_shader *vs = stage->draw->vertex_shader; +      const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader;        uint i;        clipper->num_color_attribs = 0; diff --git a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c index 09b68c4559..21a9c3b77f 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c +++ b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c @@ -159,7 +159,7 @@ static void flatshade_line_1( struct draw_stage *stage,  static void flatshade_init_state( struct draw_stage *stage )  {     struct flat_stage *flat = flat_stage(stage); -   const struct draw_vertex_shader *vs = stage->draw->vertex_shader; +   const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader;     uint i;     /* Find which vertex shader outputs are colors, make a list */ diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c index 3cbced362e..9522b79582 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c +++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c @@ -71,7 +71,7 @@ screen_interp( struct draw_context *draw,                 const struct vertex_header *v1 )  {     uint attr; -   for (attr = 0; attr < draw->num_vs_outputs; attr++) { +   for (attr = 0; attr < draw->vs.num_vs_outputs; attr++) {        const float *val0 = v0->data[attr];        const float *val1 = v1->data[attr];        float *newv = dst->data[attr]; @@ -175,6 +175,22 @@ reset_stipple_counter(struct draw_stage *stage)     stage->next->reset_stipple_counter( stage->next );  } +static void +stipple_reset_point(struct draw_stage *stage, struct prim_header *header) +{ +   struct stipple_stage *stipple = stipple_stage(stage); +   stipple->counter = 0; +   stage->next->point(stage->next, header); +} + +static void +stipple_reset_tri(struct draw_stage *stage, struct prim_header *header) +{ +   struct stipple_stage 
*stipple = stipple_stage(stage); +   stipple->counter = 0; +   stage->next->tri(stage->next, header); +} +  static void  stipple_first_line(struct draw_stage *stage,  @@ -220,9 +236,9 @@ struct draw_stage *draw_stipple_stage( struct draw_context *draw )     stipple->stage.draw = draw;     stipple->stage.next = NULL; -   stipple->stage.point = draw_pipe_passthrough_point; +   stipple->stage.point = stipple_reset_point;     stipple->stage.line = stipple_first_line; -   stipple->stage.tri = draw_pipe_passthrough_tri; +   stipple->stage.tri = stipple_reset_tri;     stipple->stage.reset_stipple_counter = reset_stipple_counter;     stipple->stage.flush = stipple_flush;     stipple->stage.destroy = stipple_destroy; diff --git a/src/gallium/auxiliary/draw/draw_pipe_twoside.c b/src/gallium/auxiliary/draw/draw_pipe_twoside.c index 50872fdbe9..3ac825f565 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_twoside.c +++ b/src/gallium/auxiliary/draw/draw_pipe_twoside.c @@ -105,7 +105,7 @@ static void twoside_first_tri( struct draw_stage *stage,  			       struct prim_header *header )  {     struct twoside_stage *twoside = twoside_stage(stage); -   const struct draw_vertex_shader *vs = stage->draw->vertex_shader; +   const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader;     uint i;     twoside->attrib_front0 = 0; diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c index ed08573382..df92e3f2d0 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c +++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c @@ -197,7 +197,7 @@ static void widepoint_first_point( struct draw_stage *stage,     if (draw->rasterizer->point_sprite) {        /* find vertex shader texcoord outputs */ -      const struct draw_vertex_shader *vs = draw->vertex_shader; +      const struct draw_vertex_shader *vs = draw->vs.vertex_shader;        uint i, j = 0;        for (i = 0; i < vs->info.num_outputs; i++) {           if 
(vs->info.output_semantic_name[i] == TGSI_SEMANTIC_GENERIC) { @@ -212,7 +212,7 @@ static void widepoint_first_point( struct draw_stage *stage,     wide->psize_slot = -1;     if (draw->rasterizer->point_size_per_vertex) {        /* find PSIZ vertex output */ -      const struct draw_vertex_shader *vs = draw->vertex_shader; +      const struct draw_vertex_shader *vs = draw->vs.vertex_shader;        uint i;        for (i = 0; i < vs->info.num_outputs; i++) {           if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_PSIZE) { diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index cee58bbf73..c095bf3d7b 100644 --- a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -124,6 +124,7 @@ struct draw_context     struct {        struct {           struct draw_pt_middle_end *fetch_emit; +         struct draw_pt_middle_end *fetch_shade_emit;           struct draw_pt_middle_end *general;        } middle; @@ -154,6 +155,7 @@ struct draw_context           const void *constants;        } user; +      boolean test_fse;     } pt;     struct { @@ -167,13 +169,26 @@ struct draw_context     /* pipe state that we need: */     const struct pipe_rasterizer_state *rasterizer;     struct pipe_viewport_state viewport; +   boolean identity_viewport; -   struct draw_vertex_shader *vertex_shader; +   struct { +      struct draw_vertex_shader *vertex_shader; +      uint num_vs_outputs;  /**< convenience, from vertex_shader */ -   boolean identity_viewport; -   uint num_vs_outputs;  /**< convenience, from vertex_shader */ +      /** TGSI program interpreter runtime state */ +      struct tgsi_exec_machine machine; + +      /* This (and the tgsi_exec_machine struct) probably need to be moved somewhere private. 
+       */ +      struct gallivm_cpu_engine *engine;    + +      struct translate *fetch; +      struct translate_cache *fetch_cache; +      struct translate *emit; +      struct translate_cache *emit_cache; +   } vs;     /* Clip derived state:      */ @@ -190,16 +205,15 @@ struct draw_context     unsigned reduced_prim; -   /** TGSI program interpreter runtime state */ -   struct tgsi_exec_machine machine; - -   /* This (and the tgsi_exec_machine struct) probably need to be moved somewhere private. -    */ -   struct gallivm_cpu_engine *engine;        void *driver_private;  }; +/******************************************************************************* + * Vertex shader code: + */ +boolean draw_vs_init( struct draw_context *draw ); +void draw_vs_destroy( struct draw_context *draw ); @@ -247,6 +261,12 @@ void draw_pipeline_run( struct draw_context *draw,                          const ushort *elts,                          unsigned count ); +void draw_pipeline_run_linear( struct draw_context *draw, +                               unsigned prim, +                               struct vertex_header *vertices, +                               unsigned count, +                               unsigned stride ); +  void draw_pipeline_flush( struct draw_context *draw,  diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c index c9c5d18313..75f44d503e 100644 --- a/src/gallium/auxiliary/draw/draw_pt.c +++ b/src/gallium/auxiliary/draw/draw_pt.c @@ -64,7 +64,7 @@ draw_pt_arrays(struct draw_context *draw,        opt |= PT_PIPELINE;     } -   if (!draw->bypass_clipping) { +   if (!draw->bypass_clipping && !draw->pt.test_fse) {        opt |= PT_CLIPTEST;     } @@ -72,16 +72,18 @@ draw_pt_arrays(struct draw_context *draw,        opt |= PT_SHADE;     } -   if (opt) -      middle = draw->pt.middle.general; -   else + +   if (opt == 0)         middle = draw->pt.middle.fetch_emit; +   else if (opt == PT_SHADE && draw->pt.test_fse) +      middle = 
draw->pt.middle.fetch_shade_emit; +   else +      middle = draw->pt.middle.general;     /* Pick the right frontend      */ -   if (draw->pt.user.elts || -       count >= 256) { +   if (draw->pt.user.elts || (opt & PT_PIPELINE)) {        frontend = draw->pt.front.vcache;     } else {        frontend = draw->pt.front.varray; @@ -102,6 +104,8 @@ draw_pt_arrays(struct draw_context *draw,  boolean draw_pt_init( struct draw_context *draw )  { +   draw->pt.test_fse = GETENV("DRAW_FSE") != NULL; +     draw->pt.front.vcache = draw_pt_vcache( draw );     if (!draw->pt.front.vcache)        return FALSE; @@ -114,6 +118,13 @@ boolean draw_pt_init( struct draw_context *draw )     if (!draw->pt.middle.fetch_emit)        return FALSE; +   if (draw->pt.test_fse) { +      draw->pt.middle.fetch_shade_emit = draw_pt_middle_fse( draw ); +      if (!draw->pt.middle.fetch_shade_emit) +         return FALSE; +   } + +     draw->pt.middle.general = draw_pt_fetch_pipeline_or_emit( draw );     if (!draw->pt.middle.general)        return FALSE; @@ -134,6 +145,11 @@ void draw_pt_destroy( struct draw_context *draw )        draw->pt.middle.fetch_emit = NULL;     } +   if (draw->pt.middle.fetch_shade_emit) { +      draw->pt.middle.fetch_shade_emit->destroy( draw->pt.middle.fetch_shade_emit ); +      draw->pt.middle.fetch_shade_emit = NULL; +   } +     if (draw->pt.front.vcache) {        draw->pt.front.vcache->destroy( draw->pt.front.vcache );        draw->pt.front.vcache = NULL; @@ -147,19 +163,6 @@ void draw_pt_destroy( struct draw_context *draw ) -static unsigned reduced_prim[PIPE_PRIM_POLYGON + 1] = { -   PIPE_PRIM_POINTS, -   PIPE_PRIM_LINES, -   PIPE_PRIM_LINES, -   PIPE_PRIM_LINES, -   PIPE_PRIM_TRIANGLES, -   PIPE_PRIM_TRIANGLES, -   PIPE_PRIM_TRIANGLES, -   PIPE_PRIM_TRIANGLES, -   PIPE_PRIM_TRIANGLES, -   PIPE_PRIM_TRIANGLES -}; -  /**   * Draw vertex arrays @@ -172,9 +175,10 @@ void  draw_arrays(struct draw_context *draw, unsigned prim,              unsigned start, unsigned count)  { -  
 if (reduced_prim[prim] != draw->reduced_prim) { +   unsigned reduced_prim = draw_pt_reduced_prim(prim); +   if (reduced_prim != draw->reduced_prim) {        draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); -      draw->reduced_prim = reduced_prim[prim]; +      draw->reduced_prim = reduced_prim;     }     /* drawing done here: */ diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h index 2dec376cee..e03816ebbc 100644 --- a/src/gallium/auxiliary/draw/draw_pt.h +++ b/src/gallium/auxiliary/draw/draw_pt.h @@ -92,6 +92,10 @@ struct draw_pt_middle_end {                  const ushort *draw_elts,                  unsigned draw_count ); +   void (*run_linear)(struct draw_pt_middle_end *, +                      unsigned start, +                      unsigned count); +     void (*finish)( struct draw_pt_middle_end * );     void (*destroy)( struct draw_pt_middle_end * );  }; @@ -117,6 +121,7 @@ const void *draw_pt_elt_ptr( struct draw_context *draw,  struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw );  struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw); +  /* Middle-ends:   *   * Currently one general-purpose case which can do all possibilities, @@ -128,6 +133,7 @@ struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw);   * vertex_elements.   
*/  struct draw_pt_middle_end *draw_pt_fetch_emit( struct draw_context *draw ); +struct draw_pt_middle_end *draw_pt_middle_fse( struct draw_context *draw );  struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit(struct draw_context *draw); @@ -152,6 +158,13 @@ void draw_pt_emit( struct pt_emit *emit,  		   const ushort *elts,  		   unsigned count ); +void draw_pt_emit_linear( struct pt_emit *emit, +                          const float (*vertex_data)[4], +                          unsigned vertex_count, +                          unsigned stride, +                          unsigned start, +                          unsigned count ); +  void draw_pt_emit_destroy( struct pt_emit *emit );  struct pt_emit *draw_pt_emit_create( struct draw_context *draw ); @@ -170,6 +183,11 @@ void draw_pt_fetch_run( struct pt_fetch *fetch,  			unsigned count,  			char *verts ); +void draw_pt_fetch_run_linear( struct pt_fetch *fetch, +                               unsigned start, +                               unsigned count, +                               char *verts ); +  void draw_pt_fetch_destroy( struct pt_fetch *fetch );  struct pt_fetch *draw_pt_fetch_create( struct draw_context *draw ); @@ -194,4 +212,11 @@ struct pt_post_vs *draw_pt_post_vs_create( struct draw_context *draw );  void draw_pt_post_vs_destroy( struct pt_post_vs *pvs ); +/******************************************************************************* + * Utils:  + */ +void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr); +unsigned draw_pt_reduced_prim(unsigned prim); + +  #endif diff --git a/src/gallium/auxiliary/draw/draw_pt_decompose.h b/src/gallium/auxiliary/draw/draw_pt_decompose.h new file mode 100644 index 0000000000..dccfde99dd --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_decompose.h @@ -0,0 +1,153 @@ + + +static void FUNC( ARGS, +                  unsigned count ) +{ +   LOCAL_VARS; + +   switch (prim) { +   case PIPE_PRIM_POINTS: +      for (i = 0; i < count; i ++) { +	 
POINT( (i + 0) ); +      } +      break; + +   case PIPE_PRIM_LINES: +      for (i = 0; i+1 < count; i += 2) { +         LINE( DRAW_PIPE_RESET_STIPPLE, +               (i + 0), +               (i + 1)); +      } +      break; + +   case PIPE_PRIM_LINE_LOOP: +      if (count >= 2) { +         flags = DRAW_PIPE_RESET_STIPPLE; + +         for (i = 1; i < count; i++, flags = 0) { +            LINE( flags, +                  (i - 1), +                  (i )); +         } + +	 LINE( flags, +               (i - 1), +               (0 )); +      } +      break; + +   case PIPE_PRIM_LINE_STRIP: +      flags = DRAW_PIPE_RESET_STIPPLE; +      for (i = 1; i < count; i++, flags = 0) { +         LINE( flags, +               (i - 1), +               (i )); +      } +      break; + +   case PIPE_PRIM_TRIANGLES: +      for (i = 0; i+2 < count; i += 3) { +         TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, +                   (i + 0), +                   (i + 1), +                   (i + 2 )); +      } +      break; + +   case PIPE_PRIM_TRIANGLE_STRIP: +      if (flatfirst) { +         for (i = 0; i+2 < count; i++) { +            TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, +                      (i + 0), +                      (i + 1 + (i&1)), +                      (i + 2 - (i&1))); +         } +      } +      else { +         for (i = 0; i+2 < count; i++) { +            TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, +                      (i + 0 + (i&1)), +                      (i + 1 - (i&1)), +                      (i + 2 )); +         } +      } +      break; + +   case PIPE_PRIM_TRIANGLE_FAN: +      if (count >= 3) { +         if (flatfirst) { +            for (i = 0; i+2 < count; i++) { +               TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, +                         (i + 1), +                         (i + 2), +                         (0 )); +            } +         } +         else { +            for (i = 
0; i+2 < count; i++) { +               TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, +                         (0), +                         (i + 1), +                         (i + 2 )); +            } +         } +      } +      break; + + +   case PIPE_PRIM_QUADS: +      for (i = 0; i+3 < count; i += 4) { +         QUAD( (i + 0), +               (i + 1), +               (i + 2), +               (i + 3)); +      } +      break; + +   case PIPE_PRIM_QUAD_STRIP: +      for (i = 0; i+3 < count; i += 2) { +         QUAD( (i + 2), +               (i + 0), +               (i + 1), +               (i + 3)); +      } +      break; + +   case PIPE_PRIM_POLYGON: +      { +         /* These bitflags look a little odd because we submit the +          * vertices as (1,2,0) to satisfy flatshade requirements. +          */ +         const unsigned edge_first  = DRAW_PIPE_EDGE_FLAG_2; +         const unsigned edge_middle = DRAW_PIPE_EDGE_FLAG_0; +         const unsigned edge_last   = DRAW_PIPE_EDGE_FLAG_1; + +         flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle; + +	 for (i = 0; i+2 < count; i++, flags = edge_middle) { + +            if (i + 3 == count) +               flags |= edge_last; + +	    TRIANGLE( flags, +                      (i + 1), +                      (i + 2), +                      (0)); +	 } +      } +      break; + +   default: +      assert(0); +      break; +   } + +   FLUSH; +} + + +#undef TRIANGLE +#undef QUAD +#undef POINT +#undef LINE +#undef FUNC diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c index ce3a153f64..cf87cde996 100644 --- a/src/gallium/auxiliary/draw/draw_pt_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_emit.c @@ -40,6 +40,9 @@ struct pt_emit {     struct translate *translate;     struct translate_cache *cache; +   unsigned prim; + +   const struct vertex_info *vinfo;  };  void draw_pt_emit_prepare( struct pt_emit *emit, @@ -51,8 +54,18 @@ void 
draw_pt_emit_prepare( struct pt_emit *emit,     struct translate_key hw_key;     unsigned i;     boolean ok; +    +   /* XXX: need to flush to get prim_vbuf.c to release its allocation??  +    */ +   draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + -   ok = draw->render->set_primitive(draw->render, prim); +   /* XXX: may need to defensively reset this later on as clipping can +    * clobber this state in the render backend. +    */ +   emit->prim = prim; + +   ok = draw->render->set_primitive(draw->render, emit->prim);     if (!ok) {        assert(0);        return; @@ -60,7 +73,7 @@ void draw_pt_emit_prepare( struct pt_emit *emit,     /* Must do this after set_primitive() above:      */ -   vinfo = draw->render->get_vertex_info(draw->render); +   emit->vinfo = vinfo = draw->render->get_vertex_info(draw->render);     /* Translate from pipeline vertices to hw vertices. @@ -100,6 +113,7 @@ void draw_pt_emit_prepare( struct pt_emit *emit,        case EMIT_4UB:  	 output_format = PIPE_FORMAT_B8G8R8A8_UNORM;  	 emit_sz = 4 * sizeof(ubyte); +         break;        default:  	 assert(0);  	 output_format = PIPE_FORMAT_NONE; @@ -144,6 +158,14 @@ void draw_pt_emit( struct pt_emit *emit,      */     draw_do_flush( draw, DRAW_FLUSH_BACKEND ); +   /* XXX: and work out some way to coordinate the render primitive +    * between vbuf.c and here... 
+    */ +   if (!draw->render->set_primitive(draw->render, emit->prim)) { +      assert(0); +      return; +   } +     hw_verts = render->allocate_vertices(render,  					(ushort)translate->key.output_stride,  					(ushort)vertex_count); @@ -178,6 +200,72 @@ void draw_pt_emit( struct pt_emit *emit,  } +void draw_pt_emit_linear(struct pt_emit *emit, +                         const float (*vertex_data)[4], +                         unsigned vertex_count, +                         unsigned stride, +                         unsigned start, +                         unsigned count) +{ +   struct draw_context *draw = emit->draw; +   struct translate *translate = emit->translate; +   struct vbuf_render *render = draw->render; +   void *hw_verts; + +#if 0 +   debug_printf("Linear emit\n"); +#endif +   /* XXX: need to flush to get prim_vbuf.c to release its allocation??  +    */ +   draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + +   /* XXX: and work out some way to coordinate the render primitive +    * between vbuf.c and here... 
+    */ +   if (!draw->render->set_primitive(draw->render, emit->prim)) { +      assert(0); +      return; +   } + +   hw_verts = render->allocate_vertices(render, +					(ushort)translate->key.output_stride, +					(ushort)vertex_count); /* sized for every vertex translate->run() writes below */ +   if (!hw_verts) { +      assert(0); +      return; +   } + +   translate->set_buffer(translate, 0, +			 vertex_data, stride); + +   translate->set_buffer(translate, 1, +			 &draw->rasterizer->point_size, +			 0); + +   translate->run(translate, +                  0, +                  vertex_count, +                  hw_verts); + +   if (0) { +      unsigned i; +      for (i = 0; i < vertex_count; i++) { +         debug_printf("\n\n%s vertex %d:\n", __FUNCTION__, i); +         draw_dump_emitted_vertex( emit->vinfo,  +                                   (const uint8_t *)hw_verts +  +                                   translate->key.output_stride * i ); +      } +   } + + +   render->draw_arrays(render, start, count); + +   render->release_vertices(render, +			    hw_verts, +			    translate->key.output_stride, +			    vertex_count); +} +  struct pt_emit *draw_pt_emit_create( struct draw_context *draw )  {     struct pt_emit *emit = CALLOC_STRUCT(pt_emit); diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch.c b/src/gallium/auxiliary/draw/draw_pt_fetch.c index b96335b789..07f4c99164 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c @@ -166,6 +166,42 @@ void draw_pt_fetch_run( struct pt_fetch *fetch,  } +void draw_pt_fetch_run_linear( struct pt_fetch *fetch, +                               unsigned start, +                               unsigned count, +                               char *verts ) +{ +   struct draw_context *draw = fetch->draw; +   struct translate *translate = fetch->translate; +   unsigned i; + +   for (i = 0; i < draw->pt.nr_vertex_buffers; i++) { +      translate->set_buffer(translate, +			    i, +			    ((char *)draw->pt.user.vbuffer[i] + +			     
draw->pt.vertex_buffer[i].buffer_offset), +			    draw->pt.vertex_buffer[i].pitch ); +   } + +   translate->run( translate, +                   start, +                   count, +                   verts ); + +   /* Edgeflags are hard to fit into a translate program, populate +    * them separately if required.  In the setup above they are +    * defaulted to one, so only need this if there is reason to change +    * that default: +    */ +   if (fetch->need_edgeflags) { +      for (i = 0; i < count; i++) { +         struct vertex_header *vh = (struct vertex_header *)(verts + i * fetch->vertex_size); +         vh->edgeflag = draw_pt_get_edgeflag( draw, start + i ); +      } +   } +} + +  struct pt_fetch *draw_pt_fetch_create( struct draw_context *draw )  {     struct pt_fetch *fetch = CALLOC_STRUCT(pt_fetch); diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c index 4ea7d4359f..a1d041a74f 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c @@ -258,6 +258,59 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,  } +static void fetch_emit_run_linear( struct draw_pt_middle_end *middle, +                                   unsigned start, +                                   unsigned count ) +{ +   struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; +   struct draw_context *draw = feme->draw; +   void *hw_verts; + +   /* XXX: need to flush to get prim_vbuf.c to release its allocation?? +    */ +   draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + +   hw_verts = draw->render->allocate_vertices( draw->render, +                                               (ushort)feme->translate->key.output_stride, +                                               (ushort)count ); +   if (!hw_verts) { +      assert(0); +      return; +   } + +   /* Single routine to fetch vertices and emit HW verts. 
+    */ +   feme->translate->run( feme->translate, +                         start, +                         count, +                         hw_verts ); + +   if (0) { +      unsigned i; +      for (i = 0; i < count; i++) { +         debug_printf("\n\nvertex %d:\n", i); +         draw_dump_emitted_vertex( feme->vinfo, +                                   (const uint8_t *)hw_verts + feme->vinfo->size * 4 * i ); +      } +   } + +   /* XXX: Draw arrays path to avoid re-emitting index list again and +    * again. +    */ +   draw->render->draw_arrays( draw->render, +                              0, /*start*/ +                              count ); + +   /* Done -- that was easy, wasn't it: +    */ +   draw->render->release_vertices( draw->render, +                                   hw_verts, +                                   feme->translate->key.output_stride, +                                   count ); + +} + +  static void fetch_emit_finish( struct draw_pt_middle_end *middle )  { @@ -287,10 +340,11 @@ struct draw_pt_middle_end *draw_pt_fetch_emit( struct draw_context *draw )        return NULL;     } -   fetch_emit->base.prepare = fetch_emit_prepare; -   fetch_emit->base.run     = fetch_emit_run; -   fetch_emit->base.finish  = fetch_emit_finish; -   fetch_emit->base.destroy = fetch_emit_destroy; +   fetch_emit->base.prepare    = fetch_emit_prepare; +   fetch_emit->base.run        = fetch_emit_run; +   fetch_emit->base.run_linear = fetch_emit_run_linear; +   fetch_emit->base.finish     = fetch_emit_finish; +   fetch_emit->base.destroy    = fetch_emit_destroy;     fetch_emit->draw = draw; diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c new file mode 100644 index 0000000000..729c7db999 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c @@ -0,0 +1,344 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten 
Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#include "pipe/p_util.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_vbuf.h" +#include "draw/draw_vertex.h" +#include "draw/draw_pt.h" +#include "draw/draw_vs.h" + +#include "translate/translate.h" + +struct fetch_shade_emit; + + +/* Prototype fetch, shade, emit-hw-verts all in one go. 
+ */ +struct fetch_shade_emit { +   struct draw_pt_middle_end base; +   struct draw_context *draw; + + +   /* Temporaries: +    */ +   const float *constants; +   unsigned pitch[PIPE_MAX_ATTRIBS]; +   const ubyte *src[PIPE_MAX_ATTRIBS]; +   unsigned prim; + +   struct draw_vs_varient_key key; +   struct draw_vs_varient *active; + + +   const struct vertex_info *vinfo; +}; + + + +			        +static void fse_prepare( struct draw_pt_middle_end *middle, +                         unsigned prim,  +                         unsigned opt ) +{ +   struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; +   struct draw_context *draw = fse->draw; +   unsigned num_vs_inputs = draw->vs.vertex_shader->info.num_inputs; +   const struct vertex_info *vinfo; +   unsigned i; +    + +   if (!draw->render->set_primitive( draw->render,  +                                     prim )) { +      assert(0); +      return; +   } + +   /* Must do this after set_primitive() above: +    */ +   fse->vinfo = vinfo = draw->render->get_vertex_info(draw->render); +    + + +   fse->key.output_stride = vinfo->size * 4; +   fse->key.nr_outputs = vinfo->num_attribs; +   fse->key.nr_inputs = num_vs_inputs; + +   fse->key.nr_elements = MAX2(fse->key.nr_outputs,     /* outputs - translate to hw format */ +                               fse->key.nr_inputs);     /* inputs - fetch from api format */ + +   fse->key.viewport = !draw->identity_viewport; +   fse->key.clip = !draw->bypass_clipping; +   fse->key.pad = 0; + +   memset(fse->key.element, 0,  +          fse->key.nr_elements * sizeof(fse->key.element[0])); + +   for (i = 0; i < num_vs_inputs; i++) { +      const struct pipe_vertex_element *src = &draw->pt.vertex_element[i]; +      fse->key.element[i].in.format = src->src_format; + +      /* Consider ignoring these, ie make generated programs +       * independent of this state: +       */ +      fse->key.element[i].in.buffer = src->vertex_buffer_index; +      fse->key.element[i].in.offset = 
src->src_offset; +   } +    + +   { +      unsigned dst_offset = 0; + +      for (i = 0; i < vinfo->num_attribs; i++) { +         unsigned emit_sz = 0; + +         switch (vinfo->emit[i]) { +         case EMIT_4F: +            emit_sz = 4 * sizeof(float); +            break; +         case EMIT_3F: +            emit_sz = 3 * sizeof(float); +            break; +         case EMIT_2F: +            emit_sz = 2 * sizeof(float); +            break; +         case EMIT_1F: +            emit_sz = 1 * sizeof(float); +            break; +         case EMIT_1F_PSIZE: +            emit_sz = 1 * sizeof(float); +            break; +         case EMIT_4UB: +            emit_sz = 4 * sizeof(ubyte); +            break; +         default: +            assert(0); +            break; +         } + +         /* The elements in the key correspond to vertex shader output +          * numbers, not to positions in the hw vertex description -- +          * that's handled by the output_offset field. +          */ +         fse->key.element[i].out.format = vinfo->emit[i]; +         fse->key.element[i].out.vs_output = vinfo->src_index[i]; +         fse->key.element[i].out.offset = dst_offset; +       +         dst_offset += emit_sz; +         assert(fse->key.output_stride >= dst_offset); +      } +   } + + +   /* Would normally look up a vertex shader and peruse its list of +    * varients somehow.  We omitted that step and put all the +    * hardcoded "shaders" into an array.  We're just making the +    * assumption that this happens to be a matching shader...  ie +    * you're running isosurf, aren't you? 
+    */ +   fse->active = draw_vs_lookup_varient( draw->vs.vertex_shader,  +                                         &fse->key ); + +   if (!fse->active) { +      assert(0); +      return ; +   } + +   /* Now set buffer pointers: +    */ +   for (i = 0; i < num_vs_inputs; i++) { +      unsigned buf = draw->pt.vertex_element[i].vertex_buffer_index; + +      fse->active->set_input( fse->active,  +                              i,  +                               +                              ((const ubyte *) draw->pt.user.vbuffer[buf] +  +                               draw->pt.vertex_buffer[buf].buffer_offset), +                               +                              draw->pt.vertex_buffer[buf].pitch ); +   } + +   fse->active->set_constants( fse->active, +                               (const float (*)[4])draw->pt.user.constants ); + +   fse->active->set_viewport( fse->active, +                              &draw->viewport ); + +   //return TRUE; +} + + + + + + + +static void fse_run_linear( struct draw_pt_middle_end *middle,  +                            unsigned start,  +                            unsigned count ) +{ +   struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; +   struct draw_context *draw = fse->draw; +   unsigned alloc_count = align(count, 4); +   char *hw_verts; + +   /* XXX: need to flush to get prim_vbuf.c to release its allocation?? +    */ +   draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + +   hw_verts = draw->render->allocate_vertices( draw->render, +                                               (ushort)fse->key.output_stride, +                                               (ushort)alloc_count ); + +   if (!hw_verts) { +      assert(0); +      return; +   } + +   /* Single routine to fetch vertices, run shader and emit HW verts. +    * Clipping is done elsewhere -- either by the API or on hardware, +    * or for some other reason not required... 
+    */ +   fse->active->run_linear( fse->active,  +                            start, count, +                            hw_verts ); + +   /* Draw arrays path to avoid re-emitting index list again and +    * again. +    */ +   draw->render->draw_arrays( draw->render, +                              0, +                              count ); +    +   if (0) { +      unsigned i; +      for (i = 0; i < count; i++) { +         debug_printf("\n\n%s vertex %d: (stride %d, offset %d)\n", __FUNCTION__, i, +                      fse->key.output_stride, +                      fse->key.output_stride * i); + +         draw_dump_emitted_vertex( fse->vinfo,  +                                   (const uint8_t *)hw_verts + fse->key.output_stride * i ); +      } +   } + + +   draw->render->release_vertices( draw->render,  +				   hw_verts,  +				   fse->key.output_stride,  +				   count ); +} + + +static void +fse_run(struct draw_pt_middle_end *middle, +        const unsigned *fetch_elts, +        unsigned fetch_count, +        const ushort *draw_elts, +        unsigned draw_count ) +{ +   struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; +   struct draw_context *draw = fse->draw; +   unsigned alloc_count = align(fetch_count, 4); +   void *hw_verts; +    +   /* XXX: need to flush to get prim_vbuf.c to release its allocation??  +    */ +   draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + +   hw_verts = draw->render->allocate_vertices( draw->render, +                                               (ushort)fse->key.output_stride, +                                               (ushort)alloc_count ); +   if (!hw_verts) { +      assert(0); +      return; +   } +          +					 +   /* Single routine to fetch vertices, run shader and emit HW verts. 
+    */ +   fse->active->run_elts( fse->active,  +                          fetch_elts, +                          fetch_count, +                          hw_verts ); + +   draw->render->draw( draw->render,  +                       draw_elts,  +                       draw_count ); + +   if (0) { +      unsigned i; +      for (i = 0; i < fetch_count; i++) { +         debug_printf("\n\n%s vertex %d:\n", __FUNCTION__, i); +         draw_dump_emitted_vertex( fse->vinfo,  +                                   (const uint8_t *)hw_verts +  +                                   fse->key.output_stride * i ); +      } +   } + + +   draw->render->release_vertices( draw->render,  +                                   hw_verts,  +                                   fse->key.output_stride,  +                                   fetch_count ); + +} + + +static void fse_finish( struct draw_pt_middle_end *middle ) +{ +} + + +static void +fse_destroy( struct draw_pt_middle_end *middle )  +{ +   FREE(middle); +} + +struct draw_pt_middle_end *draw_pt_middle_fse( struct draw_context *draw ) +{ +   struct fetch_shade_emit *fse = CALLOC_STRUCT(fetch_shade_emit); +   if (!fse) +      return NULL; + +   fse->base.prepare = fse_prepare; +   fse->base.run = fse_run; +   fse->base.run_linear = fse_run_linear; +   fse->base.finish = fse_finish; +   fse->base.destroy = fse_destroy; +   fse->draw = draw; + +   return &fse->base; +} diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c index 4ec20493c4..06718779a5 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c @@ -55,7 +55,7 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,  {     struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;     struct draw_context *draw = fpme->draw; -   struct draw_vertex_shader *vs = draw->vertex_shader; +   
struct draw_vertex_shader *vs = draw->vs.vertex_shader;     /* Add one to num_outputs because the pipeline occasionally tags on      * an additional texcoord, eg for AA lines. @@ -107,7 +107,7 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,  {     struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;     struct draw_context *draw = fpme->draw; -   struct draw_vertex_shader *shader = draw->vertex_shader; +   struct draw_vertex_shader *shader = draw->vs.vertex_shader;     unsigned opt = fpme->opt;     unsigned alloc_count = align_int( fetch_count, 4 ); @@ -162,7 +162,7 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,                           fpme->vertex_size,                           draw_elts,                           draw_count ); -   }  +   }     else {        draw_pt_emit( fpme->emit,  		    (const float (*)[4])pipeline_verts->data, @@ -177,6 +177,79 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,  } +static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle, +                                       unsigned start, +                                       unsigned count) +{ +   struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; +   struct draw_context *draw = fpme->draw; +   struct draw_vertex_shader *shader = draw->vs.vertex_shader; +   unsigned opt = fpme->opt; +   unsigned alloc_count = align_int( count, 4 ); + +   struct vertex_header *pipeline_verts = +      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count); + +   if (!pipeline_verts) { +      /* Not much we can do here - just skip the rendering. 
+       */ +      assert(0); +      return; +   } + +   /* Fetch into our vertex buffer +    */ +   draw_pt_fetch_run_linear( fpme->fetch, +                             start, +                             count, +                             (char *)pipeline_verts ); + +   /* Run the shader, note that this overwrites the data[] parts of +    * the pipeline verts.  If there is no shader, ie a bypass shader, +    * then the inputs == outputs, and are already in the correct +    * place. +    */ +   if (opt & PT_SHADE) +   { +      shader->run_linear(shader, +			 (const float (*)[4])pipeline_verts->data, +			 (      float (*)[4])pipeline_verts->data, +			 (const float (*)[4])draw->pt.user.constants, +			 count, +			 fpme->vertex_size, +			 fpme->vertex_size); +   } + +   if (draw_pt_post_vs_run( fpme->post_vs, +			    pipeline_verts, +			    count, +			    fpme->vertex_size )) +   { +      opt |= PT_PIPELINE; +   } + +   /* Do we need to run the pipeline? +    */ +   if (opt & PT_PIPELINE) { +      draw_pipeline_run_linear( fpme->draw, +                                fpme->prim, +                                pipeline_verts, +                                count, +                                fpme->vertex_size); +   } +   else { +      draw_pt_emit_linear( fpme->emit, +                           (const float (*)[4])pipeline_verts->data, +                           count, +                           fpme->vertex_size, +                           0, /*start*/ +                           count ); +   } + +   FREE(pipeline_verts); +} + +  static void fetch_pipeline_finish( struct draw_pt_middle_end *middle )  { @@ -206,10 +279,11 @@ struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit( struct draw_context *     if (!fpme)        goto fail; -   fpme->base.prepare = fetch_pipeline_prepare; -   fpme->base.run     = fetch_pipeline_run; -   fpme->base.finish  = fetch_pipeline_finish; -   fpme->base.destroy = fetch_pipeline_destroy; +   fpme->base.prepare        = 
fetch_pipeline_prepare; +   fpme->base.run            = fetch_pipeline_run; +   fpme->base.run_linear     = fetch_pipeline_linear_run; +   fpme->base.finish         = fetch_pipeline_finish; +   fpme->base.destroy        = fetch_pipeline_destroy;     fpme->draw = draw; diff --git a/src/gallium/auxiliary/draw/draw_pt_util.c b/src/gallium/auxiliary/draw/draw_pt_util.c new file mode 100644 index 0000000000..32c8a9632c --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_util.c @@ -0,0 +1,103 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#include "pipe/p_util.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_pt.h" + +void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr) +{ +   switch (prim) { +   case PIPE_PRIM_POINTS: +      *first = 1; +      *incr = 1; +      break; +   case PIPE_PRIM_LINES: +      *first = 2; +      *incr = 2; +      break; +   case PIPE_PRIM_LINE_STRIP: +   case PIPE_PRIM_LINE_LOOP: +      *first = 2; +      *incr = 1; +      break; +   case PIPE_PRIM_TRIANGLES: +      *first = 3; +      *incr = 3; +      break; +   case PIPE_PRIM_TRIANGLE_STRIP: +   case PIPE_PRIM_TRIANGLE_FAN: +   case PIPE_PRIM_POLYGON: +      *first = 3; +      *incr = 1; +      break; +   case PIPE_PRIM_QUADS: +      *first = 4; +      *incr = 4; +      break; +   case PIPE_PRIM_QUAD_STRIP: +      *first = 4; +      *incr = 2; +      break; +   default: +      assert(0); +      *first = 0; +      *incr = 1;		/* set to one so that count % incr works */ +      break; +   } +} + + +unsigned draw_pt_reduced_prim(unsigned prim) +{ +   switch (prim) { +   case PIPE_PRIM_POINTS: +      return PIPE_PRIM_POINTS; +   case PIPE_PRIM_LINES: +   case PIPE_PRIM_LINE_STRIP: +   case PIPE_PRIM_LINE_LOOP: +      return PIPE_PRIM_LINES; +   case PIPE_PRIM_TRIANGLES: +   case PIPE_PRIM_TRIANGLE_STRIP: +   case PIPE_PRIM_TRIANGLE_FAN: +   case PIPE_PRIM_POLYGON: +   case PIPE_PRIM_QUADS: +   case PIPE_PRIM_QUAD_STRIP: +      return PIPE_PRIM_TRIANGLES; +   default: +      assert(0); +      return PIPE_PRIM_POINTS; +   } +} + + diff --git a/src/gallium/auxiliary/draw/draw_pt_varray.c b/src/gallium/auxiliary/draw/draw_pt_varray.c index 355093f945..260f28f284 100644 --- a/src/gallium/auxiliary/draw/draw_pt_varray.c +++ b/src/gallium/auxiliary/draw/draw_pt_varray.c @@ -43,6 +43,8 @@ struct varray_frontend {     
unsigned draw_count;     unsigned fetch_count; +   unsigned fetch_start; +     struct draw_pt_middle_end *middle;     unsigned input_prim; @@ -56,6 +58,11 @@ static void varray_flush(struct varray_frontend *varray)        debug_printf("FLUSH fc = %d, dc = %d\n",                     varray->fetch_count,                     varray->draw_count); +      debug_printf("\telt0 = %d, eltx = %d, draw0 = %d, drawx = %d\n", +                   varray->fetch_elts[0], +                   varray->fetch_elts[varray->fetch_count-1], +                   varray->draw_elts[0], +                   varray->draw_elts[varray->draw_count-1]);  #endif        varray->middle->run(varray->middle,                            varray->fetch_elts, @@ -68,20 +75,43 @@ static void varray_flush(struct varray_frontend *varray)     varray->draw_count = 0;  } -#if 0 -static void varray_check_flush(struct varray_frontend *varray) +static void varray_flush_linear(struct varray_frontend *varray, +                                unsigned start, unsigned count)  { -   if (varray->draw_count + 6 >= DRAW_MAX/* || -       varray->fetch_count + 4 >= FETCH_MAX*/) { -      varray_flush(varray); +   if (count) { +#if 0 +      debug_printf("FLUSH LINEAR start = %d, count = %d\n", +                   start, +                   count); +#endif +      assert(varray->middle->run_linear); +      varray->middle->run_linear(varray->middle, start, count);     }  } + +static INLINE void fetch_init(struct varray_frontend *varray, +                              unsigned count) +{ +   unsigned idx; +#if 0 +      debug_printf("FETCH INIT c = %d, fs = %d\n", +                   count, +                   varray->fetch_start);  #endif +   for (idx = 0; idx < count; ++idx) { +      varray->fetch_elts[idx] = varray->fetch_start + idx; +   } +   varray->fetch_start += idx; +   varray->fetch_count = idx; +} + + +  static INLINE void add_draw_el(struct varray_frontend *varray, -                               int idx, ushort flags) +    
                           int idx)  { -   varray->draw_elts[varray->draw_count++] = idx | flags; +   varray->draw_elts[varray->draw_count++] = idx;  } @@ -90,106 +120,52 @@ static INLINE void varray_triangle( struct varray_frontend *varray,                                      unsigned i1,                                      unsigned i2 )  { -   add_draw_el(varray, i0, 0); -   add_draw_el(varray, i1, 0); -   add_draw_el(varray, i2, 0); -} - -static INLINE void varray_triangle_flags( struct varray_frontend *varray, -                                          ushort flags, -                                          unsigned i0, -                                          unsigned i1, -                                          unsigned i2 ) -{ -   add_draw_el(varray, i0, flags); -   add_draw_el(varray, i1, 0); -   add_draw_el(varray, i2, 0); +   add_draw_el(varray, i0); +   add_draw_el(varray, i1); +   add_draw_el(varray, i2);  }  static INLINE void varray_line( struct varray_frontend *varray,                                  unsigned i0,                                  unsigned i1 )  { -   add_draw_el(varray, i0, 0); -   add_draw_el(varray, i1, 0); -} - - -static INLINE void varray_line_flags( struct varray_frontend *varray, -                                      ushort flags, -                                      unsigned i0, -                                      unsigned i1 ) -{ -   add_draw_el(varray, i0, flags); -   add_draw_el(varray, i1, 0); +   add_draw_el(varray, i0); +   add_draw_el(varray, i1);  }  static INLINE void varray_point( struct varray_frontend *varray,                                   unsigned i0 )  { -   add_draw_el(varray, i0, 0); -} - -static INLINE void varray_quad( struct varray_frontend *varray, -                                unsigned i0, -                                unsigned i1, -                                unsigned i2, -                                unsigned i3 ) -{ -   varray_triangle( varray, i0, i1, i3 ); -   
varray_triangle( varray, i1, i2, i3 ); -} - -static INLINE void varray_ef_quad( struct varray_frontend *varray, -                                   unsigned i0, -                                   unsigned i1, -                                   unsigned i2, -                                   unsigned i3 ) -{ -   const unsigned omitEdge1 = DRAW_PIPE_EDGE_FLAG_0 | DRAW_PIPE_EDGE_FLAG_2; -   const unsigned omitEdge2 = DRAW_PIPE_EDGE_FLAG_0 | DRAW_PIPE_EDGE_FLAG_1; - -   varray_triangle_flags( varray, -                          DRAW_PIPE_RESET_STIPPLE | omitEdge1, -                          i0, i1, i3 ); - -   varray_triangle_flags( varray, -                          omitEdge2, -                          i1, i2, i3 ); +   add_draw_el(varray, i0);  } -/* At least for now, we're back to using a template include file for - * this.  The two paths aren't too different though - it may be - * possible to reunify them. - */ -#define TRIANGLE(vc,flags,i0,i1,i2) varray_triangle_flags(vc,flags,i0,i1,i2) -#define QUAD(vc,i0,i1,i2,i3)        varray_ef_quad(vc,i0,i1,i2,i3) -#define LINE(vc,flags,i0,i1)        varray_line_flags(vc,flags,i0,i1) -#define POINT(vc,i0)                varray_point(vc,i0) -#define FUNC varray_run_extras -#include "draw_pt_varray_tmp.h" -#define TRIANGLE(vc,flags,i0,i1,i2) varray_triangle(vc,i0,i1,i2) -#define QUAD(vc,i0,i1,i2,i3)        varray_quad(vc,i0,i1,i2,i3) -#define LINE(vc,flags,i0,i1)        varray_line(vc,i0,i1) +#if 0 +#define TRIANGLE(flags,i0,i1,i2)       varray_triangle(varray,i0,i1,i2) +#define LINE(flags,i0,i1)              varray_line(varray,i0,i1) +#define POINT(i0)                      varray_point(varray,i0) +#define FUNC varray_decompose +#include "draw_pt_decompose.h" +#else +#define TRIANGLE(vc,i0,i1,i2)       varray_triangle(vc,i0,i1,i2) +#define LINE(vc,i0,i1)              varray_line(vc,i0,i1)  #define POINT(vc,i0)                varray_point(vc,i0)  #define FUNC varray_run -#include "draw_pt_varray_tmp.h" - - +#include 
"draw_pt_varray_tmp_linear.h" +#endif -static unsigned reduced_prim[PIPE_PRIM_POLYGON + 1] = { +static unsigned decompose_prim[PIPE_PRIM_POLYGON + 1] = {     PIPE_PRIM_POINTS,     PIPE_PRIM_LINES, -   PIPE_PRIM_LINES, -   PIPE_PRIM_LINES, +   PIPE_PRIM_LINES,             /* decomposed LINELOOP */ +   PIPE_PRIM_LINE_STRIP,     PIPE_PRIM_TRIANGLES, -   PIPE_PRIM_TRIANGLES, -   PIPE_PRIM_TRIANGLES, -   PIPE_PRIM_TRIANGLES, -   PIPE_PRIM_TRIANGLES, -   PIPE_PRIM_TRIANGLES +   PIPE_PRIM_TRIANGLE_STRIP, +   PIPE_PRIM_TRIANGLES,         /* decomposed TRI_FAN */ +   PIPE_PRIM_QUADS, +   PIPE_PRIM_QUAD_STRIP, +   PIPE_PRIM_TRIANGLES          /* decomposed POLYGON */  }; @@ -201,17 +177,10 @@ static void varray_prepare(struct draw_pt_front_end *frontend,  {     struct varray_frontend *varray = (struct varray_frontend *)frontend; -   if (opt & PT_PIPELINE) -   { -      varray->base.run = varray_run_extras; -   }  -   else  -   { -      varray->base.run = varray_run; -   } +   varray->base.run = varray_run;     varray->input_prim = prim; -   varray->output_prim = reduced_prim[prim]; +   varray->output_prim = decompose_prim[prim];     varray->middle = middle;     middle->prepare(middle, varray->output_prim, opt); diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h index b9a319b253..6979f6b544 100644 --- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h +++ b/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h @@ -10,32 +10,44 @@ static void FUNC(struct draw_pt_front_end *frontend,     boolean flatfirst = (draw->rasterizer->flatshade &&                          draw->rasterizer->flatshade_first); -   unsigned i, flags; +   unsigned i, j, flags; +   unsigned first, incr; + +   varray->fetch_start = start; + +   draw_pt_split_prim(varray->input_prim, &first, &incr);  #if 0 -   debug_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count); -#endif -#if 0 -   debug_printf("INPUT PRIM = %d (start = %d, count = %d)\n", 
varray->input_prim, +   debug_printf("%s (%d) %d/%d\n", __FUNCTION__, +                varray->input_prim,                  start, count);  #endif -   for (i = 0; i < count; ++i) { -      varray->fetch_elts[i] = start + i; -   } -   varray->fetch_count = count; -     switch (varray->input_prim) {     case PIPE_PRIM_POINTS: -      for (i = 0; i < count; i ++) { -         POINT(varray, i + 0); +      for (j = 0; j + first <= count; j += i) { +         unsigned end = MIN2(FETCH_MAX, count - j); +         end -= (end % incr); +         for (i = 0; i < end; i++) { +            POINT(varray, i + 0); +         } +         i = end; +         fetch_init(varray, end); +         varray_flush(varray);        }        break;     case PIPE_PRIM_LINES: -      for (i = 0; i+1 < count; i += 2) { -         LINE(varray, DRAW_PIPE_RESET_STIPPLE, -              i + 0, i + 1); +      for (j = 0; j + first <= count; j += i) { +         unsigned end = MIN2(FETCH_MAX, count - j); +         end -= (end % incr); +         for (i = 0; i+1 < end; i += 2) { +            LINE(varray, DRAW_PIPE_RESET_STIPPLE, +                 i + 0, i + 1); +         } +         i = end; +         fetch_init(varray, end); +         varray_flush(varray);        }        break; @@ -43,38 +55,81 @@ static void FUNC(struct draw_pt_front_end *frontend,        if (count >= 2) {           flags = DRAW_PIPE_RESET_STIPPLE; -         for (i = 1; i < count; i++, flags = 0) { -            LINE(varray, flags, i - 1, i); +         for (j = 0; j + first <= count; j += i) { +            unsigned end = MIN2(FETCH_MAX, count - j); +            end -= (end % incr); +            for (i = 1; i < end; i++, flags = 0) { +               LINE(varray, flags, i - 1, i); +            } +            LINE(varray, flags, i - 1, 0); +            i = end; +            fetch_init(varray, end); +            varray_flush(varray);           } -         LINE(varray, flags, i - 1, 0);        }        break;     case PIPE_PRIM_LINE_STRIP:        flags 
= DRAW_PIPE_RESET_STIPPLE; -      for (i = 1; i < count; i++, flags = 0) { -         LINE(varray, flags, i - 1, i); +      for (j = 0; j + first <= count; j += i) { +         unsigned end = MIN2(FETCH_MAX, count - j); +         end -= (end % incr); +         for (i = 1; i < end; i++, flags = 0) { +            LINE(varray, flags, i - 1, i); +         } +         i = end; +         fetch_init(varray, end); +         varray_flush(varray);        }        break;     case PIPE_PRIM_TRIANGLES: -      for (i = 0; i+2 < count; i += 3) { -         TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, -                  i + 0, i + 1, i + 2); +      for (j = 0; j + first <= count; j += i) { +         unsigned end = MIN2(FETCH_MAX, count - j); +         end -= (end % incr); +         for (i = 0; i+2 < end; i += 3) { +            TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, +                     i + 0, i + 1, i + 2); +         } +         i = end; +         fetch_init(varray, end); +         varray_flush(varray);        }        break;     case PIPE_PRIM_TRIANGLE_STRIP:        if (flatfirst) { -         for (i = 0; i+2 < count; i++) { -            TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, -                     i + 0, i + 1 + (i&1), i + 2 - (i&1)); +         for (j = 0; j + first <= count; j += i) { +            unsigned end = MIN2(FETCH_MAX, count - j); +            end -= (end % incr); +            for (i = 0; i+2 < end; i++) { +               TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, +                        i + 0, i + 1 + (i&1), i + 2 - (i&1)); +            } +            i = end; +            fetch_init(varray, end); +            varray_flush(varray); +            if (j + first + i <= count) { +               varray->fetch_start -= 2; +               i -= 2; +            }           }        }        else { -         for (i = 0; i+2 < count; i++) { -            TRIANGLE(varray, 
DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, -                     i + 0 + (i&1), i + 1 - (i&1), i + 2); +         for (j = 0; j + first <= count; j += i) { +            unsigned end = MIN2(FETCH_MAX, count - j); +            end -= (end  % incr); +            for (i = 0; i + 2 < end; i++) { +               TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, +                        i + 0 + (i&1), i + 1 - (i&1), i + 2); +            } +            i = end; +            fetch_init(varray, end); +            varray_flush(varray); +            if (j + first + i <= count) { +               varray->fetch_start -= 2; +               i -= 2; +            }           }        }        break; @@ -83,51 +138,89 @@ static void FUNC(struct draw_pt_front_end *frontend,        if (count >= 3) {           if (flatfirst) {              flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; -            for (i = 0; i+2 < count; i++) { -               TRIANGLE(varray, flags, i + 1, i + 2, 0); +            for (j = 0; j + first <= count; j += i) { +               unsigned end = MIN2(FETCH_MAX, count - j); +               end -= (end % incr); +               for (i = 0; i+2 < end; i++) { +                  TRIANGLE(varray, flags, i + 1, i + 2, 0); +               } +               i = end; +               fetch_init(varray, end); +               varray_flush(varray);              }           }           else {              flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; -            for (i = 0; i+2 < count; i++) { -               TRIANGLE(varray, flags, 0, i + 1, i + 2); +            for (j = 0; j + first <= count; j += i) { +               unsigned end = MIN2(FETCH_MAX, count - j); +               end -= (end % incr); +               for (i = 0; i+2 < end; i++) { +                  TRIANGLE(varray, flags, 0, i + 1, i + 2); +               } +               i = end; +               fetch_init(varray, end); +               varray_flush(varray);      
        }           }        }        break;     case PIPE_PRIM_QUADS: -      for (i = 0; i+3 < count; i += 4) { -         QUAD(varray, i + 0, i + 1, i + 2, i + 3); +      for (j = 0; j + first <= count; j += i) { +         unsigned end = MIN2(FETCH_MAX, count - j); +         end -= (end % incr); +         for (i = 0; i+3 < end; i += 4) { +            QUAD(varray, i + 0, i + 1, i + 2, i + 3); +         } +         i = end; +         fetch_init(varray, end); +         varray_flush(varray);        }        break;     case PIPE_PRIM_QUAD_STRIP: -      for (i = 0; i+3 < count; i += 2) { -         QUAD(varray, i + 2, i + 0, i + 1, i + 3); +      for (j = 0; j + first <= count; j += i) { +         unsigned end = MIN2(FETCH_MAX, count - j); +         end -= (end % incr); +         for (i = 0; i+3 < end; i += 2) { +            QUAD(varray, i + 2, i + 0, i + 1, i + 3); +         } +         i = end; +         fetch_init(varray, end); +         varray_flush(varray); +         if (j + first + i <= count) { +            varray->fetch_start -= 2; +            i -= 2; +         }        }        break;     case PIPE_PRIM_POLYGON:     { -         /* These bitflags look a little odd because we submit the -          * vertices as (1,2,0) to satisfy flatshade requirements. -          */ -         const unsigned edge_first  = DRAW_PIPE_EDGE_FLAG_2; -         const unsigned edge_middle = DRAW_PIPE_EDGE_FLAG_0; -         const unsigned edge_last   = DRAW_PIPE_EDGE_FLAG_1; - -         flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle; +      /* These bitflags look a little odd because we submit the +       * vertices as (1,2,0) to satisfy flatshade requirements. 
+       */ +      const unsigned edge_first  = DRAW_PIPE_EDGE_FLAG_2; +      const unsigned edge_middle = DRAW_PIPE_EDGE_FLAG_0; +      const unsigned edge_last   = DRAW_PIPE_EDGE_FLAG_1; -	 for (i = 0; i+2 < count; i++, flags = edge_middle) { +      flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle; +      for (j = 0; j + first <= count; j += i) { +         unsigned end = MIN2(FETCH_MAX, count - j); +         end -= (end % incr); +         for (i = 0; i+2 < end; i++, flags = edge_middle) {              if (i + 3 == count)                 flags |= edge_last; -	    TRIANGLE(varray, flags, i + 1, i + 2, 0); -	 } +            TRIANGLE(varray, flags, i + 1, i + 2, 0); +         } +         i = end; +         fetch_init(varray, end); +         varray_flush(varray);        } -      break; +   } +   break;     default:        assert(0); diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h new file mode 100644 index 0000000000..114ed371a0 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h @@ -0,0 +1,94 @@ +static unsigned trim( unsigned count, unsigned first, unsigned incr ) +{ +   return count - (count - first) % incr;  +} + +static void FUNC(struct draw_pt_front_end *frontend, +                 pt_elt_func get_elt, +                 const void *elts, +                 unsigned count) +{ +   struct varray_frontend *varray = (struct varray_frontend *)frontend; +   unsigned start = (unsigned)elts; + +   unsigned i, j; +   unsigned first, incr; + +   varray->fetch_start = start; + +   draw_pt_split_prim(varray->input_prim, &first, &incr); +    +   /* Sanitize primitive length: +    */ +   count = trim(count, first, incr);  +   if (count < first) +      return; + +#if 0 +   debug_printf("%s (%d) %d/%d\n", __FUNCTION__, +                varray->input_prim, +                start, count); +#endif + +   switch (varray->input_prim) { +   case PIPE_PRIM_POINTS: +   case 
PIPE_PRIM_LINES: +   case PIPE_PRIM_TRIANGLES: +   case PIPE_PRIM_LINE_STRIP: +   case PIPE_PRIM_TRIANGLE_STRIP: +   case PIPE_PRIM_QUADS: +   case PIPE_PRIM_QUAD_STRIP: +      for (j = 0; j < count;) { +         unsigned remaining = count - j; +         unsigned nr = trim( MIN2(FETCH_MAX, remaining), first, incr ); +         varray_flush_linear(varray, start + j, nr); +         j += nr; +         if (nr != remaining)  +            j -= (first - incr); +      } +      break; + +   case PIPE_PRIM_LINE_LOOP: +      if (count >= 2) { +         for (j = 0; j + first <= count; j += i) { +            unsigned end = MIN2(FETCH_MAX, count - j); +            end -= (end % incr); +            for (i = 1; i < end; i++) { +               LINE(varray, i - 1, i); +            } +            LINE(varray, i - 1, 0); +            i = end; +            fetch_init(varray, end); +            varray_flush(varray); +         } +      } +      break; + + +   case PIPE_PRIM_POLYGON: +   case PIPE_PRIM_TRIANGLE_FAN: +      for (j = 0; j + first <= count; j += i) { +         unsigned end = MIN2(FETCH_MAX, count - j); +         end -= (end % incr); +         for (i = 2; i < end; i++) { +            TRIANGLE(varray, 0, i - 1, i); +         } +         i = end; +         fetch_init(varray, end); +         varray_flush(varray); +      } +      break; + +   default: +      assert(0); +      break; +   } + +   varray_flush(varray); +} + +#undef TRIANGLE +#undef QUAD +#undef POINT +#undef LINE +#undef FUNC diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c index 6b3fb1406b..96e02fbf3a 100644 --- a/src/gallium/auxiliary/draw/draw_pt_vcache.c +++ b/src/gallium/auxiliary/draw/draw_pt_vcache.c @@ -171,15 +171,15 @@ static void vcache_ef_quad( struct vcache_frontend *vcache,                              unsigned i2,                              unsigned i3 )  { -   const unsigned omitEdge1 = DRAW_PIPE_EDGE_FLAG_0 | DRAW_PIPE_EDGE_FLAG_2; -   const 
unsigned omitEdge2 = DRAW_PIPE_EDGE_FLAG_0 | DRAW_PIPE_EDGE_FLAG_1; - -   vcache_triangle_flags( vcache,  -                          DRAW_PIPE_RESET_STIPPLE | omitEdge1,  +   vcache_triangle_flags( vcache, +                          ( DRAW_PIPE_RESET_STIPPLE | +                            DRAW_PIPE_EDGE_FLAG_0 | +                            DRAW_PIPE_EDGE_FLAG_2 ),                            i0, i1, i3 ); -   vcache_triangle_flags( vcache,  -                          omitEdge2,  +   vcache_triangle_flags( vcache, +                          ( DRAW_PIPE_EDGE_FLAG_0 | +                            DRAW_PIPE_EDGE_FLAG_1 ),                            i1, i2, i3 );  } @@ -204,19 +204,6 @@ static void vcache_ef_quad( struct vcache_frontend *vcache, -static unsigned reduced_prim[PIPE_PRIM_POLYGON + 1] = { -   PIPE_PRIM_POINTS, -   PIPE_PRIM_LINES, -   PIPE_PRIM_LINES, -   PIPE_PRIM_LINES, -   PIPE_PRIM_TRIANGLES, -   PIPE_PRIM_TRIANGLES, -   PIPE_PRIM_TRIANGLES, -   PIPE_PRIM_TRIANGLES, -   PIPE_PRIM_TRIANGLES, -   PIPE_PRIM_TRIANGLES -}; -  static void vcache_prepare( struct draw_pt_front_end *frontend, @@ -236,7 +223,7 @@ static void vcache_prepare( struct draw_pt_front_end *frontend,     }     vcache->input_prim = prim; -   vcache->output_prim = reduced_prim[prim]; +   vcache->output_prim = draw_pt_reduced_prim(prim);     vcache->middle = middle;     middle->prepare( middle, vcache->output_prim, opt ); diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h index 6d8bac5138..16c65c4317 100644 --- a/src/gallium/auxiliary/draw/draw_vertex.h +++ b/src/gallium/auxiliary/draw/draw_vertex.h @@ -109,4 +109,25 @@ extern void draw_compute_vertex_size(struct vertex_info *vinfo);  void draw_dump_emitted_vertex(const struct vertex_info *vinfo,                                 const uint8_t *data); + +static INLINE unsigned draw_translate_vinfo_format(unsigned format ) +{ +   switch (format) { +   case EMIT_1F: +   case EMIT_1F_PSIZE: +      
return PIPE_FORMAT_R32_FLOAT; +   case EMIT_2F: +      return PIPE_FORMAT_R32G32_FLOAT; +   case EMIT_3F: +      return PIPE_FORMAT_R32G32B32_FLOAT; +   case EMIT_4F: +      return PIPE_FORMAT_R32G32B32A32_FLOAT; +   case EMIT_4UB: +      return PIPE_FORMAT_R8G8B8A8_UNORM; +   default: +      return PIPE_FORMAT_NONE; +   } +} + +  #endif /* DRAW_VERTEX_H */ diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c index 03fe00a951..9b899d404e 100644 --- a/src/gallium/auxiliary/draw/draw_vs.c +++ b/src/gallium/auxiliary/draw/draw_vs.c @@ -36,6 +36,8 @@  #include "draw_private.h"  #include "draw_context.h"  #include "draw_vs.h" +#include "translate/translate.h" +#include "translate/translate_cache.h" @@ -66,13 +68,13 @@ draw_bind_vertex_shader(struct draw_context *draw,     if (dvs)      { -      draw->vertex_shader = dvs; -      draw->num_vs_outputs = dvs->info.num_outputs; +      draw->vs.vertex_shader = dvs; +      draw->vs.num_vs_outputs = dvs->info.num_outputs;        dvs->prepare( dvs, draw );     }     else { -      draw->vertex_shader = NULL; -      draw->num_vs_outputs = 0; +      draw->vs.vertex_shader = NULL; +      draw->vs.num_vs_outputs = 0;     }  } @@ -83,3 +85,109 @@ draw_delete_vertex_shader(struct draw_context *draw,  {     dvs->delete( dvs );  } + + + +boolean  +draw_vs_init( struct draw_context *draw ) +{ +   tgsi_exec_machine_init(&draw->vs.machine); + +   /* FIXME: give this machine thing a proper constructor: +    */ +   draw->vs.machine.Inputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16); +   if (!draw->vs.machine.Inputs) +      return FALSE; + +   draw->vs.machine.Outputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16); +   if (!draw->vs.machine.Outputs) +      return FALSE; + +   draw->vs.emit_cache = translate_cache_create(); +   if (!draw->vs.emit_cache)  +      return FALSE; +       +   draw->vs.fetch_cache = translate_cache_create(); +   if 
(!draw->vs.fetch_cache)  +      return FALSE; +       +   return TRUE; +} + +void +draw_vs_destroy( struct draw_context *draw ) +{ +   if (draw->vs.machine.Inputs) +      align_free(draw->vs.machine.Inputs); + +   if (draw->vs.machine.Outputs) +      align_free(draw->vs.machine.Outputs); + +   if (draw->vs.fetch_cache) +      translate_cache_destroy(draw->vs.fetch_cache); + +   if (draw->vs.emit_cache) +      translate_cache_destroy(draw->vs.emit_cache); + +   tgsi_exec_machine_free_data(&draw->vs.machine); + +} + + +struct draw_vs_varient * +draw_vs_lookup_varient( struct draw_vertex_shader *vs, +                        const struct draw_vs_varient_key *key ) +{ +   struct draw_vs_varient *varient; +   unsigned i; + +   /* Lookup existing varient:  +    */ +   for (i = 0; i < vs->nr_varients; i++) +      if (draw_vs_varient_key_compare(key, &vs->varient[i]->key) == 0) +         return vs->varient[i]; +    +   /* Else have to create a new one:  +    */ +   varient = vs->create_varient( vs, key ); +   if (varient == NULL) +      return NULL; + +   /* Add it to our list:  +    */ +   assert(vs->nr_varients < Elements(vs->varient)); +   vs->varient[vs->nr_varients++] = varient; + +   /* Done  +    */ +   return varient; +} + + +struct translate * +draw_vs_get_fetch( struct draw_context *draw, +                   struct translate_key *key ) +{ +   if (!draw->vs.fetch || +       translate_key_compare(&draw->vs.fetch->key, key) != 0)  +   { +      translate_key_sanitize(key); +      draw->vs.fetch = translate_cache_find(draw->vs.fetch_cache, key); +   } +    +   return draw->vs.fetch; +} + +struct translate * +draw_vs_get_emit( struct draw_context *draw, +                  struct translate_key *key ) +{ +   if (!draw->vs.emit || +       translate_key_compare(&draw->vs.emit->key, key) != 0)  +   { +      translate_key_sanitize(key); +      draw->vs.emit = translate_cache_find(draw->vs.emit_cache, key); +   } +    +   return draw->vs.emit; +} diff --git 
a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h index f9772b83b8..01171bc23d 100644 --- a/src/gallium/auxiliary/draw/draw_vs.h +++ b/src/gallium/auxiliary/draw/draw_vs.h @@ -38,10 +38,84 @@  struct draw_context;  struct pipe_shader_state; +struct draw_varient_input  +{ +   enum pipe_format format; +   unsigned buffer; +   unsigned offset;  +}; + +struct draw_varient_output +{ +   enum pipe_format format;     /* output format */ +   unsigned vs_output:8;        /* which vertex shader output is this? */ +   unsigned offset:24;          /* offset into output vertex */ +}; + +struct draw_varient_element { +   struct draw_varient_input in; +   struct draw_varient_output out; +}; + +struct draw_vs_varient_key { +   unsigned output_stride; +   unsigned nr_elements:8;      /* max2(nr_inputs, nr_outputs) */ +   unsigned nr_inputs:8; +   unsigned nr_outputs:8; +   unsigned viewport:1; +   unsigned clip:1; +   unsigned pad:5; +   struct draw_varient_element element[PIPE_MAX_ATTRIBS]; +}; + +struct draw_vs_varient; + +typedef void (PIPE_CDECL *vsv_run_elts_func)( struct draw_vs_varient *, +                                              const unsigned *elts, +                                              unsigned count, +                                              void *output_buffer); + +typedef void (PIPE_CDECL *vsv_run_linear_func)( struct draw_vs_varient *, +                                                unsigned start, +                                                unsigned count, +                                                void *output_buffer); + + +struct draw_vs_varient { +   struct draw_vs_varient_key key; + +   struct draw_vertex_shader *vs; + +   void (*set_input)( struct draw_vs_varient *, +                      unsigned i, +                      const void *ptr, +                      unsigned stride ); + +   void (*set_constants)( struct draw_vs_varient *, +                          const float (*constants)[4] ); + +   void 
(*set_viewport)( struct draw_vs_varient *, +                         const struct pipe_viewport_state * ); + +   void (PIPE_CDECL *run_linear)( struct draw_vs_varient *shader, +                                  unsigned start, +                                  unsigned count, +                                  void *output_buffer ); + +   void (PIPE_CDECL *run_elts)( struct draw_vs_varient *shader, +                                const unsigned *elts, +                                unsigned count, +                                void *output_buffer ); + +   void (*destroy)( struct draw_vs_varient * ); +}; + +  /**   * Private version of the compiled vertex_shader   */  struct draw_vertex_shader { +   struct draw_context *draw;     /* This member will disappear shortly:      */ @@ -49,6 +123,14 @@ struct draw_vertex_shader {     struct tgsi_shader_info info; +   /*  +    */ +   struct draw_vs_varient *varient[16]; +   unsigned nr_varients; +   struct draw_vs_varient *(*create_varient)( struct draw_vertex_shader *shader, +                                              const struct draw_vs_varient_key *key ); + +     void (*prepare)( struct draw_vertex_shader *shader,  		    struct draw_context *draw ); @@ -68,6 +150,15 @@ struct draw_vertex_shader {  }; +struct draw_vs_varient * +draw_vs_lookup_varient( struct draw_vertex_shader *base, +                        const struct draw_vs_varient_key *key ); + + +/******************************************************************************** + * Internal functions: + */ +  struct draw_vertex_shader *  draw_create_vs_exec(struct draw_context *draw,  		    const struct pipe_shader_state *templ); @@ -81,7 +172,52 @@ draw_create_vs_llvm(struct draw_context *draw,  		    const struct pipe_shader_state *templ); + +struct draw_vs_varient_key; +struct draw_vertex_shader; + +struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs, +                                                 const struct 
draw_vs_varient_key *key ); + + + +/******************************************************************************** + * Helpers for vs implementations that don't do their own fetch/emit varients. + * Means these can be shared between shaders. + */ +struct translate; +struct translate_key; + +struct translate *draw_vs_get_fetch( struct draw_context *draw, +                                     struct translate_key *key ); + + +struct translate *draw_vs_get_emit( struct draw_context *draw, +                                    struct translate_key *key ); + +struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs, +                                                 const struct draw_vs_varient_key *key ); + + + +static INLINE int draw_vs_varient_keysize( const struct draw_vs_varient_key *key ) +{ +   return 2 * sizeof(int) + key->nr_elements * sizeof(struct draw_varient_element); +} + +static INLINE int draw_vs_varient_key_compare( const struct draw_vs_varient_key *a, +                                         const struct draw_vs_varient_key *b ) +{ +   int keysize = draw_vs_varient_keysize(a); +   return memcmp(a, b, keysize); +} + + + + +  #define MAX_TGSI_VERTICES 4 +  #endif diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c new file mode 100644 index 0000000000..d3770b2c53 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -0,0 +1,2266 @@ +/* + * Mesa 3-D graphics library + * Version:  6.3 + * + * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code + * using the rtasm runtime assembler.  
Based on the old + * t_vb_arb_program_sse.c + */ + + +#include "pipe/p_util.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/util/tgsi_parse.h" +#include "tgsi/util/tgsi_util.h" +#include "tgsi/exec/tgsi_exec.h" +#include "tgsi/util/tgsi_dump.h" + +#include "draw_vs.h" +#include "draw_vs_aos.h" + +#include "rtasm/rtasm_x86sse.h" + +#ifdef PIPE_ARCH_X86 +#define DISASSEM 0 + +static const char *files[] = +{ +   "NULL", +   "CONST", +   "IN", +   "OUT", +   "TEMP", +   "SAMP", +   "ADDR", +   "IMM", +   "INTERNAL", +}; + +static INLINE boolean eq( struct x86_reg a, +			    struct x86_reg b ) +{ +   return (a.file == b.file && +	   a.idx == b.idx && +	   a.mod == b.mod && +	   a.disp == b.disp); +} +       + +static struct x86_reg get_reg_ptr(struct aos_compilation *cp, +                                  unsigned file, +				  unsigned idx ) +{ +   struct x86_reg ptr = cp->machine_EDX; + +   switch (file) { +   case TGSI_FILE_INPUT: +      return x86_make_disp(ptr, Offset(struct aos_machine, input[idx])); + +   case TGSI_FILE_OUTPUT: +      return x86_make_disp(ptr, Offset(struct aos_machine, output[idx])); + +   case TGSI_FILE_TEMPORARY: +      return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx])); + +   case TGSI_FILE_IMMEDIATE: +      return x86_make_disp(ptr, Offset(struct aos_machine, immediate[idx])); + +   case TGSI_FILE_CONSTANT:        +      return x86_make_disp(ptr, Offset(struct aos_machine, constant[idx])); + +   case AOS_FILE_INTERNAL: +      return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx])); + +   default: +      ERROR(cp, "unknown reg file"); +      return x86_make_reg(0,0); +   } +} +		 + + +#define X87_CW_EXCEPTION_INV_OP       (1<<0) +#define X87_CW_EXCEPTION_DENORM_OP    (1<<1) +#define X87_CW_EXCEPTION_ZERO_DIVIDE  (1<<2) +#define X87_CW_EXCEPTION_OVERFLOW     (1<<3) +#define X87_CW_EXCEPTION_UNDERFLOW    (1<<4) +#define X87_CW_EXCEPTION_PRECISION    (1<<5) +#define X87_CW_PRECISION_SINGLE       (0<<8) +#define 
X87_CW_PRECISION_RESERVED     (1<<8) +#define X87_CW_PRECISION_DOUBLE       (2<<8) +#define X87_CW_PRECISION_DOUBLE_EXT   (3<<8) +#define X87_CW_PRECISION_MASK         (3<<8) +#define X87_CW_ROUND_NEAREST          (0<<10) +#define X87_CW_ROUND_DOWN             (1<<10) +#define X87_CW_ROUND_UP               (2<<10) +#define X87_CW_ROUND_ZERO             (3<<10) +#define X87_CW_ROUND_MASK             (3<<10) +#define X87_CW_INFINITY               (1<<12) + +static void do_populate_lut( struct shine_tab *tab, +                             float unclamped_exponent ) +{ +   const float epsilon = 1.0F / 256.0F;     +   float exponent = CLAMP(unclamped_exponent, -(128.0F - epsilon), (128.0F - epsilon)); +   unsigned i; + +   tab->exponent = unclamped_exponent; /* for later comparison */ +    +   tab->values[0] = 0; +   if (exponent == 0) { +      for (i = 1; i < 258; i++) { +         tab->values[i] = 1.0; +      }       +   } +   else { +      for (i = 1; i < 258; i++) { +         tab->values[i] = powf((float)i * epsilon, exponent); +      } +   } +} + +static void init_internals( struct aos_machine *machine ) +{ +   unsigned i; +   float inv = 1.0f/255.0f; +   float f255 = 255.0f; + +   ASSIGN_4V(machine->internal[IMM_SWZ],       1.0f,  -1.0f,  0.0f, 1.0f); +   *(unsigned *)&machine->internal[IMM_SWZ][3] = 0xffffffff; + +   ASSIGN_4V(machine->internal[IMM_ONES],      1.0f,  1.0f,  1.0f,  1.0f); +   ASSIGN_4V(machine->internal[IMM_NEGS],     -1.0f, -1.0f, -1.0f, -1.0f); +   ASSIGN_4V(machine->internal[IMM_IDENTITY],  0.0f,  0.0f,  0.0f,  1.0f); +   ASSIGN_4V(machine->internal[IMM_INV_255],   inv,   inv,   inv,   inv); +   ASSIGN_4V(machine->internal[IMM_255],       f255,  f255,  f255,  f255); +   ASSIGN_4V(machine->internal[IMM_RSQ],       -.5f,  1.5f,  0.0f,  0.0f); + + +   machine->fpu_rnd_nearest = (X87_CW_EXCEPTION_INV_OP | +                               X87_CW_EXCEPTION_DENORM_OP | +                               X87_CW_EXCEPTION_ZERO_DIVIDE | +                      
         X87_CW_EXCEPTION_OVERFLOW | +                               X87_CW_EXCEPTION_UNDERFLOW | +                               X87_CW_EXCEPTION_PRECISION | +                               (1<<6) | +                               X87_CW_ROUND_NEAREST | +                               X87_CW_PRECISION_DOUBLE_EXT); + +   assert(machine->fpu_rnd_nearest == 0x37f); +                                +   machine->fpu_rnd_neg_inf = (X87_CW_EXCEPTION_INV_OP | +                               X87_CW_EXCEPTION_DENORM_OP | +                               X87_CW_EXCEPTION_ZERO_DIVIDE | +                               X87_CW_EXCEPTION_OVERFLOW | +                               X87_CW_EXCEPTION_UNDERFLOW | +                               X87_CW_EXCEPTION_PRECISION | +                               (1<<6) | +                               X87_CW_ROUND_DOWN | +                               X87_CW_PRECISION_DOUBLE_EXT); + +   for (i = 0; i < MAX_SHINE_TAB; i++) +      do_populate_lut( &machine->shine_tab[i], 1.0f ); +} + + +static void spill( struct aos_compilation *cp, unsigned idx ) +{ +   if (!cp->xmm[idx].dirty || +       (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */ +        cp->xmm[idx].file != TGSI_FILE_OUTPUT && +        cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) { +      ERROR(cp, "invalid spill"); +      return; +   } +   else { +      struct x86_reg oldval = get_reg_ptr(cp, +                                          cp->xmm[idx].file, +                                          cp->xmm[idx].idx); +      +      if (0) debug_printf("\nspill %s[%d]",  +                          files[cp->xmm[idx].file], +                          cp->xmm[idx].idx); +  +      assert(cp->xmm[idx].dirty); +      sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx)); +      cp->xmm[idx].dirty = 0; +   } +} + + +static struct x86_reg get_xmm_writable( struct aos_compilation *cp, +                                        struct x86_reg reg ) +{ +   if 
(reg.file != file_XMM || +       cp->xmm[reg.idx].file != TGSI_FILE_NULL) +   { +      struct x86_reg tmp = aos_get_xmm_reg(cp); +      sse_movaps(cp->func, tmp, reg); +      reg = tmp; +   } + +   cp->xmm[reg.idx].last_used = cp->insn_counter; +   return reg; +} + +static struct x86_reg get_xmm( struct aos_compilation *cp, +                               struct x86_reg reg ) +{ +   if (reg.file != file_XMM)  +   { +      struct x86_reg tmp = aos_get_xmm_reg(cp); +      sse_movaps(cp->func, tmp, reg); +      reg = tmp; +   } + +   cp->xmm[reg.idx].last_used = cp->insn_counter; +   return reg; +} + + +/* Allocate an empty xmm register, either as a temporary or later to + * "adopt" as a shader reg. + */ +struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ) +{ +   unsigned i; +   unsigned oldest = 0; +   boolean found = FALSE; + +   for (i = 0; i < 8; i++)  +      if (cp->xmm[i].last_used != cp->insn_counter && +          cp->xmm[i].file == TGSI_FILE_NULL) { +	 oldest = i; +         found = TRUE; +      } + +   if (!found) { +      for (i = 0; i < 8; i++)  +         if (cp->xmm[i].last_used < cp->xmm[oldest].last_used) +            oldest = i; +   } + +   /* Need to write out the old value? +    */ +   if (cp->xmm[oldest].dirty)  +      spill(cp, oldest); + +   assert(cp->xmm[oldest].last_used != cp->insn_counter); + +   cp->xmm[oldest].file = TGSI_FILE_NULL; +   cp->xmm[oldest].idx = 0; +   cp->xmm[oldest].dirty = 0; +   cp->xmm[oldest].last_used = cp->insn_counter; +   return x86_make_reg(file_XMM, oldest); +} + +void aos_release_xmm_reg( struct aos_compilation *cp, +                          unsigned idx ) +{ +   cp->xmm[idx].file = TGSI_FILE_NULL; +   cp->xmm[idx].idx = 0; +   cp->xmm[idx].dirty = 0; +   cp->xmm[idx].last_used = 0; +} + + + +      +/* Mark an xmm reg as holding the current copy of a shader reg. 
+ */ +void aos_adopt_xmm_reg( struct aos_compilation *cp, +                        struct x86_reg reg, +                        unsigned file, +                        unsigned idx, +                        unsigned dirty ) +{ +   unsigned i; + +   if (reg.file != file_XMM) { +      assert(0); +      return; +   } + + +   /* If any xmm reg thinks it holds this shader reg, break the +    * illusion. +    */ +   for (i = 0; i < 8; i++) { +      if (cp->xmm[i].file == file &&  +          cp->xmm[i].idx == idx)  +      { +         /* If an xmm reg is already holding this shader reg, take into account its +          * dirty flag... +          */ +         dirty |= cp->xmm[i].dirty; +         aos_release_xmm_reg(cp, i); +      } +   } + +   cp->xmm[reg.idx].file = file; +   cp->xmm[reg.idx].idx = idx; +   cp->xmm[reg.idx].dirty = dirty; +   cp->xmm[reg.idx].last_used = cp->insn_counter; +} + + +/* Return a pointer to the in-memory copy of the reg, making sure it is uptodate. + */ +static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp,  +                                              unsigned file, +                                              unsigned idx ) +{ +   unsigned i; + +   /* Ensure the in-memory copy of this reg is up-to-date +    */ +   for (i = 0; i < 8; i++) { +      if (cp->xmm[i].file == file &&  +          cp->xmm[i].idx == idx && +          cp->xmm[i].dirty) { +         spill(cp, i); +      } +   } + +   return get_reg_ptr( cp, file, idx ); +} + + +/* As above, but return a pointer.  Note - this pointer may alias + * those returned by get_arg_ptr(). + */ +static struct x86_reg get_dst_ptr( struct aos_compilation *cp,  +                                   const struct tgsi_full_dst_register *dst ) +{ +   unsigned file = dst->DstRegister.File; +   unsigned idx = dst->DstRegister.Index; +   unsigned i; +    + +   /* Ensure in-memory copy of this reg is up-to-date and invalidate +    * any xmm copies. 
+    */ +   for (i = 0; i < 8; i++) { +      if (cp->xmm[i].file == file && +          cp->xmm[i].idx == idx) +      { +         if (cp->xmm[i].dirty)  +            spill(cp, i); +          +         aos_release_xmm_reg(cp, i); +      } +   } + +   return get_reg_ptr( cp, file, idx ); +} + + + + + +/* Return an XMM reg if the argument is resident, otherwise return a + * base+offset pointer to the saved value. + */ +struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,  +                                   unsigned file, +                                   unsigned idx ) +{ +   unsigned i; + +   for (i = 0; i < 8; i++) { +      if (cp->xmm[i].file == file && +	  cp->xmm[i].idx  == idx)  +      { +	 cp->xmm[i].last_used = cp->insn_counter; +	 return x86_make_reg(file_XMM, i); +      } +   } + +   /* If not found in the XMM register file, return an indirect +    * reference to the in-memory copy: +    */ +   return get_reg_ptr( cp, file, idx ); +} + + + +static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp,  +                                              unsigned file, +                                              unsigned idx ) +{ +   struct x86_reg reg = get_xmm( cp, +                                 aos_get_shader_reg( cp, file, idx ) ); + +   aos_adopt_xmm_reg( cp, +                      reg, +                      file, +                      idx, +                      FALSE ); +    +   return reg; +} + + + +struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp, +                                     unsigned imm ) +{ +   return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm ); +} + + +struct x86_reg aos_get_internal( struct aos_compilation *cp, +                                 unsigned imm ) +{ +   return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm ); +} + + + + + +/* Emulate pshufd insn in regular SSE, if necessary: + */ +static void emit_pshufd( struct aos_compilation *cp, +			 struct x86_reg dst, +			 struct 
x86_reg arg0, +			 ubyte shuf ) +{ +   if (cp->have_sse2) { +      sse2_pshufd(cp->func, dst, arg0, shuf); +   } +   else { +      if (!eq(dst, arg0))  +	 sse_movaps(cp->func, dst, arg0); + +      sse_shufps(cp->func, dst, dst, shuf); +   } +} + +/* load masks (pack into negs??) + * pshufd - shuffle according to writemask + * and - result, mask + * nand - dest, mask + * or - dest, result + */ +static boolean mask_write( struct aos_compilation *cp, +                           struct x86_reg dst, +                           struct x86_reg result, +                           unsigned mask ) +{ +   struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ); +   struct x86_reg tmp = aos_get_xmm_reg(cp); +    +   emit_pshufd(cp, tmp, imm_swz,  +               SHUF((mask & 1) ? 2 : 3, +                    (mask & 2) ? 2 : 3, +                    (mask & 4) ? 2 : 3, +                    (mask & 8) ? 2 : 3)); + +   sse_andps(cp->func, dst, tmp); +   sse_andnps(cp->func, tmp, result); +   sse_orps(cp->func, dst, tmp); + +   aos_release_xmm_reg(cp, tmp.idx); +   return TRUE; +} + + + + +/* Helper for writemask: + */ +static boolean emit_shuf_copy2( struct aos_compilation *cp, +				  struct x86_reg dst, +				  struct x86_reg arg0, +				  struct x86_reg arg1, +				  ubyte shuf ) +{ +   struct x86_reg tmp = aos_get_xmm_reg(cp); + +   emit_pshufd(cp, dst, arg1, shuf); +   emit_pshufd(cp, tmp, arg0, shuf); +   sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W)); +   emit_pshufd(cp, dst, dst, shuf); + +   aos_release_xmm_reg(cp, tmp.idx); +   return TRUE; +} + + + +#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6)) + + +/* Locate a source register and perform any required (simple) swizzle.   + *  + * Just fail on complex swizzles at this point. 
+ */ +static struct x86_reg fetch_src( struct aos_compilation *cp,  +                                 const struct tgsi_full_src_register *src )  +{ +   struct x86_reg arg0 = aos_get_shader_reg(cp,  +                                            src->SrcRegister.File,  +                                            src->SrcRegister.Index); +   unsigned i; +   unsigned swz = 0; +   unsigned negs = 0; +   unsigned abs = 0; + +   for (i = 0; i < 4; i++) { +      unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, i ); +      unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i ); + +      switch (swizzle) { +      case TGSI_EXTSWIZZLE_ZERO: +      case TGSI_EXTSWIZZLE_ONE: +         ERROR(cp, "not supporting full swizzles yet in tgsi_aos_sse2"); +         break; + +      default: +         swz |= (swizzle & 0x3) << (i * 2); +         break; +      } + +      switch (neg) { +      case TGSI_UTIL_SIGN_TOGGLE: +         negs |= (1<<i); +         break; +          +      case TGSI_UTIL_SIGN_KEEP: +         break; + +      case TGSI_UTIL_SIGN_CLEAR: +         abs |= (1<<i); +         break; + +      default: +         ERROR(cp, "unsupported sign-mode"); +         break; +      } +   } + +   if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) { +      struct x86_reg dst = aos_get_xmm_reg(cp); + +      if (swz != SSE_SWIZZLE_NOOP) { +         emit_pshufd(cp, dst, arg0, swz); +         arg0 = dst; +      } + +      if (negs && negs != 0xf) { +         struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ); +         struct x86_reg tmp = aos_get_xmm_reg(cp); + +         /* Load 1,-1,0,0 +          * Use neg as arg to pshufd +          * Multiply +          */ +         emit_pshufd(cp, tmp, imm_swz,  +                     SHUF((negs & 1) ? 1 : 0, +                          (negs & 2) ? 1 : 0, +                          (negs & 4) ? 1 : 0, +                          (negs & 8) ? 
1 : 0)); +         sse_mulps(cp->func, dst, arg0); + +         aos_release_xmm_reg(cp, tmp.idx); +         arg0 = dst; +      } +      else if (negs) { +         struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS); +         sse_mulps(cp->func, dst, imm_negs); +         arg0 = dst; +      } + + +      if (abs && abs != 0xf) { +         ERROR(cp, "unsupported partial abs"); +      } +      else if (abs) { +         struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); +         struct x86_reg tmp = aos_get_xmm_reg(cp); + +         sse_movaps(cp->func, tmp, arg0); +         sse_mulps(cp->func, tmp, neg); +         sse_maxps(cp->func, dst, arg0); + +         aos_release_xmm_reg(cp, tmp.idx); +         arg0 = dst; +      } +   } +       +   return arg0; +} + +static void x87_fld_src( struct aos_compilation *cp,  +                         const struct tgsi_full_src_register *src, +                         unsigned channel )  +{ +   struct x86_reg arg0 = aos_get_shader_reg_ptr(cp,  +                                                src->SrcRegister.File,  +                                                src->SrcRegister.Index); + +   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, channel ); +   unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel ); + +   switch (swizzle) { +   case TGSI_EXTSWIZZLE_ZERO: +      x87_fldz( cp->func ); +      break; + +   case TGSI_EXTSWIZZLE_ONE: +      x87_fld1( cp->func ); +      break; + +   default: +      x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) ); +      break; +   } +    + +   switch (neg) { +   case TGSI_UTIL_SIGN_TOGGLE: +      /* Flip the sign: +       */ +      x87_fchs( cp->func ); +      break; +          +   case TGSI_UTIL_SIGN_KEEP: +      break; + +   case TGSI_UTIL_SIGN_CLEAR: +      x87_fabs( cp->func ); +      break; + +   case TGSI_UTIL_SIGN_SET: +      x87_fabs( cp->func ); +      x87_fchs( cp->func ); +      break; + +   default: +      
ERROR(cp, "unsupported sign-mode"); +      break; +   } +} + + + + + + +/* Used to implement write masking.  This and most of the other instructions + * here would be easier to implement if there had been a translation + * to a 2 argument format (dst/arg0, arg1) at the shader level before + * attempting to translate to x86/sse code. + */ +static void store_dest( struct aos_compilation *cp,  +                        const struct tgsi_full_dst_register *reg, +                        struct x86_reg result ) +{ +   struct x86_reg dst; + +   switch (reg->DstRegister.WriteMask) { +   case 0: +      return; +    +   case TGSI_WRITEMASK_XYZW: +      aos_adopt_xmm_reg(cp,  +                        get_xmm_writable(cp, result),  +                        reg->DstRegister.File, +                        reg->DstRegister.Index, +                        TRUE); +      return; +   default:  +      break; +   } + +   dst = aos_get_shader_reg_xmm(cp,  +                                reg->DstRegister.File, +                                reg->DstRegister.Index); + +   switch (reg->DstRegister.WriteMask) { +   case TGSI_WRITEMASK_X: +      sse_movss(cp->func, dst, get_xmm(cp, result)); +      break; +       +   case TGSI_WRITEMASK_ZW: +      sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W)); +      break; + +   case TGSI_WRITEMASK_XY:  +      result = get_xmm_writable(cp, result); +      sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W)); +      dst = result; +      break; + +   case TGSI_WRITEMASK_YZW:  +      result = get_xmm_writable(cp, result); +      sse_movss(cp->func, result, dst); +      dst = result; +      break; + +   default: +      mask_write(cp, dst, result, reg->DstRegister.WriteMask); +      break; +   } + +   aos_adopt_xmm_reg(cp,  +                     dst,  +                     reg->DstRegister.File, +                     reg->DstRegister.Index, +                     TRUE); + +} + +static void inject_scalar( struct aos_compilation *cp, +           
                struct x86_reg dst, +                           struct x86_reg result, +                           unsigned swizzle ) +{ +   sse_shufps(cp->func, dst, dst, swizzle); +   sse_movss(cp->func, dst, result); +   sse_shufps(cp->func, dst, dst, swizzle); +} + + +static void store_scalar_dest( struct aos_compilation *cp,  +                               const struct tgsi_full_dst_register *reg, +                               struct x86_reg result ) +{ +   unsigned writemask = reg->DstRegister.WriteMask; +   struct x86_reg dst; + +   if (writemask != TGSI_WRITEMASK_X && +       writemask != TGSI_WRITEMASK_Y && +       writemask != TGSI_WRITEMASK_Z && +       writemask != TGSI_WRITEMASK_W && +       writemask != 0)  +   { +      result = get_xmm_writable(cp, result); /* already true, right? */ +      sse_shufps(cp->func, result, result, SHUF(X,X,X,X)); +      store_dest(cp, reg, result); +      return; +   } + +   result = get_xmm(cp, result); +   dst = aos_get_shader_reg_xmm(cp,  +                                reg->DstRegister.File, +                                reg->DstRegister.Index); + + + +   switch (reg->DstRegister.WriteMask) { +   case TGSI_WRITEMASK_X: +      sse_movss(cp->func, dst, result); +      break; + +   case TGSI_WRITEMASK_Y: +      inject_scalar(cp, dst, result, SHUF(Y, X, Z, W)); +      break; + +   case TGSI_WRITEMASK_Z: +      inject_scalar(cp, dst, result, SHUF(Z, Y, X, W)); +      break; + +   case TGSI_WRITEMASK_W: +      inject_scalar(cp, dst, result, SHUF(W, Y, Z, X)); +      break; + +   default: +      break; +   } + +   aos_adopt_xmm_reg(cp,  +                     dst,  +                     reg->DstRegister.File, +                     reg->DstRegister.Index, +                     TRUE); +} +    + + +static void x87_fst_or_nop( struct x86_function *func, +                            unsigned writemask, +                            unsigned channel, +                            struct x86_reg ptr ) +{ +   assert(ptr.file == 
file_REG32); +   if (writemask & (1<<channel))  +      x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) ); +} + +static void x87_fstp_or_pop( struct x86_function *func, +                             unsigned writemask, +                             unsigned channel, +                             struct x86_reg ptr ) +{ +   assert(ptr.file == file_REG32); +   if (writemask & (1<<channel))  +      x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) ); +   else +      x87_fstp( func, x86_make_reg( file_x87, 0 )); +} + + + +/*  + */ +static void x87_fstp_dest4( struct aos_compilation *cp, +                            const struct tgsi_full_dst_register *dst ) +{ +   struct x86_reg ptr = get_dst_ptr(cp, dst);  +   unsigned writemask = dst->DstRegister.WriteMask; + +   x87_fst_or_nop(cp->func, writemask, 0, ptr); +   x87_fst_or_nop(cp->func, writemask, 1, ptr); +   x87_fst_or_nop(cp->func, writemask, 2, ptr); +   x87_fstp_or_pop(cp->func, writemask, 3, ptr); +} + +/* Save current x87 state and put it into single precision mode. 
+ */ +static void save_fpu_state( struct aos_compilation *cp ) +{ +   x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX,  +                                       Offset(struct aos_machine, fpu_restore))); +} + +static void restore_fpu_state( struct aos_compilation *cp ) +{ +   x87_fnclex(cp->func); +   x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,  +                                      Offset(struct aos_machine, fpu_restore))); +} + +static void set_fpu_round_neg_inf( struct aos_compilation *cp ) +{ +   if (cp->fpucntl != FPU_RND_NEG) { +      cp->fpucntl = FPU_RND_NEG; +      x87_fnclex(cp->func); +      x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,  +                                         Offset(struct aos_machine, fpu_rnd_neg_inf))); +   } +} + +static void set_fpu_round_nearest( struct aos_compilation *cp ) +{ +   if (cp->fpucntl != FPU_RND_NEAREST) { +      cp->fpucntl = FPU_RND_NEAREST; +      x87_fnclex(cp->func); +      x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,  +                                         Offset(struct aos_machine, fpu_rnd_nearest))); +   } +} + + +static void x87_emit_ex2( struct aos_compilation *cp ) +{ +   struct x86_reg st0 = x86_make_reg(file_x87, 0); +   struct x86_reg st1 = x86_make_reg(file_x87, 1); +   int stack = cp->func->x87_stack; + +//   set_fpu_round_neg_inf( cp ); + +   x87_fld(cp->func, st0);      /* a a */ +   x87_fprndint( cp->func );	/* int(a) a*/ +   x87_fsubr(cp->func, st1, st0);    /* int(a) frc(a) */ +   x87_fxch(cp->func, st1);     /* frc(a) int(a) */ +   x87_f2xm1(cp->func);         /* (2^frc(a))-1 int(a) */ +   x87_fld1(cp->func);          /* 1 (2^frc(a))-1 int(a) */ +   x87_faddp(cp->func, st1);	/* 2^frac(a) int(a)  */ +   x87_fscale(cp->func);	/* (2^frac(a)*2^int(int(a))) int(a) */ +                                /* 2^a int(a) */ +   x87_fstp(cp->func, st1);     /* 2^a */ + +   assert( stack == cp->func->x87_stack); +       +} + +static void PIPE_CDECL print_reg( const char *msg, +      
                            const float *reg ) +{ +   debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]); +} + +static void emit_print( struct aos_compilation *cp, +                        const char *message, /* must point to a static string! */ +                        unsigned file, +                        unsigned idx ) +{ +   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); +   struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx ); +   unsigned i; + +   /* There shouldn't be anything on the x87 stack.  Can add this +    * capacity later if need be. +    */ +   assert(cp->func->x87_stack == 0); + +   /* For absolute correctness, need to spill/invalidate all XMM regs +    * too.  We're obviously not concerned about performance on this +    * debug path, so here goes: +    */ +   for (i = 0; i < 8; i++) { +      if (cp->xmm[i].dirty)  +         spill(cp, i); + +      aos_release_xmm_reg(cp, i); +   } + +   /* Push caller-save (ie scratch) regs.   +    */ +   x86_cdecl_caller_push_regs( cp->func ); + + +   /* Push the arguments: +    */ +   x86_lea( cp->func, ecx, arg ); +   x86_push( cp->func, ecx ); +   x86_push_imm32( cp->func, (int)message ); + +   /* Call the helper.  Could call debug_printf directly, but +    * print_reg is a nice place to put a breakpoint if need be. +    */ +   x86_mov_reg_imm( cp->func, ecx, (int)print_reg ); +   x86_call( cp->func, ecx ); +   x86_pop( cp->func, ecx ); +   x86_pop( cp->func, ecx ); + +   /* Pop caller-save regs  +    */ +   x86_cdecl_caller_pop_regs( cp->func ); + +   /* Done...  +    */ +} + +/** + * The traditional instructions.  All operate on internal registers + * and ignore write masks and swizzling issues. 
+ */ + +static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); +   struct x86_reg tmp = aos_get_xmm_reg(cp); + +   sse_movaps(cp->func, tmp, arg0); +   sse_mulps(cp->func, tmp, neg); +   sse_maxps(cp->func, tmp, arg0); +    +   store_dest(cp, &op->FullDstRegisters[0], tmp); +   return TRUE; +} + +static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg dst = get_xmm_writable(cp, arg0); + +   sse_addps(cp->func, dst, arg1); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + +static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   x87_fld_src(cp, &op->FullSrcRegisters[0], 0); +   x87_fcos(cp->func); +   x87_fstp_dest4(cp, &op->FullDstRegisters[0]); +   return TRUE; +} + + +/* The dotproduct instructions don't really do that well in sse: + */ +static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg tmp = aos_get_xmm_reg(cp);  +   struct x86_reg dst = get_xmm_writable(cp, arg0); + +   sse_mulps(cp->func, dst, arg1); +   /* Now the hard bit: sum the first 3 values: +    */  +   sse_movhlps(cp->func, tmp, dst); +   sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? 
*/ +   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); +   sse_addss(cp->func, dst, tmp); +    +   aos_release_xmm_reg(cp, tmp.idx); +   store_scalar_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + + + +static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg tmp = aos_get_xmm_reg(cp);       +   struct x86_reg dst = get_xmm_writable(cp, arg0); + +   sse_mulps(cp->func, dst, arg1); +    +   /* Now the hard bit: sum the values: +    */  +   sse_movhlps(cp->func, tmp, dst); +   sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */ +   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); +   sse_addss(cp->func, dst, tmp); + +   aos_release_xmm_reg(cp, tmp.idx); +   store_scalar_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + +static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg tmp = aos_get_xmm_reg(cp); +   struct x86_reg dst = get_xmm_writable(cp, arg0); + +   sse_mulps(cp->func, dst, arg1); + +   /* Now the hard bit: sum the values (from DP3): +    */  +   sse_movhlps(cp->func, tmp, dst); +   sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? 
*/ +   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); +   sse_addss(cp->func, dst, tmp); +   emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W)); +   sse_addss(cp->func, dst, tmp); + +   aos_release_xmm_reg(cp, tmp.idx); +   store_scalar_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + +static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +    struct x86_reg dst = aos_get_xmm_reg(cp); +    struct x86_reg tmp = aos_get_xmm_reg(cp); +    struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + +/*    dst[0] = 1.0     * 1.0F; */ +/*    dst[1] = arg0[1] * arg1[1]; */ +/*    dst[2] = arg0[2] * 1.0; */ +/*    dst[3] = 1.0     * arg1[3]; */ + +    emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y)); +    emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W)); +    sse_mulps(cp->func, dst, tmp); + +    aos_release_xmm_reg(cp, tmp.idx); +    store_dest(cp, &op->FullDstRegisters[0], dst); +    return TRUE; +} + +static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   x87_fld1(cp->func);		/* 1 */ +   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);	/* a0 1 */ +   x87_fyl2x(cp->func);	/* log2(a0) */ +   x87_fstp_dest4(cp, &op->FullDstRegisters[0]); +   return TRUE; +} + + +static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   x87_fld_src(cp, &op->FullSrcRegisters[0], 0); +   x87_emit_ex2(cp); +   x87_fstp_dest4(cp, &op->FullDstRegisters[0]); +   return TRUE; +} + + +static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);  +   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; +   int i; + +   set_fpu_round_neg_inf( cp ); + +   /* Load all sources first to avoid aliasing +    */ +   for (i = 
3; i >= 0; i--) { +      if (writemask & (1<<i)) { +         x87_fld_src(cp, &op->FullSrcRegisters[0], i);    +      } +   } + +   for (i = 0; i < 4; i++) { +      if (writemask & (1<<i)) { +         x87_fprndint( cp->func );    +         x87_fstp(cp->func, x86_make_disp(dst, i*4)); +      } +   } + +   return TRUE; +} + + +static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);  +   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; +   int i; + +   set_fpu_round_nearest( cp ); + +   /* Load all sources first to avoid aliasing +    */ +   for (i = 3; i >= 0; i--) { +      if (writemask & (1<<i)) { +         x87_fld_src(cp, &op->FullSrcRegisters[0], i);    +      } +   } + +   for (i = 0; i < 4; i++) { +      if (writemask & (1<<i)) { +         x87_fprndint( cp->func );    +         x87_fstp(cp->func, x86_make_disp(dst, i*4)); +      } +   } + +   return TRUE; +} + + +static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);  +   struct x86_reg st0 = x86_make_reg(file_x87, 0); +   struct x86_reg st1 = x86_make_reg(file_x87, 1); +   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; +   int i; + +   set_fpu_round_neg_inf( cp ); + +   /* suck all the source values onto the stack before writing out any +    * dst, which may alias... 
+    */ +   for (i = 3; i >= 0; i--) { +      if (writemask & (1<<i)) { +         x87_fld_src(cp, &op->FullSrcRegisters[0], i);    +      } +   } + +   for (i = 0; i < 4; i++) { +      if (writemask & (1<<i)) { +         x87_fld(cp->func, st0);     /* a a */ +         x87_fprndint( cp->func );   /* flr(a) a */ +         x87_fsubp(cp->func, st1);  /* frc(a) */ +         x87_fstp(cp->func, x86_make_disp(dst, i*4)); +      } +   } + +   return TRUE; +} + +static PIPE_CDECL void do_lit( struct aos_machine *machine, +                               float *result, +                               const float *in, +                               unsigned count ) +{ +   if (in[0] > 0)  +   { +      if (in[1] <= 0.0)  +      { +         result[0] = 1.0F; +         result[1] = in[0]; +         result[2] = 1.0; +         result[3] = 1.0F; +      } +      else +      { +         const float epsilon = 1.0F / 256.0F;     +         float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon)); +         result[0] = 1.0F; +         result[1] = in[0]; +         result[2] = powf(in[1], exponent); +         result[3] = 1.0; +      } +   } +   else  +   { +      result[0] = 1.0F; +      result[1] = 0.0; +      result[2] = 0.0; +      result[3] = 1.0F; +   } +} + + +static PIPE_CDECL void do_lit_lut( struct aos_machine *machine, +                                   float *result, +                                   const float *in, +                                   unsigned count ) +{ +   if (in[0] > 0)  +   { +      if (in[1] <= 0.0)  +      { +         result[0] = 1.0F; +         result[1] = in[0]; +         result[2] = 1.0; +         result[3] = 1.0F; +         return; +      } +       +      if (machine->lit_info[count].shine_tab->exponent != in[3]) { +         machine->lit_info[count].func = do_lit; +         goto no_luck; +      } + +      if (in[1] <= 1.0) +      { +         const float *tab = machine->lit_info[count].shine_tab->values; +         float f = in[1] * 256; + 
        int k = (int)f; +         float frac = f - (float)k; +          +         result[0] = 1.0F; +         result[1] = in[0]; +         result[2] = tab[k] + frac*(tab[k+1]-tab[k]); +         result[3] = 1.0; +         return; +      } +       +   no_luck: +      { +         const float epsilon = 1.0F / 256.0F;     +         float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon)); +         result[0] = 1.0F; +         result[1] = in[0]; +         result[2] = powf(in[1], exponent); +         result[3] = 1.0; +      } +   } +   else  +   { +      result[0] = 1.0F; +      result[1] = 0.0; +      result[2] = 0.0; +      result[3] = 1.0F; +   } +} + + + +static void PIPE_CDECL populate_lut( struct aos_machine *machine, +                                     float *result, +                                     const float *in, +                                     unsigned count ) +{ +   unsigned i, tab; + +   /* Search for an existing table for this value.  Note that without +    * static analysis we don't really know if in[3] will be constant, +    * but it usually is... +    */ +   for (tab = 0; tab < 4; tab++) { +      if (machine->shine_tab[tab].exponent == in[3]) { +         goto found; +      } +   } + +   for (tab = 0, i = 1; i < 4; i++) { +      if (machine->shine_tab[i].last_used < machine->shine_tab[tab].last_used) +         tab = i; +   } + +   if (machine->shine_tab[tab].last_used == machine->now) { +      /* No unused tables (this is not a ffvertex program...).  
Just +       * call pow each time: +       */ +      machine->lit_info[count].func = do_lit; +      machine->lit_info[count].func( machine, result, in, count ); +      return; +   } +   else { +      do_populate_lut( &machine->shine_tab[tab], in[3] ); +   } + + found: +   machine->shine_tab[tab].last_used = machine->now; +   machine->lit_info[count].shine_tab = &machine->shine_tab[tab]; +   machine->lit_info[count].func = do_lit_lut; +   machine->lit_info[count].func( machine, result, in, count ); +} + + + + + +static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); +   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; +   unsigned lit_count = cp->lit_count++; +   struct x86_reg result, arg0; +   unsigned i; + +#if 1 +   /* For absolute correctness, need to spill/invalidate all XMM regs +    * too.   +    */ +   for (i = 0; i < 8; i++) { +      if (cp->xmm[i].dirty)  +         spill(cp, i); +      aos_release_xmm_reg(cp, i); +   } +#endif + +   if (writemask != TGSI_WRITEMASK_XYZW)  +      result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0])); +   else  +      result = get_dst_ptr(cp, &op->FullDstRegisters[0]);     + +    +   arg0 = fetch_src( cp, &op->FullSrcRegisters[0] ); +   if (arg0.file == file_XMM) { +      struct x86_reg tmp = x86_make_disp(cp->machine_EDX,  +                                         Offset(struct aos_machine, tmp[1])); +      sse_movaps( cp->func, tmp, arg0 ); +      arg0 = tmp; +   } +                   +       + +   /* Push caller-save (ie scratch) regs.   
+    */ +   x86_cdecl_caller_push_regs( cp->func ); + +   /* Push the arguments: +    */ +   x86_push_imm32( cp->func, lit_count ); + +   x86_lea( cp->func, ecx, arg0 ); +   x86_push( cp->func, ecx ); + +   x86_lea( cp->func, ecx, result ); +   x86_push( cp->func, ecx ); + +   x86_push( cp->func, cp->machine_EDX ); + +   if (lit_count < MAX_LIT_INFO) { +      x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX,  +                                             Offset(struct aos_machine, lit_info) +  +                                             lit_count * sizeof(struct lit_info) +  +                                             Offset(struct lit_info, func))); +   } +   else { +      x86_mov_reg_imm( cp->func, ecx, (int)do_lit ); +   } + +   x86_call( cp->func, ecx ); +             +   x86_pop( cp->func, ecx );    /* fixme... */ +   x86_pop( cp->func, ecx ); +   x86_pop( cp->func, ecx ); +   x86_pop( cp->func, ecx ); + +   x86_cdecl_caller_pop_regs( cp->func ); + +   if (writemask != TGSI_WRITEMASK_XYZW) { +      store_dest( cp,  +                  &op->FullDstRegisters[0], +                  get_xmm_writable( cp, result ) ); +   } + +   return TRUE; +} + +    +static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);  +   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + +   if (writemask & TGSI_WRITEMASK_YZ) { +      struct x86_reg st1 = x86_make_reg(file_x87, 1); +      struct x86_reg st2 = x86_make_reg(file_x87, 2); + +      /* a1' = a1 <= 0 ? 1 : a1;   +       */ +      x87_fldz(cp->func);                           /* 1 0  */ +#if 1 +      x87_fld1(cp->func);                           /* 1 0  */ +#else +      /* Correct but slow due to fp exceptions generated in fyl2x - fix me. 
+       */ +      x87_fldz(cp->func);                           /* 1 0  */ +#endif +      x87_fld_src(cp, &op->FullSrcRegisters[0], 1); /* a1 1 0  */ +      x87_fcomi(cp->func, st2);	                    /* a1 1 0  */ +      x87_fcmovb(cp->func, st1);                    /* a1' 1 0  */ +      x87_fstp(cp->func, st1);                      /* a1' 0  */ +      x87_fstp(cp->func, st1);                      /* a1'  */ + +      x87_fld_src(cp, &op->FullSrcRegisters[0], 3); /* a3 a1'  */ +      x87_fxch(cp->func, st1);                      /* a1' a3  */ +       + +      /* Compute pow(a1, a3) +       */ +      x87_fyl2x(cp->func);	/* a3*log2(a1)      */ +      x87_emit_ex2( cp );       /* 2^(a3*log2(a1))   */ + + +      /* a0' = max2(a0, 0): +       */ +      x87_fldz(cp->func);                           /* 0 r2 */ +      x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 0 r2 */ +      x87_fcomi(cp->func, st1);	 +      x87_fcmovb(cp->func, st1);                    /* a0' 0 r2 */ + +      x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */ + +      x87_fcomi(cp->func, st1);  /* a0' 0 r2 */ +      x87_fcmovnbe(cp->func, st2); /* r2' 0' r2 */ + +      x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */ +      x87_fpop(cp->func);       /* r2 */ +      x87_fpop(cp->func); +   } + +   if (writemask & TGSI_WRITEMASK_XW) { +      x87_fld1(cp->func); +      x87_fst_or_nop(cp->func, writemask, 0, dst); +      x87_fstp_or_pop(cp->func, writemask, 3, dst); +   } + +   return TRUE; +} + + + +static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg dst = get_xmm_writable(cp, arg0); + +   sse_maxps(cp->func, dst, arg1); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + + +static boolean emit_MIN( struct aos_compilation *cp, const struct 
tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg dst = get_xmm_writable(cp, arg0); + +   sse_minps(cp->func, dst, arg1); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + +/* MOV: fetch source 0 into a writable xmm register and hand it to store_dest(); no arithmetic is emitted. */ +static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg dst = get_xmm_writable(cp, arg0); + +   /* potentially nothing to do */ + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + +/* MUL: mulps of source 1 into a writable copy of source 0, then store_dest(). */ +static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg dst = get_xmm_writable(cp, arg0); + +   sse_mulps(cp->func, dst, arg1); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + + +/* MAD: source0 * source1 + source2, emitted as mulps followed by addps. */ +static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg arg2 = fetch_src(cp, &op->FullSrcRegisters[2]); + +   /* If we can't clobber old contents of arg0, get a temporary & copy +    * it there, then clobber it... +    */ +   arg0 = get_xmm_writable(cp, arg0); + +   sse_mulps(cp->func, arg0, arg1); +   sse_addps(cp->func, arg0, arg2); +   store_dest(cp, &op->FullDstRegisters[0], arg0); +   return TRUE; +} + +/* Really not sufficient -- need to check for conditions that could + * generate inf/nan values, which will slow things down hugely. 
+ */ +static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   x87_fld_src(cp, &op->FullSrcRegisters[1], 0);  /* a1.x */ +   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);	/* a0.x a1.x */ +   x87_fyl2x(cp->func);	                                /* a1*log2(a0) */ + +   x87_emit_ex2( cp );		/* 2^(a1*log2(a0)) */ + +   x87_fstp_dest4(cp, &op->FullDstRegisters[0]); +   return TRUE; +} + + +/* RCP: scalar reciprocal of source 0.  Emits sse2_rcpss when cp->have_sse2 is set; otherwise loads IMM_ONES and divides with divss.  The result goes through store_scalar_dest(). */ +static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg dst = aos_get_xmm_reg(cp); + +   if (cp->have_sse2) { +      sse2_rcpss(cp->func, dst, arg0); +      /* extend precision here... +       */ +   } +   else { +      struct x86_reg ones = aos_get_internal(cp, IMM_ONES); +      sse_movss(cp->func, dst, ones); +      sse_divss(cp->func, dst, arg0); +   } + +   store_scalar_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + + +/* Although rsqrtps() and rcpps() are low precision on some/all SSE + * implementations, it is possible to improve its precision at + * fairly low cost, using a newton/raphson step, as below: + *  + * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a) + * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)] + * or: + *   x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)] + *  + * + * See: http://softwarecommunity.intel.com/articles/eng/1818.htm + */ +static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + +   /* The if (0) arm (plain rsqrtss, no refinement) is compiled but never taken; the else arm adds the refinement step described above. */ +   if (0) { +      struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +      struct x86_reg r = aos_get_xmm_reg(cp); +      sse_rsqrtss(cp->func, r, arg0); +      store_scalar_dest(cp, &op->FullDstRegisters[0], r); +      return TRUE; +   } +   else { +      struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +      struct x86_reg r = aos_get_xmm_reg(cp); + +      struct x86_reg neg_half       = 
get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ ); +      struct x86_reg one_point_five = x86_make_disp( neg_half, 4 ); +      struct x86_reg src            = get_xmm_writable( cp, arg0 ); +       +      sse_rsqrtss( cp->func, r, src  );             /* rsqrtss(a) */ +      sse_mulss(   cp->func, src, neg_half  );      /* -.5 * a */ +      sse_mulss(   cp->func, src,  r );             /* -.5 * a * r */ +      sse_mulss(   cp->func, src,  r );             /* -.5 * a * r * r */ +      sse_addss(   cp->func, src, one_point_five ); /* 1.5 - .5 * a * r * r */ +      sse_mulss(   cp->func, r,  src );             /* r * (1.5 - .5 * a * r * r) */ + +      store_scalar_dest(cp, &op->FullDstRegisters[0], r); +      return TRUE; +   } +} + + +/* SGE: cmpps with cc_NotLessThan on a writable copy of source 0 against source 1; the compare result is then ANDed with the IMM_ONES internal. */ +static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg ones = aos_get_internal(cp, IMM_ONES); +   struct x86_reg dst = get_xmm_writable(cp, arg0); + +   sse_cmpps(cp->func, dst, arg1, cc_NotLessThan); +   sse_andps(cp->func, dst, ones); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + +/* SIN: done on the x87 stack -- load from source 0, fsin, then store through x87_fstp_dest4(). */ +static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   x87_fld_src(cp, &op->FullSrcRegisters[0], 0); +   x87_fsin(cp->func); +   x87_fstp_dest4(cp, &op->FullDstRegisters[0]); +   return TRUE; +} + + + +/* SLT: same sequence as emit_SGE but with cc_LessThan. */ +static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg ones = aos_get_internal(cp, IMM_ONES); +   struct x86_reg dst = get_xmm_writable(cp, arg0); +    +   sse_cmpps(cp->func, dst, arg1, cc_LessThan); +   sse_andps(cp->func, dst, ones); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} 
+ +/* SUB: subps of source 1 from a writable copy of source 0. */ +static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg dst = get_xmm_writable(cp, arg0); + +   sse_subps(cp->func, dst, arg1); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + + +/* XPD: cross product of source 0 and source 1 using two temporaries; tmp0 is released, tmp1 carries the result into store_dest(). */ +static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg tmp0 = aos_get_xmm_reg(cp); +   struct x86_reg tmp1 = aos_get_xmm_reg(cp); + +   emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W)); +   sse_mulps(cp->func, tmp1, arg0); +   emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W)); +   sse_mulps(cp->func, tmp0, arg1); +   sse_subps(cp->func, tmp1, tmp0); +   sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W)); + +/*    dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */ +/*    dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */ +/*    dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */ +/*    dst[3] is undef */ + + +   aos_release_xmm_reg(cp, tmp0.idx); +   store_dest(cp, &op->FullDstRegisters[0], tmp1); +   return TRUE; +} + + + +/* Dispatch one TGSI instruction to its emit_* helper.  Returns FALSE for opcodes with no emitter wired up here (EXP, LOG, LERP, CLAMP and the default case) and TRUE for END. */ +static boolean +emit_instruction( struct aos_compilation *cp, +                  struct tgsi_full_instruction *inst ) +{ +   x87_assert_stack_empty(cp->func); + +   switch( inst->Instruction.Opcode ) { +   case TGSI_OPCODE_MOV: +      return emit_MOV( cp, inst ); + +   case TGSI_OPCODE_LIT: +      return emit_LIT(cp, inst); + +   case TGSI_OPCODE_RCP: +      return emit_RCP(cp, inst); + +   case TGSI_OPCODE_RSQ: +      return emit_RSQ(cp, inst); + +   case TGSI_OPCODE_EXP: +      /*return emit_EXP(cp, inst);*/ +      return FALSE; + +   case TGSI_OPCODE_LOG: +      /*return emit_LOG(cp, inst);*/ +      return FALSE; + +   case TGSI_OPCODE_MUL: +      return 
emit_MUL(cp, inst); + +   case TGSI_OPCODE_ADD: +      return emit_ADD(cp, inst); + +   case TGSI_OPCODE_DP3: +      return emit_DP3(cp, inst); + +   case TGSI_OPCODE_DP4: +      return emit_DP4(cp, inst); + +   case TGSI_OPCODE_DST: +      return emit_DST(cp, inst); + +   case TGSI_OPCODE_MIN: +      return emit_MIN(cp, inst); + +   case TGSI_OPCODE_MAX: +      return emit_MAX(cp, inst); + +   case TGSI_OPCODE_SLT: +      return emit_SLT(cp, inst); + +   case TGSI_OPCODE_SGE: +      return emit_SGE(cp, inst); + +   case TGSI_OPCODE_MAD: +      return emit_MAD(cp, inst); + +   case TGSI_OPCODE_SUB: +      return emit_SUB(cp, inst); +  +   case TGSI_OPCODE_LERP: +//      return emit_LERP(cp, inst); +      return FALSE; + +   case TGSI_OPCODE_FRAC: +      return emit_FRC(cp, inst); + +   case TGSI_OPCODE_CLAMP: +//      return emit_CLAMP(cp, inst); +      return FALSE; + +   case TGSI_OPCODE_FLOOR: +      return emit_FLR(cp, inst); + +   case TGSI_OPCODE_ROUND: +      return emit_RND(cp, inst); + +   case TGSI_OPCODE_EXPBASE2: +      return emit_EX2(cp, inst); + +   case TGSI_OPCODE_LOGBASE2: +      return emit_LG2(cp, inst); + +   case TGSI_OPCODE_POWER: +      return emit_POW(cp, inst); + +   case TGSI_OPCODE_CROSSPRODUCT: +      return emit_XPD(cp, inst); + +   case TGSI_OPCODE_ABS: +      return emit_ABS(cp, inst); + +   case TGSI_OPCODE_DPH: +      return emit_DPH(cp, inst); + +   case TGSI_OPCODE_COS: +      return emit_COS(cp, inst); + +   case TGSI_OPCODE_SIN: +      return emit_SIN(cp, inst); + +   case TGSI_OPCODE_END: +      return TRUE; + +   default: +      return FALSE; +   } +} + + +/* Viewport transform on output 0: pos = pos * scale + translate, with scale/translate addressed off the aos_machine pointer in EDX. */ +static boolean emit_viewport( struct aos_compilation *cp ) +{ +   struct x86_reg pos = aos_get_shader_reg_xmm(cp,  +                                               TGSI_FILE_OUTPUT,  +                                               0); + +   struct x86_reg scale = x86_make_disp(cp->machine_EDX,  +                                        Offset(struct aos_machine, scale)); + +  
 struct x86_reg translate = x86_make_disp(cp->machine_EDX,  +                                        Offset(struct aos_machine, translate)); + +   sse_mulps(cp->func, pos, scale); +   sse_addps(cp->func, pos, translate); + +   aos_adopt_xmm_reg( cp, +                      pos, +                      TGSI_FILE_OUTPUT, +                      0, +                      TRUE ); +   return TRUE; +} + + +/* This is useful to be able to see the results on softpipe.  Doesn't + * do proper clipping, just assumes the backend can do it during + * rasterization -- for debug only... + */ +static boolean emit_rhw_viewport( struct aos_compilation *cp ) +{ +   struct x86_reg tmp = aos_get_xmm_reg(cp); +   struct x86_reg pos = aos_get_shader_reg_xmm(cp,  +                                               TGSI_FILE_OUTPUT,  +                                               0); + +   struct x86_reg scale = x86_make_disp(cp->machine_EDX,  +                                        Offset(struct aos_machine, scale)); + +   struct x86_reg translate = x86_make_disp(cp->machine_EDX,  +                                        Offset(struct aos_machine, translate)); + + + +   emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W)); +   sse2_rcpss(cp->func, tmp, tmp); +   sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X)); +    +   sse_mulps(cp->func, pos, scale); +   sse_mulps(cp->func, pos, tmp); +   sse_addps(cp->func, pos, translate); + +   /* Set pos[3] = w  +    */ +   mask_write(cp, pos, tmp, TGSI_WRITEMASK_W); + +   aos_adopt_xmm_reg( cp, +                      pos, +                      TGSI_FILE_OUTPUT, +                      0, +                      TRUE ); +   return TRUE; +} + + +/* Copy the values of one parsed immediate into the next row of machine->immediate and bump cp->num_immediates. + * NOTE(review): neither pos nor imm->Immediate.Size is range-checked here -- confirm callers cannot exceed the array. + */ +static boolean note_immediate( struct aos_compilation *cp, +                               struct tgsi_full_immediate *imm ) +{ +   unsigned pos = cp->num_immediates++; +   unsigned j; + +   for (j = 0; j < imm->Immediate.Size; j++) { +      cp->vaos->machine->immediate[pos][j] = imm->u.ImmediateFloat32[j].Float; +   } + +  
 return TRUE; +} + + + + +/* Walk the shader tokens and, for every destination in TGSI_FILE_OUTPUT, record in cp->output_last_write[] the ordinal (counted from 0 over instruction tokens) of the last instruction writing that output index. */ +static void find_last_write_outputs( struct aos_compilation *cp ) +{ +   struct tgsi_parse_context parse; +   unsigned this_instruction = 0; +   unsigned i; + +   tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens ); + +   while (!tgsi_parse_end_of_tokens( &parse )) { +       +      tgsi_parse_token( &parse ); + +      if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)  +         continue; + +      for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) { +         if (parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.File == +             TGSI_FILE_OUTPUT)  +         { +            unsigned idx = parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.Index; +            cp->output_last_write[idx] = this_instruction; +         } +      } + +      this_instruction++; +   } + +   tgsi_parse_free( &parse ); +} + + +/* Argument slots passed to x86_fn_arg() when the generated function loads its parameters. */ +#define ARG_VARIENT    1 +#define ARG_START_ELTS 2 +#define ARG_COUNT      3 +#define ARG_OUTBUF     4 + + +/* Generate the x86 code for one entry point of the varient: func[0] when linear is TRUE, func[1] otherwise.  Returns !cp.error on the normal path and FALSE via the fail label. */ +static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, +                                     boolean linear ) +{  +   struct tgsi_parse_context parse; +   struct aos_compilation cp; +   unsigned fixup, label; + +   tgsi_parse_init( &parse, varient->base.vs->state.tokens ); + +   memset(&cp, 0, sizeof(cp)); + +   cp.insn_counter = 1; +   cp.vaos = varient; +   cp.have_sse2 = 1; +   cp.func = &varient->func[ linear ? 
0 : 1 ]; + +   cp.tmp_EAX       = x86_make_reg(file_REG32, reg_AX); +   cp.idx_EBX      = x86_make_reg(file_REG32, reg_BX); +   cp.outbuf_ECX    = x86_make_reg(file_REG32, reg_CX); +   cp.machine_EDX   = x86_make_reg(file_REG32, reg_DX); +   cp.count_ESI     = x86_make_reg(file_REG32, reg_SI); + +   x86_init_func(cp.func); + +   find_last_write_outputs(&cp); + +   x86_push(cp.func, cp.idx_EBX); +   x86_push(cp.func, cp.count_ESI); + + +   /* Load arguments into regs: +    */ +   x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_VARIENT)); +   x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS)); +   x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT)); +   x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF)); + + +   /* Compare count to zero and possibly bail. +    */ +   x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX); +   x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX); +   fixup = x86_jcc_forward(cp.func, cc_E); + +   /* Dig out the machine pointer from inside the varient arg  +    */ +   x86_mov(cp.func, cp.machine_EDX,  +           x86_make_disp(cp.machine_EDX, +                         Offset( struct draw_vs_varient_aos_sse, machine ))); + +   save_fpu_state( &cp ); +   set_fpu_round_nearest( &cp ); + +   /* Note address for loop jump  +    */ +   label = x86_get_label(cp.func); +   { +      /* Fetch inputs...  TODO:  fetch lazily... 
+       */ +      if (!aos_fetch_inputs( &cp, linear )) +         goto fail; + +      /* Emit the shader: +       */ +      while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error )  +      { +         tgsi_parse_token( &parse ); + +         switch (parse.FullToken.Token.Type) { +         case TGSI_TOKEN_TYPE_IMMEDIATE: +            if (!note_immediate( &cp, &parse.FullToken.FullImmediate )) +               goto fail; +            break; + +         case TGSI_TOKEN_TYPE_INSTRUCTION: +            if (DISASSEM) +               tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter ); + +            if (!emit_instruction( &cp, &parse.FullToken.FullInstruction )) +               goto fail; +            break; +         } + +         x87_assert_stack_empty(cp.func); +         cp.insn_counter++; + +         if (DISASSEM) +            debug_printf("\n"); +      } + +    +      { +         unsigned i; +         for (i = 0; i < 8; i++) { +            if (cp.xmm[i].file != TGSI_FILE_OUTPUT) { +               cp.xmm[i].file = TGSI_FILE_NULL; +               cp.xmm[i].dirty = 0; +            } +         } +      } + +      if (cp.error) +         goto fail; + +      if (cp.vaos->base.key.clip) { +         /* not really handling clipping, just do the rhw so we can +          * see the results... +          */ +         emit_rhw_viewport(&cp);  +      } +      else if (cp.vaos->base.key.viewport) { +         emit_viewport(&cp); +      } + +      /* Emit output...  TODO: do this eagerly after the last write to a +       * given output. 
+       */ +      if (!aos_emit_outputs( &cp )) +         goto fail; + + +      /* Next vertex: +       */ +      x86_lea(cp.func,  +              cp.outbuf_ECX,  +              x86_make_disp(cp.outbuf_ECX,  +                            cp.vaos->base.key.output_stride)); + +      /* Incr index +       */    +      if (linear) { +         x86_inc(cp.func, cp.idx_EBX); +      }  +      else { +         x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4)); +      } + +   } +   /* decr count, loop if not zero +    */ +   x86_dec(cp.func, cp.count_ESI); +   x86_jcc(cp.func, cc_NZ, label); + +   restore_fpu_state(&cp); + +   /* Land forward jump here: +    */ +   x86_fixup_fwd_jump(cp.func, fixup); + +   /* Exit mmx state? +    */ +   if (cp.func->need_emms) +      mmx_emms(cp.func); + +   x86_pop(cp.func, cp.count_ESI); +   x86_pop(cp.func, cp.idx_EBX); + +   x87_assert_stack_empty(cp.func); +   x86_ret(cp.func); + +   tgsi_parse_free( &parse ); +   return !cp.error; + + fail: +   tgsi_parse_free( &parse ); +   return FALSE; +} + + + +/* For every input element sourced from vertex buffer buf, store the element's base pointer (ptr + in.offset) and the stride in the machine's attrib table. */ +static void vaos_set_buffer( struct draw_vs_varient *varient, +                             unsigned buf, +                             const void *ptr, +                             unsigned stride ) +{ +   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; +   unsigned i; + +   for (i = 0; i < vaos->base.key.nr_inputs; i++) { +      if (vaos->base.key.element[i].in.buffer == buf) { +         vaos->machine->attrib[i].input_ptr = ((char *)ptr + +                                               vaos->base.key.element[i].in.offset); +         vaos->machine->attrib[i].input_stride = stride; +      } +   } +} + + +/* Free the machine (if allocated), release both generated functions and free the varient itself. */ +static void vaos_destroy( struct draw_vs_varient *varient ) +{ +   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + +   if (vaos->machine) +      align_free( vaos->machine ); + +   x86_release_func( &vaos->func[0] ); +   x86_release_func( &vaos->func[1] ); + +   
FREE(vaos); +} + +/* Refresh the IMM_PSIZE internal from the current rasterizer point size, then call the generated elts entry point. */ +static void vaos_run_elts( struct draw_vs_varient *varient, +                           const unsigned *elts, +                           unsigned count, +                           void *output_buffer ) +{ +   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + +   vaos->machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; +   vaos->gen_run_elts( varient, +                       elts, +                       count, +                       output_buffer ); +} + +/* Same as vaos_run_elts, but for the generated linear entry point. */ +static void vaos_run_linear( struct draw_vs_varient *varient, +                             unsigned start, +                             unsigned count, +                             void *output_buffer ) +{ +   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + +   vaos->machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; +   vaos->gen_run_linear( varient, +                         start, +                         count, +                         output_buffer ); +} + + +/* Copy file_max[TGSI_FILE_CONSTANT] + 1 vec4 constants into the machine, then point every lit_info entry back at populate_lut (machine->now is bumped once per entry). + * NOTE(review): the copy length comes from the shader info and is not clamped in this function. + */ +static void vaos_set_constants( struct draw_vs_varient *varient, +                                const float (*constants)[4] ) +{ +   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + +   memcpy(vaos->machine->constant, +          constants, +          (vaos->base.vs->info.file_max[TGSI_FILE_CONSTANT] + 1) * 4 * sizeof(float)); + +#if 0 +   unsigned i; +   for (i =0; i < vaos->base.vs->info.file_max[TGSI_FILE_CONSTANT] + 1; i++) +      debug_printf("state %d: %f %f %f %f\n", +                   i,  +                   constants[i][0], +                   constants[i][1], +                   constants[i][2], +                   constants[i][3]); +#endif + +   { +      unsigned i; +      for (i = 0; i < MAX_LIT_INFO; i++) { +         vaos->machine->lit_info[i].func = populate_lut; +         vaos->machine->now++; +      } +   } +} + + +/* Copy the viewport scale and translate vectors (4 floats each) into the machine. */ +static void vaos_set_viewport( struct draw_vs_varient 
*varient,
+                               const struct pipe_viewport_state *viewport )
+{
+   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
+
+   memcpy(vaos->machine->scale, viewport->scale, 4 * sizeof(float));
+   memcpy(vaos->machine->translate, viewport->translate, 4 * sizeof(float));
+}
+
+
+
+/* Create an SSE varient for shader vs and key.
+ *
+ * Allocates the varient and its 16-byte-aligned aos_machine, fills in
+ * the base vtable, then generates both entry points: func[0] is built
+ * with linear == TRUE and becomes gen_run_linear, func[1] is built with
+ * linear == FALSE and becomes gen_run_elts.
+ *
+ * Returns the base varient on success.  On any failure everything
+ * acquired so far is released and NULL is returned.
+ */
+static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
+                                                 const struct draw_vs_varient_key *key )
+{
+   struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);
+
+   if (!vaos)
+      goto fail;
+   
+   vaos->base.key = *key;
+   vaos->base.vs = vs;
+   vaos->base.set_input = vaos_set_buffer;
+   vaos->base.set_constants = vaos_set_constants;
+   vaos->base.set_viewport = vaos_set_viewport;
+   vaos->base.destroy = vaos_destroy;
+   vaos->base.run_linear = vaos_run_linear;
+   vaos->base.run_elts = vaos_run_elts;
+
+   vaos->draw = vs->draw;
+   vaos->machine = align_malloc( sizeof(struct aos_machine), 16 );
+   if (!vaos->machine)
+      goto fail;
+   
+   memset(vaos->machine, 0, sizeof(struct aos_machine));
+   init_internals(vaos->machine);
+
+   tgsi_dump(vs->state.tokens, 0);
+
+   if (!build_vertex_program( vaos, TRUE ))
+      goto fail;
+
+   if (!build_vertex_program( vaos, FALSE ))
+      goto fail;
+
+   vaos->gen_run_linear = (vsv_run_linear_func)x86_get_func(&vaos->func[0]);
+   if (!vaos->gen_run_linear)
+      goto fail;
+
+   vaos->gen_run_elts = (vsv_run_elts_func)x86_get_func(&vaos->func[1]);
+   if (!vaos->gen_run_elts)
+      goto fail;
+
+   return &vaos->base;
+
+ fail:
+   /* vaos is NULL when CALLOC_STRUCT itself failed, so nothing may be
+    * dereferenced before this check.  The original code tested
+    * vaos->machine first and crashed on that path.
+    */
+   if (vaos) {
+      if (vaos->machine)
+         align_free( vaos->machine );
+
+      x86_release_func( &vaos->func[0] );
+      x86_release_func( &vaos->func[1] );
+
+      FREE(vaos);
+   }
+   
+   return NULL;
+}
+
+
+struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs,
+                                                 
const struct draw_vs_varient_key *key ) +{ +   struct draw_vs_varient *varient = varient_aos_sse( vs, key ); + +   /* SSE code generation failed: assert(0) flags it where asserts are active, then fall back to the generic varient. */ +   if (varient == NULL) { +      assert(0); +      varient = draw_vs_varient_generic( vs, key ); +   } + +   return varient; +} + + + +#endif diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h new file mode 100644 index 0000000000..b47413ff43 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_aos.h @@ -0,0 +1,222 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef DRAW_VS_AOS_H +#define DRAW_VS_AOS_H + + +struct tgsi_token; +struct x86_function; + +#include "pipe/p_state.h" +#include "rtasm/rtasm_x86sse.h" + + + + + +/* Component indices, as used with SHUF(). */ +#define X    0 +#define Y    1 +#define Z    2 +#define W    3 + +#define MAX_INPUTS     PIPE_MAX_ATTRIBS +#define MAX_OUTPUTS    PIPE_MAX_ATTRIBS +#define MAX_TEMPS      PIPE_MAX_ATTRIBS /* say */ +#define MAX_CONSTANTS  PIPE_MAX_ATTRIBS /* say */ +#define MAX_IMMEDIATES PIPE_MAX_ATTRIBS /* say */ +#define MAX_INTERNALS  8 + +#define AOS_FILE_INTERNAL TGSI_FILE_COUNT + +#define FPU_RND_NEG    1 +#define FPU_RND_NEAREST 2 + +/* Signature of the callbacks stored in struct lit_info. */ +struct aos_machine; +typedef void PIPE_CDECL (*lit_func)( struct aos_machine *, +                                    float *result, +                                    const float *in, +                                    unsigned count ); +struct shine_tab { +   float exponent; +   float values[258]; +   unsigned last_used; +}; + +struct lit_info { +   lit_func func; +   struct shine_tab *shine_tab; +}; + +#define MAX_SHINE_TAB    4 +#define MAX_LIT_INFO     16 + +/* This is the temporary storage used by all the aos_sse vs varients. + * Create one per context and reuse by passing a pointer in at + * vs_varient creation?? 
+ */ +struct aos_machine { +   float input    [MAX_INPUTS    ][4]; +   float output   [MAX_OUTPUTS   ][4]; +   float temp     [MAX_TEMPS     ][4]; +   float constant [MAX_CONSTANTS ][4]; /* fixme -- should just be a pointer */ +   float immediate[MAX_IMMEDIATES][4]; /* fixme -- should just be a pointer */ +   float internal [MAX_INTERNALS ][4]; + +   float scale[4];              /* viewport */ +   float translate[4];          /* viewport */ + +   float tmp[2][4];             /* scratch space for LIT */ + +   struct shine_tab shine_tab[MAX_SHINE_TAB]; +   struct lit_info  lit_info[MAX_LIT_INFO]; +   unsigned now; +    + +   ushort fpu_rnd_nearest; +   ushort fpu_rnd_neg_inf; +   ushort fpu_restore; +   ushort fpucntl;              /* one of FPU_* above */ + +   struct { +      const void *input_ptr; +      unsigned input_stride; + +      unsigned output_offset; +   } attrib[PIPE_MAX_ATTRIBS]; +}; + + + + +/* State carried while generating code for one entry point of a varient. */ +struct aos_compilation { +   struct x86_function *func; +   struct draw_vs_varient_aos_sse *vaos; + +   unsigned insn_counter; +   unsigned num_immediates; +   unsigned count; +   unsigned lit_count; + +   struct { +      unsigned idx:16; +      unsigned file:8; +      unsigned dirty:8; +      unsigned last_used; +   } xmm[8]; + + +   boolean input_fetched[PIPE_MAX_ATTRIBS]; +   unsigned output_last_write[PIPE_MAX_ATTRIBS]; + +   boolean have_sse2; +   boolean error; +   short fpucntl; + +   /* these are actually known values, but putting them in a struct +    * like this is helpful to keep them in sync across the file. 
+    */ +   struct x86_reg tmp_EAX; +   struct x86_reg idx_EBX;     /* either start+i or &elt[i] */ +   struct x86_reg outbuf_ECX; +   struct x86_reg machine_EDX; +   struct x86_reg count_ESI;    /* decrements to zero */ +}; + +struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ); +void aos_release_xmm_reg( struct aos_compilation *cp, unsigned idx ); + +void aos_adopt_xmm_reg( struct aos_compilation *cp, +                        struct x86_reg reg, +                        unsigned file, +                        unsigned idx, +                        unsigned dirty ); + +struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,  +                                   unsigned file, +                                   unsigned idx ); + +boolean aos_fetch_inputs( struct aos_compilation *cp, +                          boolean linear ); + +boolean aos_emit_outputs( struct aos_compilation *cp ); + + +#define IMM_ONES     0              /* 1, 1,1,1 */ +#define IMM_SWZ      1              /* 1,-1,0, 0xffffffff */ +#define IMM_IDENTITY 2              /* 0, 0,0,1 */ +#define IMM_INV_255  3              /* 1/255, 1/255, 1/255, 1/255 */ +#define IMM_255      4              /* 255, 255, 255, 255 */ +#define IMM_NEGS     5              /* -1,-1,-1,-1 */ +#define IMM_RSQ      6              /* -.5,1.5,_,_ */ +#define IMM_PSIZE    7              /* not really an immediate - updated each run */ + +struct x86_reg aos_get_internal( struct aos_compilation *cp, +                                 unsigned imm ); +struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp, +                                     unsigned imm ); + + +#define ERROR(cp, msg)                                                  \ +do {                                                                    \ +   debug_printf("%s: x86 translation failed: %s\n", __FUNCTION__, msg); \ +   cp->error = 1;                                                       \ +   assert(0);                                    
                       \ +} while (0) + + + + + + +/* SSE varient: the base varient plus its draw context, its aos_machine, the two generated entry points and the x86_function storage they are built in. */ +struct draw_vs_varient_aos_sse { +   struct draw_vs_varient base; +   struct draw_context *draw; + +#if 0 +   struct { +      const void *ptr; +      unsigned stride; +   } attrib[PIPE_MAX_ATTRIBS]; +#endif + +   struct aos_machine *machine; /* XXX: temporarily unshared */ + +   vsv_run_linear_func gen_run_linear; +   vsv_run_elts_func gen_run_elts; + + +   struct x86_function func[2]; +}; + + + +#endif  + diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c new file mode 100644 index 0000000000..836110f382 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c @@ -0,0 +1,326 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#include "pipe/p_util.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/util/tgsi_parse.h" +#include "tgsi/util/tgsi_util.h" +#include "tgsi/exec/tgsi_exec.h" +#include "draw_vs.h" +#include "draw_vs_aos.h" +#include "draw_vertex.h" + +#include "rtasm/rtasm_x86sse.h" + +#ifdef PIPE_ARCH_X86 + +/* Note - don't yet have to worry about interacting with the code in + * draw_vs_aos.c as there is no intermingling of generated code... + * That may have to change, we'll see. + */ +static void emit_load_R32G32B32A32( struct aos_compilation *cp, 			    +				    struct x86_reg data, +				    struct x86_reg src_ptr ) +{ +   sse_movups(cp->func, data, src_ptr); +} + +static void emit_load_R32G32B32( struct aos_compilation *cp, 			    +				 struct x86_reg data, +				 struct x86_reg src_ptr ) +{ +   sse_movss(cp->func, data, x86_make_disp(src_ptr, 8)); +   sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) ); +   sse_shufps(cp->func, data, data, SHUF(Y,Z,X,W) ); +   sse_movlps(cp->func, data, src_ptr); +} + +static void emit_load_R32G32( struct aos_compilation *cp,  +			   struct x86_reg data, +			   struct x86_reg src_ptr ) +{ +   sse_movups(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) ); +   sse_movlps(cp->func, data, src_ptr); +} + + +static void emit_load_R32( struct aos_compilation *cp,  +			   struct x86_reg data, +			   struct x86_reg src_ptr ) +{ +   sse_movss(cp->func, data, src_ptr); +   sse_orps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) ); +} + + +/* Emits, in order: movss from src_ptr, two punpcklbw steps against IMM_IDENTITY, cvtdq2ps, and a mulps by IMM_INV_255. */ +static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp, +				       struct x86_reg data, 
+				       struct x86_reg src_ptr ) +{ +   sse_movss(cp->func, data, src_ptr); +   sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY )); +   sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY )); +   sse2_cvtdq2ps(cp->func, data, data); +   sse_mulps(cp->func, data, aos_get_internal(cp, IMM_INV_255)); +} + + + +/* Emit: src = attrib[a].input_stride * elt + attrib[a].input_ptr, both read from the aos_machine addressed by machine. */ +static void get_src_ptr( struct x86_function *func, +                         struct x86_reg src, +                         struct x86_reg machine, +                         struct x86_reg elt, +                         unsigned a ) +{ +   struct x86_reg input_ptr =  +      x86_make_disp(machine,  +		    Offset(struct aos_machine, attrib[a].input_ptr)); + +   struct x86_reg input_stride =  +      x86_make_disp(machine,  +		    Offset(struct aos_machine, attrib[a].input_stride)); + +   /* Calculate pointer to current attrib: +    */ +   x86_mov(func, src, input_stride); +   x86_imul(func, src, elt); +   x86_add(func, src, input_ptr); +} + + +/* Extended swizzles?  Maybe later. + */   +static void emit_swizzle( struct aos_compilation *cp, +			  struct x86_reg dest, +			  struct x86_reg src, +			  unsigned shuffle ) +{ +   sse_shufps(cp->func, dest, src, shuffle); +} + + +/* Emit the fetch of input element idx into a fresh xmm register (adopted as TGSI_FILE_INPUT/idx), choosing the loader from the element's in.format.  Unhandled formats raise ERROR() and return FALSE. */ +static boolean load_input( struct aos_compilation *cp, +                           unsigned idx, +                           boolean linear ) +{ +   unsigned format = cp->vaos->base.key.element[idx].in.format; +   struct x86_reg src = cp->tmp_EAX; +   struct x86_reg dataXMM = aos_get_xmm_reg(cp); + +   /* Figure out source pointer address: +    */ +   get_src_ptr(cp->func,  +               src,  +               cp->machine_EDX,  +               linear ? 
cp->idx_EBX : x86_deref(cp->idx_EBX), +               idx); + +   src = x86_deref(src); + +   aos_adopt_xmm_reg( cp, +                      dataXMM, +                      TGSI_FILE_INPUT, +                      idx, +                      TRUE ); + +   switch (format) { +   case PIPE_FORMAT_R32_FLOAT: +      emit_load_R32(cp, dataXMM, src); +      break; +   case PIPE_FORMAT_R32G32_FLOAT: +      emit_load_R32G32(cp, dataXMM, src); +      break; +   case PIPE_FORMAT_R32G32B32_FLOAT: +      emit_load_R32G32B32(cp, dataXMM, src); +      break; +   case PIPE_FORMAT_R32G32B32A32_FLOAT: +      emit_load_R32G32B32A32(cp, dataXMM, src); +      break; +   case PIPE_FORMAT_B8G8R8A8_UNORM: +      emit_load_R8G8B8A8_UNORM(cp, dataXMM, src); +      emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W)); +      break; +   case PIPE_FORMAT_R8G8B8A8_UNORM: +      emit_load_R8G8B8A8_UNORM(cp, dataXMM, src); +      break; +   default: +      ERROR(cp, "unhandled input format"); +      return FALSE; +   } + +   return TRUE; +} + + +/* Emit loads for all key.nr_inputs inputs, bumping cp->insn_counter after each; stops and returns FALSE on the first failure. */ +boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear ) +{ +   unsigned i; +    +   for (i = 0; i < cp->vaos->base.key.nr_inputs; i++) { +      if (!load_input( cp, i, linear )) +         return FALSE; +      cp->insn_counter++; +      debug_printf("\n"); +   } + +   return TRUE; +} + + + + + + + +static void emit_store_R32G32B32A32( struct aos_compilation *cp, 			    +				     struct x86_reg dst_ptr, +				     struct x86_reg dataXMM ) +{ +   sse_movups(cp->func, dst_ptr, dataXMM); +} + +static void emit_store_R32G32B32( struct aos_compilation *cp,  +				  struct x86_reg dst_ptr, +				  struct x86_reg dataXMM ) +{ +   sse_movlps(cp->func, dst_ptr, dataXMM); +   sse_shufps(cp->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! 
destructive */ +   sse_movss(cp->func, x86_make_disp(dst_ptr,8), dataXMM); +} + +static void emit_store_R32G32( struct aos_compilation *cp,  +			       struct x86_reg dst_ptr, +			       struct x86_reg dataXMM ) +{ +   sse_movlps(cp->func, dst_ptr, dataXMM); +} + +static void emit_store_R32( struct aos_compilation *cp,  +			    struct x86_reg dst_ptr, +			    struct x86_reg dataXMM ) +{ +   sse_movss(cp->func, dst_ptr, dataXMM); +} + + + +static void emit_store_R8G8B8A8_UNORM( struct aos_compilation *cp, +				       struct x86_reg dst_ptr, +				       struct x86_reg dataXMM ) +{ +   sse_mulps(cp->func, dataXMM, aos_get_internal(cp, IMM_255)); +   sse2_cvtps2dq(cp->func, dataXMM, dataXMM); +   sse2_packssdw(cp->func, dataXMM, dataXMM); +   sse2_packuswb(cp->func, dataXMM, dataXMM); +   sse_movss(cp->func, dst_ptr, dataXMM); +} + + + + + +/* Emit the store of dataXMM to ptr for one EMIT_* format.  For EMIT_4UB the if (1) arm is always taken, so the Z,Y,X,W swizzle is always applied before packing.  Unknown formats raise ERROR() and return FALSE. */ +static boolean emit_output( struct aos_compilation *cp, +                            struct x86_reg ptr, +                            struct x86_reg dataXMM,  +                            unsigned format ) +{ +   switch (format) { +   case EMIT_1F: +   case EMIT_1F_PSIZE: +      emit_store_R32(cp, ptr, dataXMM); +      break; +   case EMIT_2F: +      emit_store_R32G32(cp, ptr, dataXMM); +      break; +   case EMIT_3F: +      emit_store_R32G32B32(cp, ptr, dataXMM); +      break; +   case EMIT_4F: +      emit_store_R32G32B32A32(cp, ptr, dataXMM); +      break; +   case EMIT_4UB: +      if (1) { +         emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W)); +         emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM); +      } +      else { +         emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM); +      } +      break; +   default: +      ERROR(cp, "unhandled output format"); +      return FALSE; +   } + +   return TRUE; +} + + + +boolean aos_emit_outputs( struct aos_compilation *cp ) +{ +   unsigned i; +    +   for (i = 0; i < cp->vaos->base.key.nr_outputs; i++) { +      unsigned format = cp->vaos->base.key.element[i].out.format; +      unsigned 
offset = cp->vaos->base.key.element[i].out.offset; +      unsigned vs_output = cp->vaos->base.key.element[i].out.vs_output; + +      struct x86_reg data; + +      if (format == EMIT_1F_PSIZE) { +         data = aos_get_internal_xmm( cp, IMM_PSIZE ); +      } +      else { +         data = aos_get_shader_reg( cp,  +                                    TGSI_FILE_OUTPUT, +                                    vs_output ); +      } + +      if (data.file != file_XMM) { +         struct x86_reg tmp = aos_get_xmm_reg( cp ); +         sse_movups(cp->func, tmp, data); +         data = tmp; +      } +       +      if (!emit_output( cp,  +                        x86_make_disp( cp->outbuf_ECX, offset ), +                        data,  +                        format )) +         return FALSE; + +      aos_release_xmm_reg( cp, data.idx ); + +      cp->insn_counter++; +      debug_printf("\n"); +   } + +   return TRUE; +} + +#endif diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c index 7a02f6334b..4501877efc 100644 --- a/src/gallium/auxiliary/draw/draw_vs_exec.c +++ b/src/gallium/auxiliary/draw/draw_vs_exec.c @@ -179,10 +179,12 @@ draw_create_vs_exec(struct draw_context *draw,     tgsi_scan_shader(state->tokens, &vs->base.info); +   vs->base.draw = draw;     vs->base.prepare = vs_exec_prepare;     vs->base.run_linear = vs_exec_run_linear;     vs->base.delete = vs_exec_delete; -   vs->machine = &draw->machine; +   vs->base.create_varient = draw_vs_varient_generic; +   vs->machine = &draw->vs.machine;     return &vs->base;  } diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c index 171da51dd5..621472ec7c 100644 --- a/src/gallium/auxiliary/draw/draw_vs_llvm.c +++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c @@ -114,7 +114,9 @@ draw_create_vs_llvm(struct draw_context *draw,     tgsi_scan_shader(vs->base.state.tokens, &vs->base.info); +   vs->base.draw = draw;     vs->base.prepare = 
vs_llvm_prepare; +   vs->base.create_varient = draw_vs_varient_generic;     vs->base.run_linear = vs_llvm_run_linear;     vs->base.delete = vs_llvm_delete;     vs->machine = &draw->machine; diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c index e3f4e67472..7781782ae8 100644 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ b/src/gallium/auxiliary/draw/draw_vs_sse.c @@ -47,9 +47,7 @@  #include "tgsi/util/tgsi_parse.h"  #define SSE_MAX_VERTICES 4 -#define SSE_SWIZZLES 1 -#if SSE_SWIZZLES  typedef void (XSTDCALL *codegen_function) (     const struct tgsi_exec_vector *input, /* 1 */     struct tgsi_exec_vector *output, /* 2 */ @@ -62,14 +60,6 @@ typedef void (XSTDCALL *codegen_function) (     float (*aos_output)[4],      /* 9 */     uint num_outputs,            /* 10 */     uint output_stride );        /* 11 */ -#else -typedef void (XSTDCALL *codegen_function) ( -   const struct tgsi_exec_vector *input, -   struct tgsi_exec_vector *output, -   float (*constant)[4], -   struct tgsi_exec_vector *temporary, -   float (*immediates)[4] ); -#endif  struct draw_sse_vertex_shader {     struct draw_vertex_shader base; @@ -111,7 +101,6 @@ vs_sse_run_linear( struct draw_vertex_shader *base,     for (i = 0; i < count; i += MAX_TGSI_VERTICES) {        unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i); -#if SSE_SWIZZLES        /* run compiled shader         */        shader->func(machine->Inputs, @@ -128,43 +117,6 @@ vs_sse_run_linear( struct draw_vertex_shader *base,        input = (const float (*)[4])((const char *)input + input_stride * max_vertices);        output = (float (*)[4])((char *)output + output_stride * max_vertices); -#else -      unsigned int j, slot; - -      /* Swizzle inputs.   
-       */ -      for (j = 0; j < max_vertices; j++) { -         for (slot = 0; slot < base->info.num_inputs; slot++) { -            machine->Inputs[slot].xyzw[0].f[j] = input[slot][0]; -            machine->Inputs[slot].xyzw[1].f[j] = input[slot][1]; -            machine->Inputs[slot].xyzw[2].f[j] = input[slot][2]; -            machine->Inputs[slot].xyzw[3].f[j] = input[slot][3]; -         }  - -	 input = (const float (*)[4])((const char *)input + input_stride); -      } - -      /* run compiled shader -       */ -      shader->func(machine->Inputs, -		   machine->Outputs, -		   (float (*)[4])constants, -		   machine->Temps, -		   shader->immediates); - -      /* Unswizzle all output results.   -       */ -      for (j = 0; j < max_vertices; j++) { -         for (slot = 0; slot < base->info.num_outputs; slot++) { -            output[slot][0] = machine->Outputs[slot].xyzw[0].f[j]; -            output[slot][1] = machine->Outputs[slot].xyzw[1].f[j]; -            output[slot][2] = machine->Outputs[slot].xyzw[2].f[j]; -            output[slot][3] = machine->Outputs[slot].xyzw[3].f[j]; -         }  - -	 output = (float (*)[4])((char *)output + output_stride); -      } -#endif     }  } @@ -203,15 +155,18 @@ draw_create_vs_sse(struct draw_context *draw,     tgsi_scan_shader(templ->tokens, &vs->base.info); +   vs->base.draw = draw; +   vs->base.create_varient = draw_vs_varient_aos_sse; +//   vs->base.create_varient = draw_vs_varient_generic;     vs->base.prepare = vs_sse_prepare;     vs->base.run_linear = vs_sse_run_linear;     vs->base.delete = vs_sse_delete; -   vs->machine = &draw->machine; +   vs->machine = &draw->vs.machine;     x86_init_func( &vs->sse2_program );     if (!tgsi_emit_sse2( (struct tgsi_token *) vs->base.state.tokens, -			&vs->sse2_program, vs->immediates, SSE_SWIZZLES ))  +			&vs->sse2_program, vs->immediates, TRUE ))         goto fail;     vs->func = (codegen_function) x86_get_func( &vs->sse2_program ); diff --git 
a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c new file mode 100644 index 0000000000..119a3a04b5 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_varient.c @@ -0,0 +1,326 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#include "pipe/p_util.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_vbuf.h" +#include "draw/draw_vertex.h" +#include "draw/draw_vs.h" +#include "translate/translate.h" +#include "translate/translate_cache.h" + +/* A first pass at incorporating vertex fetch/emit functionality into  + */ +struct draw_vs_varient_generic { +   struct draw_vs_varient base; + +   struct pipe_viewport_state viewport; + +   struct draw_vertex_shader *shader; +   struct draw_context *draw; +    +   /* Basic plan is to run these two translate functions before/after +    * the vertex shader's existing run_linear() routine to simulate +    * the inclusion of this functionality into the shader...   +    *  +    * Next will look at actually including it. +    */ +   struct translate *fetch; +   struct translate *emit; + +   const float (*constants)[4]; +}; + + + + +static void vsvg_set_constants( struct draw_vs_varient *varient, +                                const float (*constants)[4] ) +{ +   struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient; + +   vsvg->constants = constants; +} + + +static void vsvg_set_input( struct draw_vs_varient *varient, +                            unsigned buffer, +                            const void *ptr, +                            unsigned stride ) +{ +   struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient; + +   vsvg->fetch->set_buffer(vsvg->fetch,  +                           buffer,  +                           ptr,  +                           stride); +} + + +/* Mainly for debug at this stage: + */ +static void do_rhw_viewport( struct draw_vs_varient_generic *vsvg, +                             unsigned count, +                             void *output_buffer ) +{ +   char *ptr = (char 
*)output_buffer; +   const float *scale = vsvg->viewport.scale; +   const float *trans = vsvg->viewport.translate; +   unsigned stride = vsvg->base.key.output_stride; +   unsigned j; + +   for (j = 0; j < count; j++, ptr += stride) { +      float *data = (float *)ptr; +      float w = 1.0f / data[3]; + +      data[0] = data[0] * w * scale[0] + trans[0]; +      data[1] = data[1] * w * scale[1] + trans[1]; +      data[2] = data[2] * w * scale[2] + trans[2]; +      data[3] = w; +   } +} + +static void do_viewport( struct draw_vs_varient_generic *vsvg, +                             unsigned count, +                             void *output_buffer ) +{ +   char *ptr = (char *)output_buffer; +   const float *scale = vsvg->viewport.scale; +   const float *trans = vsvg->viewport.translate; +   unsigned stride = vsvg->base.key.output_stride; +   unsigned j; + +   for (j = 0; j < count; j++, ptr += stride) { +      float *data = (float *)ptr; + +      data[0] = data[0] * scale[0] + trans[0]; +      data[1] = data[1] * scale[1] + trans[1]; +      data[2] = data[2] * scale[2] + trans[2]; +   } +} +                          + +static void vsvg_run_elts( struct draw_vs_varient *varient, +                           const unsigned *elts, +                           unsigned count, +                           void *output_buffer) +{ +   struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient; +			 +   /* Want to do this in small batches for cache locality? 
+    */ +    +   vsvg->fetch->run_elts( vsvg->fetch,  +                          elts, +                          count, +                          output_buffer ); + +   //if (!vsvg->base.vs->is_passthrough)  +   { +      vsvg->base.vs->run_linear( vsvg->base.vs,  +                                 output_buffer, +                                 output_buffer, +                                 vsvg->constants, +                                 count, +                                 vsvg->base.key.output_stride,  +                                 vsvg->base.key.output_stride); + + +      if (vsvg->base.key.clip) { +         /* not really handling clipping, just do the rhw so we can +          * see the results... +          */ +         do_rhw_viewport( vsvg, +                          count, +                          output_buffer ); +      } +      else if (vsvg->base.key.viewport) { +         do_viewport( vsvg, +                      count, +                      output_buffer ); +      } + + +      //if (!vsvg->already_in_emit_format) + +      vsvg->emit->set_buffer( vsvg->emit, +                              0,  +                              output_buffer, +                              vsvg->base.key.output_stride ); + + +      vsvg->emit->run( vsvg->emit, +                       0, count, +                       output_buffer ); +   } +} + + +static void vsvg_run_linear( struct draw_vs_varient *varient, +                                   unsigned start, +                                   unsigned count, +                                   void *output_buffer ) +{ +   struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient; +	 +   //debug_printf("%s %d %d\n", __FUNCTION__, start, count); +    +				 +   vsvg->fetch->run( vsvg->fetch,  +                     start, +                     count, +                     output_buffer ); + +   //if (!vsvg->base.vs->is_passthrough)  +   { +      vsvg->base.vs->run_linear( vsvg->base.vs,  
+                                 output_buffer, +                                 output_buffer, +                                 vsvg->constants, +                                 count, +                                 vsvg->base.key.output_stride,  +                                 vsvg->base.key.output_stride); + +      if (vsvg->base.key.clip) { +         /* not really handling clipping, just do the rhw so we can +          * see the results... +          */ +         do_rhw_viewport( vsvg, +                          count, +                          output_buffer ); +      } +      else if (vsvg->base.key.viewport) { +         do_viewport( vsvg, +                      count, +                      output_buffer ); +      } + +      //if (!vsvg->already_in_emit_format) +      vsvg->emit->set_buffer( vsvg->emit, +                              0,  +                              output_buffer, +                              vsvg->base.key.output_stride ); + +      vsvg->emit->set_buffer( vsvg->emit,  +                              1, +                              &vsvg->draw->rasterizer->point_size, +                              0); + +      vsvg->emit->run( vsvg->emit, +                       0, count, +                       output_buffer ); +   } +} + + + + +static void vsvg_set_viewport( struct draw_vs_varient *varient, +                               const struct pipe_viewport_state *viewport ) +{ +   struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient; + +   vsvg->viewport = *viewport; +} + +static void vsvg_destroy( struct draw_vs_varient *varient ) +{ +   FREE(varient); +} + + +struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs, +                                                 const struct draw_vs_varient_key *key ) +{ +   unsigned i; +   struct translate_key fetch, emit; + +   struct draw_vs_varient_generic *vsvg = CALLOC_STRUCT( draw_vs_varient_generic ); +   if (vsvg == NULL) +      return 
NULL; + +   vsvg->base.key = *key; +   vsvg->base.vs = vs; +   vsvg->base.set_input     = vsvg_set_input; +   vsvg->base.set_constants = vsvg_set_constants; +   vsvg->base.set_viewport  = vsvg_set_viewport; +   vsvg->base.run_elts      = vsvg_run_elts; +   vsvg->base.run_linear    = vsvg_run_linear; +   vsvg->base.destroy       = vsvg_destroy; + + + +   /* Build free-standing fetch and emit functions: +    */ +   fetch.nr_elements = key->nr_inputs; +   fetch.output_stride = 0; +   for (i = 0; i < key->nr_inputs; i++) { +      fetch.element[i].input_format = key->element[i].in.format; +      fetch.element[i].input_buffer = key->element[i].in.buffer; +      fetch.element[i].input_offset = key->element[i].in.offset; +      fetch.element[i].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT; +      fetch.element[i].output_offset = fetch.output_stride; +      fetch.output_stride += 4 * sizeof(float); +   } + + +   emit.nr_elements = key->nr_outputs; +   emit.output_stride = key->output_stride; +   for (i = 0; i < key->nr_outputs; i++) { +      if (key->element[i].out.format != EMIT_1F_PSIZE) +      {       +         emit.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT; +         emit.element[i].input_buffer = 0; +         emit.element[i].input_offset = key->element[i].out.vs_output * 4 * sizeof(float); +         emit.element[i].output_format = draw_translate_vinfo_format(key->element[i].out.format); +         emit.element[i].output_offset = key->element[i].out.offset; +      } +      else { +         emit.element[i].input_format = PIPE_FORMAT_R32_FLOAT; +         emit.element[i].input_buffer = 1; +         emit.element[i].input_offset = 0; +         emit.element[i].output_format = PIPE_FORMAT_R32_FLOAT; +         emit.element[i].output_offset = key->element[i].out.offset; +      } +   } + +   vsvg->fetch = draw_vs_get_fetch( vs->draw, &fetch ); +   vsvg->emit = draw_vs_get_emit( vs->draw, &emit ); + +   return &vsvg->base; +} + + + + + diff --git 
a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c index 4e036d9032..2415b0156b 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c @@ -36,11 +36,8 @@  #define DUMP_SSE  0 -#if DUMP_SSE -static void -_print_reg( -   struct x86_reg reg ) +void x86_print_reg( struct x86_reg reg )  {     if (reg.mod != mod_REG)         debug_printf( "[" ); @@ -77,6 +74,7 @@ _print_reg(        debug_printf( "]" );  } +#if DUMP_SSE  #define DUMP_START() debug_printf( "\n" )  #define DUMP_END() debug_printf( "\n" ) @@ -87,7 +85,7 @@ _print_reg(        foo++;                                    \     if  (*foo)                                   \        foo++;                                    \ -   debug_printf( "\n% 15s ", foo );             \ +   debug_printf( "\n% 4x% 15s ", p->csr - p->store, foo );             \  } while (0)  #define DUMP_I( I ) do {                        \ @@ -97,27 +95,27 @@ _print_reg(  #define DUMP_R( R0 ) do {                       \     DUMP();                                      \ -   _print_reg( R0 );                            \ +   x86_print_reg( R0 );                            \  } while( 0 )  #define DUMP_RR( R0, R1 ) do {                  \     DUMP();                                      \ -   _print_reg( R0 );                            \ +   x86_print_reg( R0 );                            \     debug_printf( ", " );                        \ -   _print_reg( R1 );                            \ +   x86_print_reg( R1 );                            \  } while( 0 )  #define DUMP_RI( R0, I ) do {                   \     DUMP();                                      \ -   _print_reg( R0 );                            \ +   x86_print_reg( R0 );                            \     debug_printf( ", %u", I );                   \  } while( 0 )  #define DUMP_RRI( R0, R1, I ) do {              \     DUMP();                                      \ -   _print_reg( R0 );             
               \ +   x86_print_reg( R0 );                            \     debug_printf( ", " );                        \ -   _print_reg( R1 );                            \ +   x86_print_reg( R1 );                            \     debug_printf( ", %u", I );                   \  } while( 0 ) @@ -220,6 +218,8 @@ static void emit_3ub( struct x86_function *p, unsigned char b0, unsigned char b1  /* Build a modRM byte + possible displacement.  No treatment of SIB   * indexing.  BZZT - no way to encode an absolute address. + * + * This is the "/r" field in the x86 manuals...   */  static void emit_modrm( struct x86_function *p,   			struct x86_reg reg,  @@ -258,7 +258,8 @@ static void emit_modrm( struct x86_function *p,     }  } - +/* Emits the "/0".."/7" specialized versions of the modrm ("/r") bytes. + */  static void emit_modrm_noreg( struct x86_function *p,  			      unsigned op,  			      struct x86_reg regmem ) @@ -367,8 +368,7 @@ void x86_jcc( struct x86_function *p,     DUMP_I(cc);     if (offset < 0) { -      int amt = p->csr - p->store; -      assert(amt > -offset); +      assert(p->csr - p->store > -offset);     }     if (offset <= 127 && offset >= -128) { @@ -445,6 +445,16 @@ void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )     emit_1i(p, imm);  } +void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm ) +{ +   DUMP_RI( dst, imm ); +   assert(dst.mod == mod_REG); +   emit_1ub(p, 0x80); +   emit_modrm_noreg(p, 0, dst); +   emit_1ub(p, imm); +} + +  void x86_push( struct x86_function *p,  	       struct x86_reg reg )  { @@ -461,6 +471,17 @@ void x86_push( struct x86_function *p,     p->stack_offset += 4;  } +void x86_push_imm32( struct x86_function *p, +                     int imm32 ) +{ +   DUMP_I( imm32 ); +   emit_1ub(p, 0x68); +   emit_1i(p,  imm32); + +   p->stack_offset += 4; +} + +  void x86_pop( struct x86_function *p,  	      struct x86_reg reg )  { @@ -988,6 +1009,24 @@ void sse2_movd( struct 
x86_function *p,  /***********************************************************************   * x87 instructions   */ +static void note_x87_pop( struct x86_function *p ) +{ +   p->x87_stack--; +   assert(p->x87_stack >= 0); +} + +static void note_x87_push( struct x86_function *p ) +{ +   p->x87_stack++; +   assert(p->x87_stack <= 7); +} + +void x87_assert_stack_empty( struct x86_function *p ) +{ +   assert (p->x87_stack == 0); +} + +  void x87_fist( struct x86_function *p, struct x86_reg dst )  {     DUMP_R( dst ); @@ -1000,6 +1039,7 @@ void x87_fistp( struct x86_function *p, struct x86_reg dst )     DUMP_R( dst );     emit_1ub(p, 0xdb);     emit_modrm_noreg(p, 3, dst); +   note_x87_pop(p);  }  void x87_fild( struct x86_function *p, struct x86_reg arg ) @@ -1007,12 +1047,14 @@ void x87_fild( struct x86_function *p, struct x86_reg arg )     DUMP_R( arg );     emit_1ub(p, 0xdf);     emit_modrm_noreg(p, 0, arg); +   note_x87_push(p);  }  void x87_fldz( struct x86_function *p )  {     DUMP();     emit_2ub(p, 0xd9, 0xee); +   note_x87_push(p);  } @@ -1029,18 +1071,21 @@ void x87_fld1( struct x86_function *p )  {     DUMP();     emit_2ub(p, 0xd9, 0xe8); +   note_x87_push(p);  }  void x87_fldl2e( struct x86_function *p )  {     DUMP();     emit_2ub(p, 0xd9, 0xea); +   note_x87_push(p);  }  void x87_fldln2( struct x86_function *p )  {     DUMP();     emit_2ub(p, 0xd9, 0xed); +   note_x87_push(p);  }  void x87_fwait( struct x86_function *p ) @@ -1061,6 +1106,49 @@ void x87_fclex( struct x86_function *p )     x87_fnclex(p);  } +void x87_fcmovb( struct x86_function *p, struct x86_reg arg ) +{ +   DUMP_R( arg ); +   assert(arg.file == file_x87); +   emit_2ub(p, 0xda, 0xc0+arg.idx); +} + +void x87_fcmove( struct x86_function *p, struct x86_reg arg ) +{ +   DUMP_R( arg ); +   assert(arg.file == file_x87); +   emit_2ub(p, 0xda, 0xc8+arg.idx); +} + +void x87_fcmovbe( struct x86_function *p, struct x86_reg arg ) +{ +   DUMP_R( arg ); +   assert(arg.file == file_x87); +   emit_2ub(p, 
0xda, 0xd0+arg.idx); +} + +void x87_fcmovnb( struct x86_function *p, struct x86_reg arg ) +{ +   DUMP_R( arg ); +   assert(arg.file == file_x87); +   emit_2ub(p, 0xdb, 0xc0+arg.idx); +} + +void x87_fcmovne( struct x86_function *p, struct x86_reg arg ) +{ +   DUMP_R( arg ); +   assert(arg.file == file_x87); +   emit_2ub(p, 0xdb, 0xc8+arg.idx); +} + +void x87_fcmovnbe( struct x86_function *p, struct x86_reg arg ) +{ +   DUMP_R( arg ); +   assert(arg.file == file_x87); +   emit_2ub(p, 0xdb, 0xd0+arg.idx); +} + +  static void x87_arith_op( struct x86_function *p, struct x86_reg dst, struct x86_reg arg,  			  unsigned char dst0ub0, @@ -1148,6 +1236,7 @@ void x87_fmulp( struct x86_function *p, struct x86_reg dst )     assert(dst.file == file_x87);     assert(dst.idx >= 1);     emit_2ub(p, 0xde, 0xc8+dst.idx); +   note_x87_pop(p);  }  void x87_fsubp( struct x86_function *p, struct x86_reg dst ) @@ -1156,6 +1245,7 @@ void x87_fsubp( struct x86_function *p, struct x86_reg dst )     assert(dst.file == file_x87);     assert(dst.idx >= 1);     emit_2ub(p, 0xde, 0xe8+dst.idx); +   note_x87_pop(p);  }  void x87_fsubrp( struct x86_function *p, struct x86_reg dst ) @@ -1164,6 +1254,7 @@ void x87_fsubrp( struct x86_function *p, struct x86_reg dst )     assert(dst.file == file_x87);     assert(dst.idx >= 1);     emit_2ub(p, 0xde, 0xe0+dst.idx); +   note_x87_pop(p);  }  void x87_faddp( struct x86_function *p, struct x86_reg dst ) @@ -1172,6 +1263,7 @@ void x87_faddp( struct x86_function *p, struct x86_reg dst )     assert(dst.file == file_x87);     assert(dst.idx >= 1);     emit_2ub(p, 0xde, 0xc0+dst.idx); +   note_x87_pop(p);  }  void x87_fdivp( struct x86_function *p, struct x86_reg dst ) @@ -1180,6 +1272,7 @@ void x87_fdivp( struct x86_function *p, struct x86_reg dst )     assert(dst.file == file_x87);     assert(dst.idx >= 1);     emit_2ub(p, 0xde, 0xf8+dst.idx); +   note_x87_pop(p);  }  void x87_fdivrp( struct x86_function *p, struct x86_reg dst ) @@ -1188,6 +1281,13 @@ void 
x87_fdivrp( struct x86_function *p, struct x86_reg dst )     assert(dst.file == file_x87);     assert(dst.idx >= 1);     emit_2ub(p, 0xde, 0xf0+dst.idx); +   note_x87_pop(p); +} + +void x87_ftst( struct x86_function *p ) +{ +   DUMP(); +   emit_2ub(p, 0xd9, 0xe4);  }  void x87_fucom( struct x86_function *p, struct x86_reg arg ) @@ -1202,12 +1302,15 @@ void x87_fucomp( struct x86_function *p, struct x86_reg arg )     DUMP_R( arg );     assert(arg.file == file_x87);     emit_2ub(p, 0xdd, 0xe8+arg.idx); +   note_x87_pop(p);  }  void x87_fucompp( struct x86_function *p )  {     DUMP();     emit_2ub(p, 0xda, 0xe9); +   note_x87_pop(p);             /* pop twice */ +   note_x87_pop(p);             /* pop twice */  }  void x87_fxch( struct x86_function *p, struct x86_reg arg ) @@ -1289,6 +1392,7 @@ void x87_fyl2x( struct x86_function *p )  {     DUMP();     emit_2ub(p, 0xd9, 0xf1); +   note_x87_pop(p);  }  /* st1 = st1 * log2(st0 + 1.0); @@ -1300,6 +1404,7 @@ void x87_fyl2xp1( struct x86_function *p )  {     DUMP();     emit_2ub(p, 0xd9, 0xf9); +   note_x87_pop(p);  } @@ -1312,6 +1417,7 @@ void x87_fld( struct x86_function *p, struct x86_reg arg )        emit_1ub(p, 0xd9);        emit_modrm_noreg(p, 0, arg);     } +   note_x87_push(p);  }  void x87_fst( struct x86_function *p, struct x86_reg dst ) @@ -1334,8 +1440,15 @@ void x87_fstp( struct x86_function *p, struct x86_reg dst )        emit_1ub(p, 0xd9);        emit_modrm_noreg(p, 3, dst);     } +   note_x87_pop(p);  } +void x87_fpop( struct x86_function *p ) +{ +   x87_fstp( p, x86_make_reg( file_x87, 0 )); +} + +  void x87_fcom( struct x86_function *p, struct x86_reg dst )  {     DUMP_R( dst ); @@ -1347,6 +1460,7 @@ void x87_fcom( struct x86_function *p, struct x86_reg dst )     }  } +  void x87_fcomp( struct x86_function *p, struct x86_reg dst )  {     DUMP_R( dst ); @@ -1356,6 +1470,20 @@ void x87_fcomp( struct x86_function *p, struct x86_reg dst )        emit_1ub(p, 0xd8);        emit_modrm_noreg(p, 3, dst);     } +   
note_x87_pop(p); +} + +void x87_fcomi( struct x86_function *p, struct x86_reg arg ) +{ +   DUMP_R( arg ); +   emit_2ub(p, 0xdb, 0xf0+arg.idx); +} + +void x87_fcomip( struct x86_function *p, struct x86_reg arg ) +{ +   DUMP_R( arg ); +   emit_2ub(p, 0xdb, 0xf0+arg.idx); +   note_x87_pop(p);  } @@ -1374,6 +1502,17 @@ void x87_fnstsw( struct x86_function *p, struct x86_reg dst )  } +void x87_fnstcw( struct x86_function *p, struct x86_reg dst ) +{ +   DUMP_R( dst ); +   assert(dst.file == file_REG32); + +   emit_1ub(p, 0x9b);           /* WAIT -- needed? */ +   emit_1ub(p, 0xd9); +   emit_modrm_noreg(p, 7, dst); +} + +  /*********************************************************************** @@ -1442,6 +1581,21 @@ void mmx_movq( struct x86_function *p,   */ +void x86_cdecl_caller_push_regs( struct x86_function *p ) +{ +   x86_push(p, x86_make_reg(file_REG32, reg_AX)); +   x86_push(p, x86_make_reg(file_REG32, reg_CX)); +   x86_push(p, x86_make_reg(file_REG32, reg_DX)); +} + +void x86_cdecl_caller_pop_regs( struct x86_function *p ) +{ +   x86_pop(p, x86_make_reg(file_REG32, reg_DX)); +   x86_pop(p, x86_make_reg(file_REG32, reg_CX)); +   x86_pop(p, x86_make_reg(file_REG32, reg_AX)); +} + +  /* Retreive a reference to one of the function arguments, taking into   * account any push/pop activity:   */ diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h index eacaeeaf6f..63e812fac9 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h @@ -41,10 +41,12 @@ struct x86_function {     unsigned size;     unsigned char *store;     unsigned char *csr; -   unsigned stack_offset; -   int need_emms; + +   unsigned stack_offset:16; +   unsigned need_emms:8; +   int x87_stack:8; +     unsigned char error_overflow[4]; -   const char *fn;  };  enum x86_reg_file { @@ -107,6 +109,9 @@ void x86_init_func_size( struct x86_function *p, unsigned code_size );  void x86_release_func( struct 
x86_function *p );  void (*x86_get_func( struct x86_function *p ))( void ); +/* Debugging: + */ +void x86_print_reg( struct x86_reg reg );  /* Create and manipulate registers and regmem values: @@ -150,6 +155,7 @@ void x86_call( struct x86_function *p, struct x86_reg reg);   * I load the immediate into general purpose register and use it.   */  void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm );  /* Macro for sse_shufps() and sse2_pshufd(): @@ -220,6 +226,7 @@ void x86_imul( struct x86_function *p, struct x86_reg dst, struct x86_reg src );  void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src );  void x86_pop( struct x86_function *p, struct x86_reg reg );  void x86_push( struct x86_function *p, struct x86_reg reg ); +void x86_push_imm32( struct x86_function *p, int imm );  void x86_ret( struct x86_function *p );  void x86_retw( struct x86_function *p, unsigned short imm );  void x86_sub( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); @@ -227,13 +234,27 @@ void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src );  void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src );  void x86_sahf( struct x86_function *p ); + +void x86_cdecl_caller_push_regs( struct x86_function *p ); +void x86_cdecl_caller_pop_regs( struct x86_function *p ); + +void x87_assert_stack_empty( struct x86_function *p ); +  void x87_f2xm1( struct x86_function *p );  void x87_fabs( struct x86_function *p );  void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );  void x87_faddp( struct x86_function *p, struct x86_reg dst );  void x87_fchs( struct x86_function *p );  void x87_fclex( struct x86_function *p ); +void x87_fcmovb( struct x86_function *p, struct x86_reg src ); +void x87_fcmovbe( struct x86_function *p, struct x86_reg src ); +void x87_fcmove( struct x86_function *p, struct 
x86_reg src ); +void x87_fcmovnb( struct x86_function *p, struct x86_reg src ); +void x87_fcmovnbe( struct x86_function *p, struct x86_reg src ); +void x87_fcmovne( struct x86_function *p, struct x86_reg src );  void x87_fcom( struct x86_function *p, struct x86_reg dst ); +void x87_fcomi( struct x86_function *p, struct x86_reg dst ); +void x87_fcomip( struct x86_function *p, struct x86_reg dst );  void x87_fcomp( struct x86_function *p, struct x86_reg dst );  void x87_fcos( struct x86_function *p );  void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); @@ -253,6 +274,7 @@ void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );  void x87_fmulp( struct x86_function *p, struct x86_reg dst );  void x87_fnclex( struct x86_function *p );  void x87_fprndint( struct x86_function *p ); +void x87_fpop( struct x86_function *p );  void x87_fscale( struct x86_function *p );  void x87_fsin( struct x86_function *p );  void x87_fsincos( struct x86_function *p ); @@ -263,11 +285,13 @@ void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );  void x87_fsubp( struct x86_function *p, struct x86_reg dst );  void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );  void x87_fsubrp( struct x86_function *p, struct x86_reg dst ); +void x87_ftst( struct x86_function *p );  void x87_fxch( struct x86_function *p, struct x86_reg dst );  void x87_fxtract( struct x86_function *p );  void x87_fyl2x( struct x86_function *p );  void x87_fyl2xp1( struct x86_function *p );  void x87_fwait( struct x86_function *p ); +void x87_fnstcw( struct x86_function *p, struct x86_reg dst );  void x87_fnstsw( struct x86_function *p, struct x86_reg dst );  void x87_fucompp( struct x86_function *p );  void x87_fucomp( struct x86_function *p, struct x86_reg arg ); diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_dump.c b/src/gallium/auxiliary/tgsi/util/tgsi_dump.c index 4c65ffd780..648afa2a51 100644 --- 
a/src/gallium/auxiliary/tgsi/util/tgsi_dump.c +++ b/src/gallium/auxiliary/tgsi/util/tgsi_dump.c @@ -539,9 +539,9 @@ static const char *TGSI_MODULATES[] =     "MODULATE_EIGHTH"  }; -static void -dump_declaration_short( -   struct tgsi_full_declaration  *decl ) +void +tgsi_dump_declaration( +   const struct tgsi_full_declaration  *decl )  {     TXT( "\nDCL " );     ENM( decl->Declaration.File, TGSI_FILES_SHORT ); @@ -672,9 +672,9 @@ dump_declaration_verbose(     }  } -static void -dump_immediate_short( -   struct tgsi_full_immediate *imm ) +void +tgsi_dump_immediate( +   const struct tgsi_full_immediate *imm )  {     unsigned i; @@ -727,9 +727,9 @@ dump_immediate_verbose(     }  } -static void -dump_instruction_short( -   struct tgsi_full_instruction  *inst, +void +tgsi_dump_instruction( +   const struct tgsi_full_instruction  *inst,     unsigned                      instno )  {     unsigned i; @@ -1281,17 +1281,17 @@ tgsi_dump(        switch( parse.FullToken.Token.Type ) {        case TGSI_TOKEN_TYPE_DECLARATION: -         dump_declaration_short( +         tgsi_dump_declaration(              &parse.FullToken.FullDeclaration );           break;        case TGSI_TOKEN_TYPE_IMMEDIATE: -         dump_immediate_short( +         tgsi_dump_immediate(              &parse.FullToken.FullImmediate );           break;        case TGSI_TOKEN_TYPE_INSTRUCTION: -         dump_instruction_short( +         tgsi_dump_instruction(              &parse.FullToken.FullInstruction,              instno );           instno++; diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_dump.h b/src/gallium/auxiliary/tgsi/util/tgsi_dump.h index beb0155d56..ca83bdef20 100644 --- a/src/gallium/auxiliary/tgsi/util/tgsi_dump.h +++ b/src/gallium/auxiliary/tgsi/util/tgsi_dump.h @@ -14,6 +14,24 @@ tgsi_dump(     const struct tgsi_token *tokens,     unsigned                flags ); +struct tgsi_full_immediate; +struct tgsi_full_instruction; +struct tgsi_full_declaration; + +void +tgsi_dump_immediate( +   const 
struct tgsi_full_immediate *imm ); + +void +tgsi_dump_instruction( +   const struct tgsi_full_instruction  *inst, +   unsigned                      instno ); + +void +tgsi_dump_declaration( +   const struct tgsi_full_declaration  *decl ); + +  #if defined __cplusplus  }  #endif diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h index b8210af50c..c3b754a902 100644 --- a/src/gallium/auxiliary/translate/translate.h +++ b/src/gallium/auxiliary/translate/translate.h @@ -71,15 +71,15 @@ struct translate {  		       const void *ptr,  		       unsigned stride ); -   void (*run_elts)( struct translate *, -		     const unsigned *elts, -		     unsigned count, -		     void *output_buffer); +   void (PIPE_CDECL *run_elts)( struct translate *, +                                const unsigned *elts, +                                unsigned count, +                                void *output_buffer); -   void (*run)( struct translate *, -		unsigned start, -		unsigned count, -		void *output_buffer); +   void (PIPE_CDECL *run)( struct translate *, +                           unsigned start, +                           unsigned count, +                           void *output_buffer);  }; diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c index 402780ee53..a25d94f2ca 100644 --- a/src/gallium/auxiliary/translate/translate_generic.c +++ b/src/gallium/auxiliary/translate/translate_generic.c @@ -541,10 +541,10 @@ static emit_func get_emit_func( enum pipe_format format )  /**   * Fetch vertex attributes for 'count' vertices.   
*/ -static void generic_run_elts( struct translate *translate, -			      const unsigned *elts, -			      unsigned count, -			      void *output_buffer ) +static void PIPE_CDECL generic_run_elts( struct translate *translate, +                                         const unsigned *elts, +                                         unsigned count, +                                         void *output_buffer )  {     struct translate_generic *tg = translate_generic(translate);     char *vert = output_buffer; @@ -580,10 +580,10 @@ static void generic_run_elts( struct translate *translate, -static void generic_run( struct translate *translate, -			 unsigned start, -			 unsigned count, -			 void *output_buffer ) +static void PIPE_CDECL generic_run( struct translate *translate, +                                    unsigned start, +                                    unsigned count, +                                    void *output_buffer )  {     struct translate_generic *tg = translate_generic(translate);     char *vert = output_buffer; diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c index a54ac5a82f..2fc8b9d3d0 100644 --- a/src/gallium/auxiliary/translate/translate_sse.c +++ b/src/gallium/auxiliary/translate/translate_sse.c @@ -45,22 +45,16 @@  #define W    3 -#ifdef WIN32 -#define RTASM __cdecl -#else -#define RTASM -#endif - -typedef void (RTASM *run_func)( struct translate *translate, -                                unsigned start, -                                unsigned count, -                                void *output_buffer ); - -typedef void (RTASM *run_elts_func)( struct translate *translate, -                                     const unsigned *elts, +typedef void (PIPE_CDECL *run_func)( struct translate *translate, +                                     unsigned start,                                       unsigned count,                                       void *output_buffer ); +typedef 
void (PIPE_CDECL *run_elts_func)( struct translate *translate, +                                          const unsigned *elts, +                                          unsigned count, +                                          void *output_buffer ); +  struct translate_sse { @@ -472,13 +466,7 @@ static boolean build_vertex_emit( struct translate_sse *p,     x86_lea(p->func, vertexECX, x86_make_disp(vertexECX, p->translate.key.output_stride));     /* Incr index -    */   /* Emit code for each of the attributes.  Currently routes -    * everything through SSE registers, even when it might be more -    * efficient to stick with regular old x86.  No optimization or -    * other tricks - enough new ground to cover here just getting -    * things working. -    */ - +    */      if (linear) {        x86_inc(p->func, idxEBX);     }   | 
