diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/gallium/drivers/llvmpipe/lp_jit.c | 16 | ||||
| -rw-r--r-- | src/gallium/drivers/llvmpipe/lp_jit.h | 26 | ||||
| -rw-r--r-- | src/gallium/drivers/llvmpipe/lp_perf.c | 39 | ||||
| -rw-r--r-- | src/gallium/drivers/llvmpipe/lp_perf.h | 1 | ||||
| -rw-r--r-- | src/gallium/drivers/llvmpipe/lp_rast.c | 126 | ||||
| -rw-r--r-- | src/gallium/drivers/llvmpipe/lp_rast.h | 82 | ||||
| -rw-r--r-- | src/gallium/drivers/llvmpipe/lp_rast_priv.h | 63 | ||||
| -rw-r--r-- | src/gallium/drivers/llvmpipe/lp_rast_tri.c | 179 | ||||
| -rw-r--r-- | src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h | 238 | ||||
| -rw-r--r-- | src/gallium/drivers/llvmpipe/lp_setup.c | 24 | ||||
| -rw-r--r-- | src/gallium/drivers/llvmpipe/lp_setup_context.h | 1 | ||||
| -rw-r--r-- | src/gallium/drivers/llvmpipe/lp_setup_tri.c | 603 | ||||
| -rw-r--r-- | src/gallium/drivers/llvmpipe/lp_state_fs.c | 286 | ||||
| -rw-r--r-- | src/gallium/drivers/llvmpipe/lp_state_fs.h | 1 | 
14 files changed, 898 insertions, 787 deletions
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c index 23aa34ddec..8e6dfb293d 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.c +++ b/src/gallium/drivers/llvmpipe/lp_jit.c @@ -103,10 +103,6 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)        elem_types[LP_JIT_CTX_ALPHA_REF] = LLVMFloatType();        elem_types[LP_JIT_CTX_STENCIL_REF_FRONT] = LLVMInt32Type();        elem_types[LP_JIT_CTX_STENCIL_REF_BACK] = LLVMInt32Type(); -      elem_types[LP_JIT_CTX_SCISSOR_XMIN] = LLVMFloatType(); -      elem_types[LP_JIT_CTX_SCISSOR_YMIN] = LLVMFloatType(); -      elem_types[LP_JIT_CTX_SCISSOR_XMAX] = LLVMFloatType(); -      elem_types[LP_JIT_CTX_SCISSOR_YMAX] = LLVMFloatType();        elem_types[LP_JIT_CTX_BLEND_COLOR] = LLVMPointerType(LLVMInt8Type(), 0);        elem_types[LP_JIT_CTX_TEXTURES] = LLVMArrayType(texture_type,                                                        PIPE_MAX_SAMPLERS); @@ -125,18 +121,6 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)        LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, stencil_ref_back,                               screen->target, context_type,                               LP_JIT_CTX_STENCIL_REF_BACK); -      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_xmin, -                             screen->target, context_type, -                             LP_JIT_CTX_SCISSOR_XMIN); -      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_ymin, -                             screen->target, context_type, -                             LP_JIT_CTX_SCISSOR_YMIN); -      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_xmax, -                             screen->target, context_type, -                             LP_JIT_CTX_SCISSOR_XMAX); -      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_ymax, -                             screen->target, context_type, -                             LP_JIT_CTX_SCISSOR_YMAX);        LP_CHECK_MEMBER_OFFSET(struct 
lp_jit_context, blend_color,                               screen->target, context_type,                               LP_JIT_CTX_BLEND_COLOR); diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h index 8d06e65725..c94189413a 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.h +++ b/src/gallium/drivers/llvmpipe/lp_jit.h @@ -89,9 +89,6 @@ struct lp_jit_context     uint32_t stencil_ref_front, stencil_ref_back; -   /** floats, not ints */ -   float scissor_xmin, scissor_ymin, scissor_xmax, scissor_ymax; -     /* FIXME: store (also?) in floats */     uint8_t *blend_color; @@ -108,10 +105,6 @@ enum {     LP_JIT_CTX_ALPHA_REF,     LP_JIT_CTX_STENCIL_REF_FRONT,     LP_JIT_CTX_STENCIL_REF_BACK, -   LP_JIT_CTX_SCISSOR_XMIN, -   LP_JIT_CTX_SCISSOR_YMIN, -   LP_JIT_CTX_SCISSOR_XMAX, -   LP_JIT_CTX_SCISSOR_YMAX,     LP_JIT_CTX_BLEND_COLOR,     LP_JIT_CTX_TEXTURES,     LP_JIT_CTX_COUNT @@ -130,18 +123,6 @@ enum {  #define lp_jit_context_stencil_ref_back_value(_builder, _ptr) \     lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_STENCIL_REF_BACK, "stencil_ref_back") -#define lp_jit_context_scissor_xmin_value(_builder, _ptr) \ -   lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_SCISSOR_XMIN, "scissor_xmin") - -#define lp_jit_context_scissor_ymin_value(_builder, _ptr) \ -   lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_SCISSOR_YMIN, "scissor_ymin") - -#define lp_jit_context_scissor_xmax_value(_builder, _ptr) \ -   lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_SCISSOR_XMAX, "scissor_xmax") - -#define lp_jit_context_scissor_ymax_value(_builder, _ptr) \ -   lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_SCISSOR_YMAX, "scissor_ymax") -  #define lp_jit_context_blend_color(_builder, _ptr) \     lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_BLEND_COLOR, "blend_color") @@ -160,12 +141,7 @@ typedef void                      const void *dady,                      uint8_t **color,                      void *depth, -                    const int32_t c1, - 
                   const int32_t c2, -                    const int32_t c3, -                    const int32_t *step1, -                    const int32_t *step2, -                    const int32_t *step3, +                    uint32_t mask,                      uint32_t *counter); diff --git a/src/gallium/drivers/llvmpipe/lp_perf.c b/src/gallium/drivers/llvmpipe/lp_perf.c index a316597675..083e7e30a5 100644 --- a/src/gallium/drivers/llvmpipe/lp_perf.c +++ b/src/gallium/drivers/llvmpipe/lp_perf.c @@ -46,10 +46,10 @@ lp_print_counters(void)  {     if (LP_DEBUG & DEBUG_COUNTERS) {        unsigned total_64, total_16, total_4; -      float p1, p2, p3; +      float p1, p2, p3, p4; -      debug_printf("llvmpipe: nr_triangles:               %9u\n", lp_count.nr_tris); -      debug_printf("llvmpipe: nr_culled_triangles:        %9u\n", lp_count.nr_culled_tris); +      debug_printf("llvmpipe: nr_triangles:                 %9u\n", lp_count.nr_tris); +      debug_printf("llvmpipe: nr_culled_triangles:          %9u\n", lp_count.nr_culled_tris);        total_64 = (lp_count.nr_empty_64 +                     lp_count.nr_fully_covered_64 + @@ -58,10 +58,13 @@ lp_print_counters(void)        p1 = 100.0 * (float) lp_count.nr_empty_64 / (float) total_64;        p2 = 100.0 * (float) lp_count.nr_fully_covered_64 / (float) total_64;        p3 = 100.0 * (float) lp_count.nr_partially_covered_64 / (float) total_64; +      p4 = 100.0 * (float) lp_count.nr_shade_opaque_64 / (float) total_64; -      debug_printf("llvmpipe: nr_empty_64x64:             %9u (%2.0f%% of %u)\n", lp_count.nr_empty_64, p1, total_64); -      debug_printf("llvmpipe: nr_fully_covered_64x64:     %9u (%2.0f%% of %u)\n", lp_count.nr_fully_covered_64, p2, total_64); -      debug_printf("llvmpipe: nr_partially_covered_64x64: %9u (%2.0f%% of %u)\n", lp_count.nr_partially_covered_64, p3, total_64); +      debug_printf("llvmpipe: nr_64x64:                     %9u\n", total_64); +      debug_printf("llvmpipe:   
nr_fully_covered_64x64:     %9u (%3.0f%% of %u)\n", lp_count.nr_fully_covered_64, p2, total_64); +      debug_printf("llvmpipe:     nr_shade_opaque_64x64:    %9u (%3.0f%% of %u)\n", lp_count.nr_shade_opaque_64, p4, total_64); +      debug_printf("llvmpipe:   nr_partially_covered_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_64, p3, total_64); +      debug_printf("llvmpipe:   nr_empty_64x64:             %9u (%3.0f%% of %u)\n", lp_count.nr_empty_64, p1, total_64);        total_16 = (lp_count.nr_empty_16 +                     lp_count.nr_fully_covered_16 + @@ -71,25 +74,27 @@ lp_print_counters(void)        p2 = 100.0 * (float) lp_count.nr_fully_covered_16 / (float) total_16;        p3 = 100.0 * (float) lp_count.nr_partially_covered_16 / (float) total_16; -      debug_printf("llvmpipe: nr_empty_16x16:             %9u (%2.0f%% of %u)\n", lp_count.nr_empty_16, p1, total_16); -      debug_printf("llvmpipe: nr_fully_covered_16x16:     %9u (%2.0f%% of %u)\n", lp_count.nr_fully_covered_16, p2, total_16); -      debug_printf("llvmpipe: nr_partially_covered_16x16: %9u (%2.0f%% of %u)\n", lp_count.nr_partially_covered_16, p3, total_16); +      debug_printf("llvmpipe: nr_16x16:                     %9u\n", total_16); +      debug_printf("llvmpipe:   nr_fully_covered_16x16:     %9u (%3.0f%% of %u)\n", lp_count.nr_fully_covered_16, p2, total_16); +      debug_printf("llvmpipe:   nr_partially_covered_16x16: %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_16, p3, total_16); +      debug_printf("llvmpipe:   nr_empty_16x16:             %9u (%3.0f%% of %u)\n", lp_count.nr_empty_16, p1, total_16);        total_4 = (lp_count.nr_empty_4 + lp_count.nr_non_empty_4);        p1 = 100.0 * (float) lp_count.nr_empty_4 / (float) total_4;        p2 = 100.0 * (float) lp_count.nr_non_empty_4 / (float) total_4; -      debug_printf("llvmpipe: nr_empty_4x4:               %9u (%2.0f%% of %u)\n", lp_count.nr_empty_4, p1, total_4); -      debug_printf("llvmpipe: nr_non_empty_4x4:       
    %9u (%2.0f%% of %u)\n", lp_count.nr_non_empty_4, p2, total_4); +      debug_printf("llvmpipe: nr_4x4:                       %9u\n", total_4); +      debug_printf("llvmpipe:   nr_empty_4x4:               %9u (%3.0f%% of %u)\n", lp_count.nr_empty_4, p1, total_4); +      debug_printf("llvmpipe:   nr_non_empty_4x4:           %9u (%3.0f%% of %u)\n", lp_count.nr_non_empty_4, p2, total_4); -      debug_printf("llvmpipe: nr_color_tile_clear:        %9u\n", lp_count.nr_color_tile_clear); -      debug_printf("llvmpipe: nr_color_tile_load:         %9u\n", lp_count.nr_color_tile_load); -      debug_printf("llvmpipe: nr_color_tile_store:        %9u\n", lp_count.nr_color_tile_store); +      debug_printf("llvmpipe: nr_color_tile_clear:          %9u\n", lp_count.nr_color_tile_clear); +      debug_printf("llvmpipe: nr_color_tile_load:           %9u\n", lp_count.nr_color_tile_load); +      debug_printf("llvmpipe: nr_color_tile_store:          %9u\n", lp_count.nr_color_tile_store); -      debug_printf("llvmpipe: nr_llvm_compiles:           %u\n", lp_count.nr_llvm_compiles); -      debug_printf("llvmpipe: total LLVM compile time:    %.2f sec\n", lp_count.llvm_compile_time / 1000000.0); -      debug_printf("llvmpipe: average LLVM compile time:  %.2f sec\n", lp_count.llvm_compile_time / 1000000.0 / lp_count.nr_llvm_compiles); +      debug_printf("llvmpipe: nr_llvm_compiles:             %u\n", lp_count.nr_llvm_compiles); +      debug_printf("llvmpipe: total LLVM compile time:      %.2f sec\n", lp_count.llvm_compile_time / 1000000.0); +      debug_printf("llvmpipe: average LLVM compile time:    %.2f sec\n", lp_count.llvm_compile_time / 1000000.0 / lp_count.nr_llvm_compiles);     }  } diff --git a/src/gallium/drivers/llvmpipe/lp_perf.h b/src/gallium/drivers/llvmpipe/lp_perf.h index a9629dae3c..4774f64550 100644 --- a/src/gallium/drivers/llvmpipe/lp_perf.h +++ b/src/gallium/drivers/llvmpipe/lp_perf.h @@ -44,6 +44,7 @@ struct lp_counters     unsigned nr_empty_64;     unsigned 
nr_fully_covered_64;     unsigned nr_partially_covered_64; +   unsigned nr_shade_opaque_64;     unsigned nr_empty_16;     unsigned nr_fully_covered_16;     unsigned nr_partially_covered_16; diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c index 1dde327836..0130e39fd8 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/src/gallium/drivers/llvmpipe/lp_rast.c @@ -28,6 +28,7 @@  #include <limits.h>  #include "util/u_memory.h"  #include "util/u_math.h" +#include "util/u_rect.h"  #include "util/u_surface.h"  #include "lp_scene_queue.h" @@ -136,7 +137,6 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task,     struct lp_rasterizer *rast = task->rast;     struct lp_scene *scene = rast->curr_scene;     enum lp_texture_usage usage; -   unsigned buf;     LP_DBG(DEBUG_RAST, "%s %d,%d\n", __FUNCTION__, x, y); @@ -146,24 +146,8 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task,     task->x = x;     task->y = y; -   if (scene->has_color_clear) -      usage = LP_TEX_USAGE_WRITE_ALL; -   else -      usage = LP_TEX_USAGE_READ_WRITE; - -   /* get pointers to color tile(s) */ -   for (buf = 0; buf < rast->state.nr_cbufs; buf++) { -      struct pipe_surface *cbuf = rast->curr_scene->fb.cbufs[buf]; -      struct llvmpipe_resource *lpt; -      assert(cbuf); -      lpt = llvmpipe_resource(cbuf->texture); -      task->color_tiles[buf] = llvmpipe_get_texture_tile(lpt, -                                                         cbuf->face + cbuf->zslice, -                                                         cbuf->level, -                                                         usage, -                                                         x, y); -      assert(task->color_tiles[buf]); -   } +   /* reset pointers to color tile(s) */ +   memset(task->color_tiles, 0, sizeof(task->color_tiles));     /* get pointer to depth/stencil tile */     { @@ -222,7 +206,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,         clear_color[2] 
== clear_color[3]) {        /* clear to grayscale value {x, x, x, x} */        for (i = 0; i < rast->state.nr_cbufs; i++) { -         uint8_t *ptr = task->color_tiles[i]; +         uint8_t *ptr = +            lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL);  	 memset(ptr, clear_color[0], TILE_SIZE * TILE_SIZE * 4);        }     } @@ -234,7 +219,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,         */        const unsigned chunk = TILE_SIZE / 4;        for (i = 0; i < rast->state.nr_cbufs; i++) { -         uint8_t *c = task->color_tiles[i]; +         uint8_t *c = +            lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL);           unsigned j;           for (j = 0; j < 4 * TILE_SIZE; j++) { @@ -378,8 +364,8 @@ lp_rast_load_color(struct lp_rasterizer_task *task,   * This is a bin command which is stored in all bins.   */  void -lp_rast_store_color( struct lp_rasterizer_task *task, -                     const union lp_rast_cmd_arg arg) +lp_rast_store_linear_color( struct lp_rasterizer_task *task, +                            const union lp_rast_cmd_arg arg)  {     struct lp_rasterizer *rast = task->rast;     struct lp_scene *scene = rast->curr_scene; @@ -448,30 +434,54 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,           /* run shader on 4x4 block */           variant->jit_function[RAST_WHOLE]( &state->jit_context, -                                          tile_x + x, tile_y + y, -                                          inputs->facing, -                                          inputs->a0, -                                          inputs->dadx, -                                          inputs->dady, -                                          color, -                                          depth, -                                          INT_MIN, INT_MIN, INT_MIN, -                                          NULL, NULL, NULL, &task->vis_counter); +                                            tile_x + x, 
tile_y + y, +                                            inputs->facing, +                                            inputs->a0, +                                            inputs->dadx, +                                            inputs->dady, +                                            color, +                                            depth, +                                            0xffff, +                                            &task->vis_counter);        }     }  }  /** - * Compute shading for a 4x4 block of pixels. + * Run the shader on all blocks in a tile.  This is used when a tile is + * completely contained inside a triangle, and the shader is opaque. + * This is a bin command called during bin processing. + */ +void +lp_rast_shade_tile_opaque(struct lp_rasterizer_task *task, +                          const union lp_rast_cmd_arg arg) +{ +   struct lp_rasterizer *rast = task->rast; +   unsigned i; + +   LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__); + +   /* this will prevent converting the layout from tiled to linear */ +   for (i = 0; i < rast->state.nr_cbufs; i++) { +      (void)lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL); +   } + +   lp_rast_shade_tile(task, arg); +} + + +/** + * Compute shading for a 4x4 block of pixels inside a triangle.   * This is a bin command called during bin processing.   
* \param x  X position of quad in window coords   * \param y  Y position of quad in window coords   */ -void lp_rast_shade_quads( struct lp_rasterizer_task *task, -                          const struct lp_rast_shader_inputs *inputs, -                          unsigned x, unsigned y, -                          int32_t c1, int32_t c2, int32_t c3) +void +lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, +                         const struct lp_rast_shader_inputs *inputs, +                         unsigned x, unsigned y, +                         unsigned mask)  {     const struct lp_rast_state *state = task->current_state;     struct lp_fragment_shader_variant *variant = state->variant; @@ -501,27 +511,21 @@ void lp_rast_shade_quads( struct lp_rasterizer_task *task,     assert(lp_check_alignment(state->jit_context.blend_color, 16)); -   assert(lp_check_alignment(inputs->step[0], 16)); -   assert(lp_check_alignment(inputs->step[1], 16)); -   assert(lp_check_alignment(inputs->step[2], 16)); -     /* run shader on 4x4 block */ -   variant->jit_function[RAST_EDGE_TEST]( &state->jit_context, -                                        x, y, -                                        inputs->facing, -                                        inputs->a0, -                                        inputs->dadx, -                                        inputs->dady, -                                        color, -                                        depth, -                                        c1, c2, c3, -                                        inputs->step[0], -                                        inputs->step[1], -                                        inputs->step[2], -					&task->vis_counter); +   variant->jit_function[RAST_EDGE_TEST](&state->jit_context, +                                         x, y, +                                         inputs->facing, +                                         inputs->a0, +                                         
inputs->dadx, +                                         inputs->dady, +                                         color, +                                         depth, +                                         mask, +                                         &task->vis_counter);  } +  /**   * Set top row and left column of the tile's pixels to white.  For debugging.   */ @@ -717,10 +721,17 @@ static struct {  {     RAST(clear_color),     RAST(clear_zstencil), -   RAST(triangle), +   RAST(triangle_1), +   RAST(triangle_2), +   RAST(triangle_3), +   RAST(triangle_4), +   RAST(triangle_5), +   RAST(triangle_6), +   RAST(triangle_7),     RAST(shade_tile), +   RAST(shade_tile_opaque),     RAST(set_state), -   RAST(store_color), +   RAST(store_linear_color),     RAST(fence),     RAST(begin_query),     RAST(end_query), @@ -775,7 +786,8 @@ is_empty_bin( const struct cmd_bin *bin )     }     for (i = 0; i < head->count; i++) -      if (head->cmd[i] != lp_rast_set_state) { +      if (head->cmd[i] != lp_rast_set_state && +          head->cmd[i] != lp_rast_store_linear_color) {           return FALSE;        } diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index 80ca68f5a2..ae73e6d8c9 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -83,9 +83,6 @@ struct lp_rast_shader_inputs {     float (*a0)[4];     float (*dadx)[4];     float (*dady)[4]; - -   /* edge/step info for 3 edges and 4x4 block of pixels */ -   PIPE_ALIGN_VAR(16) int step[3][16];  };  struct lp_rast_clearzs { @@ -93,6 +90,22 @@ struct lp_rast_clearzs {     unsigned clearzs_mask;  }; +struct lp_rast_plane { +   /* one-pixel sized trivial accept offsets for each plane */ +   int ei; + +   /* one-pixel sized trivial reject offsets for each plane */ +   int eo; + +   /* edge function values at minx,miny ?? 
*/ +   int c; + +   int dcdx; +   int dcdy; +    +   /* edge/step info for 3 edges and 4x4 block of pixels */ +   const int *step; +};  /**   * Rasterization information for a triangle known to be in this bin, @@ -101,35 +114,16 @@ struct lp_rast_clearzs {   * Objects of this type are put into the lp_setup_context::data buffer.   */  struct lp_rast_triangle { +   /* inputs for the shader */ +   PIPE_ALIGN_VAR(16) struct lp_rast_shader_inputs inputs; + +   int step[3][16]; +  #ifdef DEBUG     float v[3][2];  #endif -   /* one-pixel sized trivial accept offsets for each plane */ -   int ei1;                    -   int ei2; -   int ei3; - -   /* one-pixel sized trivial reject offsets for each plane */ -   int eo1;                    -   int eo2; -   int eo3; - -   /* y deltas for vertex pairs (in fixed pt) */ -   int dy12; -   int dy23; -   int dy31; - -   /* x deltas for vertex pairs (in fixed pt) */ -   int dx12; -   int dx23; -   int dx31; - -   /* edge function values at minx,miny ?? */ -   int c1, c2, c3; - -   /* inputs for the shader */ -   PIPE_ALIGN_VAR(16) struct lp_rast_shader_inputs inputs; +   struct lp_rast_plane plane[7]; /* NOTE: may allocate fewer planes */  }; @@ -153,7 +147,10 @@ lp_rast_finish( struct lp_rasterizer *rast );  union lp_rast_cmd_arg {     const struct lp_rast_shader_inputs *shade_tile; -   const struct lp_rast_triangle *triangle; +   struct { +      const struct lp_rast_triangle *tri; +      unsigned plane_mask; +   } triangle;     const struct lp_rast_state *set_state;     uint8_t clear_color[4];     const struct lp_rast_clearzs *clear_zstencil; @@ -173,10 +170,12 @@ lp_rast_arg_inputs( const struct lp_rast_shader_inputs *shade_tile )  }  static INLINE union lp_rast_cmd_arg -lp_rast_arg_triangle( const struct lp_rast_triangle *triangle ) +lp_rast_arg_triangle( const struct lp_rast_triangle *triangle, +                      unsigned plane_mask)  {     union lp_rast_cmd_arg arg; -   arg.triangle = triangle; +   arg.triangle.tri = 
triangle; +   arg.triangle.plane_mask = plane_mask;     return arg;  } @@ -229,16 +228,31 @@ void lp_rast_clear_zstencil( struct lp_rasterizer_task *,  void lp_rast_set_state( struct lp_rasterizer_task *,                           const union lp_rast_cmd_arg ); -void lp_rast_triangle( struct lp_rasterizer_task *,  -                       const union lp_rast_cmd_arg ); +void lp_rast_triangle_1( struct lp_rasterizer_task *,  +                         const union lp_rast_cmd_arg ); +void lp_rast_triangle_2( struct lp_rasterizer_task *,  +                         const union lp_rast_cmd_arg ); +void lp_rast_triangle_3( struct lp_rasterizer_task *,  +                         const union lp_rast_cmd_arg ); +void lp_rast_triangle_4( struct lp_rasterizer_task *,  +                         const union lp_rast_cmd_arg ); +void lp_rast_triangle_5( struct lp_rasterizer_task *,  +                         const union lp_rast_cmd_arg ); +void lp_rast_triangle_6( struct lp_rasterizer_task *,  +                         const union lp_rast_cmd_arg ); +void lp_rast_triangle_7( struct lp_rasterizer_task *,  +                         const union lp_rast_cmd_arg );  void lp_rast_shade_tile( struct lp_rasterizer_task *,                           const union lp_rast_cmd_arg ); +void lp_rast_shade_tile_opaque( struct lp_rasterizer_task *, +                                const union lp_rast_cmd_arg ); +  void lp_rast_fence( struct lp_rasterizer_task *,                      const union lp_rast_cmd_arg ); -void lp_rast_store_color( struct lp_rasterizer_task *, +void lp_rast_store_linear_color( struct lp_rasterizer_task *,                            const union lp_rast_cmd_arg ); diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h index eb4175dfa6..024a28be59 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h @@ -119,10 +119,12 @@ struct lp_rasterizer  }; -void lp_rast_shade_quads( 
struct lp_rasterizer_task *task, -                          const struct lp_rast_shader_inputs *inputs, -                          unsigned x, unsigned y, -                          int32_t c1, int32_t c2, int32_t c3); +void +lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, +                         const struct lp_rast_shader_inputs *inputs, +                         unsigned x, unsigned y, +                         unsigned mask); +  /** @@ -158,6 +160,40 @@ lp_rast_get_depth_block_pointer(struct lp_rasterizer_task *task,  /** + * Get pointer to the swizzled color tile + */ +static INLINE uint8_t * +lp_rast_get_color_tile_pointer(struct lp_rasterizer_task *task, +                               unsigned buf, enum lp_texture_usage usage) +{ +   struct lp_rasterizer *rast = task->rast; + +   assert(task->x % TILE_SIZE == 0); +   assert(task->y % TILE_SIZE == 0); +   assert(buf < rast->state.nr_cbufs); + +   if (!task->color_tiles[buf]) { +      struct pipe_surface *cbuf = rast->curr_scene->fb.cbufs[buf]; +      struct llvmpipe_resource *lpt; +      assert(cbuf); +      lpt = llvmpipe_resource(cbuf->texture); +      task->color_tiles[buf] = llvmpipe_get_texture_tile(lpt, +                                                         cbuf->face + cbuf->zslice, +                                                         cbuf->level, +                                                         usage, +                                                         task->x, +                                                         task->y); +      if (!task->color_tiles[buf]) { +         /* out of memory - use dummy tile memory */ +         return lp_get_dummy_tile(); +      } +   } + +   return task->color_tiles[buf]; +} + + +/**   * Get the pointer to a 4x4 color block (within a 64x64 tile).   * We'll map the color buffer on demand here.   
* Note that this may be called even when there's no color buffers - return @@ -174,6 +210,7 @@ lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task,     assert((x % TILE_VECTOR_WIDTH) == 0);     assert((y % TILE_VECTOR_HEIGHT) == 0); +   color = lp_rast_get_color_tile_pointer(task, buf, LP_TEX_USAGE_READ_WRITE);     color = task->color_tiles[buf];     if (!color) {        /* out of memory - use dummy tile memory */ @@ -217,15 +254,15 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task,     /* run shader on 4x4 block */     variant->jit_function[RAST_WHOLE]( &state->jit_context, -                                    x, y, -                                    inputs->facing, -                                    inputs->a0, -                                    inputs->dadx, -                                    inputs->dady, -                                    color, -                                    depth, -                                    INT_MIN, INT_MIN, INT_MIN, -                                    NULL, NULL, NULL, &task->vis_counter ); +                                      x, y, +                                      inputs->facing, +                                      inputs->a0, +                                      inputs->dadx, +                                      inputs->dady, +                                      color, +                                      depth, +                                      0xffff, +                                      &task->vis_counter );  } diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index a5f0d14c95..ebe9a8e92b 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -113,168 +113,31 @@ block_full_16(struct lp_rasterizer_task *task,  	 block_full_4(task, tri, x + ix, y + iy);  } +#define TAG(x) x##_1 +#define NR_PLANES 1 +#include "lp_rast_tri_tmp.h" -/** - * Pass the 4x4 pixel block 
to the shader function. - * Determination of which of the 16 pixels lies inside the triangle - * will be done as part of the fragment shader. - */ -static void -do_block_4(struct lp_rasterizer_task *task, -           const struct lp_rast_triangle *tri, -           int x, int y, -           int c1, int c2, int c3) -{ -   assert(x >= 0); -   assert(y >= 0); - -   lp_rast_shade_quads(task, &tri->inputs, x, y, -c1, -c2, -c3); -} - - -/** - * Evaluate a 16x16 block of pixels to determine which 4x4 subblocks are in/out - * of the triangle's bounds. - */ -static void -do_block_16(struct lp_rasterizer_task *task, -            const struct lp_rast_triangle *tri, -            int x, int y, -            int c0, int c1, int c2) -{ -   unsigned mask = 0; -   int eo[3]; -   int c[3]; -   int i, j; - -   assert(x >= 0); -   assert(y >= 0); -   assert(x % 16 == 0); -   assert(y % 16 == 0); - -   eo[0] = tri->eo1 * 4; -   eo[1] = tri->eo2 * 4; -   eo[2] = tri->eo3 * 4; - -   c[0] = c0; -   c[1] = c1; -   c[2] = c2; - -   for (j = 0; j < 3; j++) { -      const int *step = tri->inputs.step[j]; -      const int cx = c[j] + eo[j]; - -      /* Mask has bits set whenever we are outside any of the edges. -       */ -      for (i = 0; i < 16; i++) { -         int out = cx + step[i] * 4; -         mask |= (out >> 31) & (1 << i); -      } -   } +#define TAG(x) x##_2 +#define NR_PLANES 2 +#include "lp_rast_tri_tmp.h" -   mask = ~mask & 0xffff; -   while (mask) { -      int i = ffs(mask) - 1; -      int px = x + pos_table4[i][0]; -      int py = y + pos_table4[i][1]; -      int cx1 = c0 + tri->inputs.step[0][i] * 4; -      int cx2 = c1 + tri->inputs.step[1][i] * 4; -      int cx3 = c2 + tri->inputs.step[2][i] * 4; +#define TAG(x) x##_3 +#define NR_PLANES 3 +#include "lp_rast_tri_tmp.h" -      mask &= ~(1 << i); +#define TAG(x) x##_4 +#define NR_PLANES 4 +#include "lp_rast_tri_tmp.h" -      /* Don't bother testing if the 4x4 block is entirely in/out of -       * the triangle.  
It's a little faster to do it in the jit code. -       */ -      LP_COUNT(nr_non_empty_4); -      do_block_4(task, tri, px, py, cx1, cx2, cx3); -   } -} - - -/** - * Scan the tile in chunks and figure out which pixels to rasterize - * for this triangle. - */ -void -lp_rast_triangle(struct lp_rasterizer_task *task, -                 const union lp_rast_cmd_arg arg) -{ -   const struct lp_rast_triangle *tri = arg.triangle; -   const int x = task->x, y = task->y; -   int ei[3], eo[3], c[3]; -   unsigned outmask, inmask, partial_mask; -   unsigned i, j; - -   c[0] = tri->c1 + tri->dx12 * y - tri->dy12 * x; -   c[1] = tri->c2 + tri->dx23 * y - tri->dy23 * x; -   c[2] = tri->c3 + tri->dx31 * y - tri->dy31 * x; - -   eo[0] = tri->eo1 * 16; -   eo[1] = tri->eo2 * 16; -   eo[2] = tri->eo3 * 16; - -   ei[0] = tri->ei1 * 16; -   ei[1] = tri->ei2 * 16; -   ei[2] = tri->ei3 * 16; - -   outmask = 0; -   inmask = 0xffff; +#define TAG(x) x##_5 +#define NR_PLANES 5 +#include "lp_rast_tri_tmp.h" -   for (j = 0; j < 3; j++) { -      const int *step = tri->inputs.step[j]; -      const int cox = c[j] + eo[j]; -      const int cio = ei[j]- eo[j]; +#define TAG(x) x##_6 +#define NR_PLANES 6 +#include "lp_rast_tri_tmp.h" -      /* Outmask has bits set whenever we are outside any of the -       * edges. -       */ -      /* Inmask has bits set whenever we are inside all of the edges. 
-       */ -      for (i = 0; i < 16; i++) { -         int out = cox + step[i] * 16; -         int in = out + cio; -         outmask |= (out >> 31) & (1 << i); -         inmask &= ~((in >> 31) & (1 << i)); -      } -   } +#define TAG(x) x##_7 +#define NR_PLANES 7 +#include "lp_rast_tri_tmp.h" -   assert((outmask & inmask) == 0); - -   if (outmask == 0xffff) -      return; - -   /* Invert mask, so that bits are set whenever we are at least -    * partially inside all of the edges: -    */ -   partial_mask = ~inmask & ~outmask & 0xffff; - -   /* Iterate over partials: -    */ -   while (partial_mask) { -      int i = ffs(partial_mask) - 1; -      int px = x + pos_table16[i][0]; -      int py = y + pos_table16[i][1]; -      int cx1 = c[0] + tri->inputs.step[0][i] * 16; -      int cx2 = c[1] + tri->inputs.step[1][i] * 16; -      int cx3 = c[2] + tri->inputs.step[2][i] * 16; - -      partial_mask &= ~(1 << i); - -      LP_COUNT(nr_partially_covered_16); -      do_block_16(task, tri, px, py, cx1, cx2, cx3); -   } - -   /* Iterate over fulls:  -    */ -   while (inmask) { -      int i = ffs(inmask) - 1; -      int px = x + pos_table16[i][0]; -      int py = y + pos_table16[i][1]; - -      inmask &= ~(1 << i); - -      LP_COUNT(nr_fully_covered_16); -      block_full_16(task, tri, px, py); -   } -} diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h new file mode 100644 index 0000000000..a410c611a3 --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h @@ -0,0 +1,238 @@ +/************************************************************************** + * + * Copyright 2007-2010 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Rasterization for binned triangles within a tile + */ + + + +/** + * Prototype for a 7 plane rasterizer function.  Will codegenerate + * several of these. + * + * XXX: Varients for more/fewer planes. + * XXX: Need ways of dropping planes as we descend. 
+ * XXX: SIMD + */ +static void +TAG(do_block_4)(struct lp_rasterizer_task *task, +                const struct lp_rast_triangle *tri, +                const struct lp_rast_plane *plane, +                int x, int y, +                const int *c) +{ +   unsigned mask = 0; +   int i; + +   for (i = 0; i < 16; i++) { +      int any_negative = 0; +      int j; + +      for (j = 0; j < NR_PLANES; j++)  +         any_negative |= (c[j] - 1 + plane[j].step[i]); +          +      any_negative >>= 31; + +      mask |= (~any_negative) & (1 << i); +   } + +   /* Now pass to the shader: +    */ +   if (mask) +      lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask); +} + +/** + * Evaluate a 16x16 block of pixels to determine which 4x4 subblocks are in/out + * of the triangle's bounds. + */ +static void +TAG(do_block_16)(struct lp_rasterizer_task *task, +                 const struct lp_rast_triangle *tri, +                 const struct lp_rast_plane *plane, +                 int x, int y, +                 const int *c) +{ +   unsigned outmask, inmask, partmask, partial_mask; +   unsigned i, j; + +   outmask = 0;                 /* outside one or more trivial reject planes */ +   partmask = 0;                /* outside one or more trivial accept planes */ + +   for (j = 0; j < NR_PLANES; j++) { +      const int *step = plane[j].step; +      const int eo = plane[j].eo * 4; +      const int ei = plane[j].ei * 4; +      const int cox = c[j] + eo; +      const int cio = ei - 1 - eo; + +      for (i = 0; i < 16; i++) { +         int out = cox + step[i] * 4; +         int part = out + cio; +         outmask  |= (out >> 31) & (1 << i); +         partmask |= (part >> 31) & (1 << i); +      } +   } + +   if (outmask == 0xffff) +      return; + +   /* Mask of sub-blocks which are inside all trivial accept planes: +    */ +   inmask = ~partmask & 0xffff; + +   /* Mask of sub-blocks which are inside all trivial reject planes, +    * but outside at least one trivial accept plane: + 
   */ +   partial_mask = partmask & ~outmask; + +   assert((partial_mask & inmask) == 0); + +   /* Iterate over partials: +    */ +   while (partial_mask) { +      int i = ffs(partial_mask) - 1; +      int px = x + pos_table4[i][0]; +      int py = y + pos_table4[i][1]; +      int cx[NR_PLANES]; + +      for (j = 0; j < NR_PLANES; j++) +         cx[j] = c[j] + plane[j].step[i] * 4; + +      partial_mask &= ~(1 << i); + +      TAG(do_block_4)(task, tri, plane, px, py, cx); +   } + +   /* Iterate over fulls:  +    */ +   while (inmask) { +      int i = ffs(inmask) - 1; +      int px = x + pos_table4[i][0]; +      int py = y + pos_table4[i][1]; + +      inmask &= ~(1 << i); + +      block_full_4(task, tri, px, py); +   } +} + + +/** + * Scan the tile in chunks and figure out which pixels to rasterize + * for this triangle. + */ +void +TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, +                      const union lp_rast_cmd_arg arg) +{ +   const struct lp_rast_triangle *tri = arg.triangle.tri; +   unsigned plane_mask = arg.triangle.plane_mask; +   const int x = task->x, y = task->y; +   struct lp_rast_plane plane[NR_PLANES]; +   int c[NR_PLANES]; +   unsigned outmask, inmask, partmask, partial_mask; +   unsigned i, j, nr_planes = 0; + +   while (plane_mask) { +      int i = ffs(plane_mask) - 1; +      plane[nr_planes] = tri->plane[i]; +      plane_mask &= ~(1 << i); +      nr_planes++; +   }; + +   assert(nr_planes == NR_PLANES); +   outmask = 0;                 /* outside one or more trivial reject planes */ +   partmask = 0;                /* outside one or more trivial accept planes */ + +   for (j = 0; j < NR_PLANES; j++) { +      const int *step = plane[j].step; +      const int eo = plane[j].eo * 16; +      const int ei = plane[j].ei * 16; +      int cox, cio; + +      c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x; +      cox = c[j] + eo; +      cio = ei - 1 - eo; + +      for (i = 0; i < 16; i++) { +         int out = cox + step[i] * 16; 
+         int part = out + cio; +         outmask  |= (out >> 31) & (1 << i); +         partmask |= (part >> 31) & (1 << i); +      } +   } + +   if (outmask == 0xffff) +      return; + +   /* Mask of sub-blocks which are inside all trivial accept planes: +    */ +   inmask = ~partmask & 0xffff; + +   /* Mask of sub-blocks which are inside all trivial reject planes, +    * but outside at least one trivial accept plane: +    */ +   partial_mask = partmask & ~outmask; + +   assert((partial_mask & inmask) == 0); + +   /* Iterate over partials: +    */ +   while (partial_mask) { +      int i = ffs(partial_mask) - 1; +      int px = x + pos_table16[i][0]; +      int py = y + pos_table16[i][1]; +      int cx[NR_PLANES]; + +      for (j = 0; j < NR_PLANES; j++) +         cx[j] = c[j] + plane[j].step[i] * 16; + +      partial_mask &= ~(1 << i); + +      LP_COUNT(nr_partially_covered_16); +      TAG(do_block_16)(task, tri, plane, px, py, cx); +   } + +   /* Iterate over fulls:  +    */ +   while (inmask) { +      int i = ffs(inmask) - 1; +      int px = x + pos_table16[i][0]; +      int py = y + pos_table16[i][1]; + +      inmask &= ~(1 << i); + +      LP_COUNT(nr_fully_covered_16); +      block_full_16(task, tri, px, py); +   } +} + +#undef TAG +#undef NR_PLANES + diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index 3b83f4e742..40959e6208 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -287,7 +287,7 @@ lp_setup_flush( struct lp_setup_context *setup,            * data to linear in the texture_unmap() function, which will            * not be a parallel/threaded operation as here.            
*/ -         lp_scene_bin_everywhere(scene, lp_rast_store_color, dummy); +         lp_scene_bin_everywhere(scene, lp_rast_store_linear_color, dummy);        } @@ -752,28 +752,6 @@ lp_setup_update_state( struct lp_setup_context *setup )        setup->dirty |= LP_SETUP_NEW_FS;     } -   if (setup->dirty & LP_SETUP_NEW_SCISSOR) { -      float *stored; - -      stored = lp_scene_alloc_aligned(scene, 4 * sizeof(int32_t), 16); - -      if (stored) { -         stored[0] = (float) setup->scissor.current.minx; -         stored[1] = (float) setup->scissor.current.miny; -         stored[2] = (float) setup->scissor.current.maxx; -         stored[3] = (float) setup->scissor.current.maxy; - -         setup->scissor.stored = stored; - -         setup->fs.current.jit_context.scissor_xmin = stored[0]; -         setup->fs.current.jit_context.scissor_ymin = stored[1]; -         setup->fs.current.jit_context.scissor_xmax = stored[2]; -         setup->fs.current.jit_context.scissor_ymax = stored[3]; -      } - -      setup->dirty |= LP_SETUP_NEW_FS; -   } -     if(setup->dirty & LP_SETUP_NEW_CONSTANTS) {        struct pipe_resource *buffer = setup->constants.current; diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h index 8f4e00f073..0cea7791f5 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_context.h +++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h @@ -130,7 +130,6 @@ struct lp_setup_context     struct {        struct pipe_scissor_state current; -      const void *stored;     } scissor;     unsigned dirty;   /**< bitmask of LP_SETUP_NEW_x bits */ diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index 4e2e17f77b..036b5497fa 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -38,12 +38,78 @@  #define NUM_CHANNELS 4 +struct tri_info { + +   float pixel_offset; + +   /* fixed point vertex coordinates */ +   
int x[3]; +   int y[3]; + +   /* float x,y deltas - all from the original coordinates +    */ +   float dy01, dy20; +   float dx01, dx20; +   float oneoverarea; + +   const float (*v0)[4]; +   const float (*v1)[4]; +   const float (*v2)[4]; + +   boolean frontfacing; +}; + + + +static const int step_scissor_minx[16] = { +   0, 1, 0, 1, +   2, 3, 2, 3, +   0, 1, 0, 1, +   2, 3, 2, 3 +}; + +static const int step_scissor_maxx[16] = { +    0, -1,  0, -1, +   -2, -3, -2, -3, +    0, -1,  0, -1, +   -2, -3, -2, -3 +}; + +static const int step_scissor_miny[16] = { +   0, 0, 1, 1, +   0, 0, 1, 1, +   2, 2, 3, 3, +   2, 2, 3, 3 +}; + +static const int step_scissor_maxy[16] = { +    0,  0, -1, -1, +    0,  0, -1, -1, +   -2, -2, -3, -3, +   -2, -2, -3, -3 +}; + + + +    +static INLINE int +subpixel_snap(float a) +{ +   return util_iround(FIXED_ONE * a); +} + +static INLINE float +fixed_to_float(int a) +{ +   return a * (1.0 / FIXED_ONE); +} + +  /**   * Compute a0 for a constant-valued coefficient (GL_FLAT shading).   */ -static void constant_coef( struct lp_setup_context *setup, -                           struct lp_rast_triangle *tri, +static void constant_coef( struct lp_rast_triangle *tri,                             unsigned slot,  			   const float value,                             unsigned i ) @@ -54,28 +120,21 @@ static void constant_coef( struct lp_setup_context *setup,  } -/** - * Compute a0, dadx and dady for a linearly interpolated coefficient, - * for a triangle. 
- */ -static void linear_coef( struct lp_setup_context *setup, -                         struct lp_rast_triangle *tri, -                         float oneoverarea, + +static void linear_coef( struct lp_rast_triangle *tri, +                         const struct tri_info *info,                           unsigned slot, -                         const float (*v1)[4], -                         const float (*v2)[4], -                         const float (*v3)[4],                           unsigned vert_attr,                           unsigned i)  { -   float a1 = v1[vert_attr][i]; -   float a2 = v2[vert_attr][i]; -   float a3 = v3[vert_attr][i]; +   float a0 = info->v0[vert_attr][i]; +   float a1 = info->v1[vert_attr][i]; +   float a2 = info->v2[vert_attr][i]; -   float da12 = a1 - a2; -   float da31 = a3 - a1; -   float dadx = (da12 * tri->dy31 - tri->dy12 * da31) * oneoverarea; -   float dady = (da31 * tri->dx12 - tri->dx31 * da12) * oneoverarea; +   float da01 = a0 - a1; +   float da20 = a2 - a0; +   float dadx = (da01 * info->dy20 - info->dy01 * da20) * info->oneoverarea; +   float dady = (da20 * info->dx01 - info->dx20 * da01) * info->oneoverarea;     tri->inputs.dadx[slot][i] = dadx;     tri->inputs.dady[slot][i] = dady; @@ -92,9 +151,9 @@ static void linear_coef( struct lp_setup_context *setup,      * to define a0 as the sample at a pixel center somewhere near vmin      * instead - i'll switch to this later.      
*/ -   tri->inputs.a0[slot][i] = (a1 - -                              (dadx * (v1[0][0] - setup->pixel_offset) + -                               dady * (v1[0][1] - setup->pixel_offset))); +   tri->inputs.a0[slot][i] = (a0 - +                              (dadx * (info->v0[0][0] - info->pixel_offset) + +                               dady * (info->v0[0][1] - info->pixel_offset)));  } @@ -106,31 +165,27 @@ static void linear_coef( struct lp_setup_context *setup,   * Later, when we compute the value at a particular fragment position we'll   * divide the interpolated value by the interpolated W at that fragment.   */ -static void perspective_coef( struct lp_setup_context *setup, -                              struct lp_rast_triangle *tri, -                              float oneoverarea, +static void perspective_coef( struct lp_rast_triangle *tri, +                              const struct tri_info *info,                                unsigned slot, -			      const float (*v1)[4], -			      const float (*v2)[4], -			      const float (*v3)[4],  			      unsigned vert_attr,                                unsigned i)  {     /* premultiply by 1/w  (v[0][3] is always 1/w):      */ -   float a1 = v1[vert_attr][i] * v1[0][3]; -   float a2 = v2[vert_attr][i] * v2[0][3]; -   float a3 = v3[vert_attr][i] * v3[0][3]; -   float da12 = a1 - a2; -   float da31 = a3 - a1; -   float dadx = (da12 * tri->dy31 - tri->dy12 * da31) * oneoverarea; -   float dady = (da31 * tri->dx12 - tri->dx31 * da12) * oneoverarea; +   float a0 = info->v0[vert_attr][i] * info->v0[0][3]; +   float a1 = info->v1[vert_attr][i] * info->v1[0][3]; +   float a2 = info->v2[vert_attr][i] * info->v2[0][3]; +   float da01 = a0 - a1; +   float da20 = a2 - a0; +   float dadx = (da01 * info->dy20 - info->dy01 * da20) * info->oneoverarea; +   float dady = (da20 * info->dx01 - info->dx20 * da01) * info->oneoverarea;     tri->inputs.dadx[slot][i] = dadx;     tri->inputs.dady[slot][i] = dady; -   tri->inputs.a0[slot][i] = 
(a1 - -                              (dadx * (v1[0][0] - setup->pixel_offset) + -                               dady * (v1[0][1] - setup->pixel_offset))); +   tri->inputs.a0[slot][i] = (a0 - +                              (dadx * (info->v0[0][0] - info->pixel_offset) + +                               dady * (info->v0[0][1] - info->pixel_offset)));  } @@ -141,13 +196,9 @@ static void perspective_coef( struct lp_setup_context *setup,   * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask.   */  static void -setup_fragcoord_coef(struct lp_setup_context *setup, -                     struct lp_rast_triangle *tri, -                     float oneoverarea, +setup_fragcoord_coef(struct lp_rast_triangle *tri, +                     const struct tri_info *info,                       unsigned slot, -                     const float (*v1)[4], -                     const float (*v2)[4], -                     const float (*v3)[4],                       unsigned usage_mask)  {     /*X*/ @@ -166,12 +217,12 @@ setup_fragcoord_coef(struct lp_setup_context *setup,     /*Z*/     if (usage_mask & TGSI_WRITEMASK_Z) { -      linear_coef(setup, tri, oneoverarea, slot, v1, v2, v3, 0, 2); +      linear_coef(tri, info, slot, 0, 2);     }     /*W*/     if (usage_mask & TGSI_WRITEMASK_W) { -      linear_coef(setup, tri, oneoverarea, slot, v1, v2, v3, 0, 3); +      linear_coef(tri, info, slot, 0, 3);     }  } @@ -180,24 +231,23 @@ setup_fragcoord_coef(struct lp_setup_context *setup,   * Setup the fragment input attribute with the front-facing value.   * \param frontface  is the triangle front facing?   
*/ -static void setup_facing_coef( struct lp_setup_context *setup, -                               struct lp_rast_triangle *tri, +static void setup_facing_coef( struct lp_rast_triangle *tri,                                 unsigned slot,                                 boolean frontface,                                 unsigned usage_mask)  {     /* convert TRUE to 1.0 and FALSE to -1.0 */     if (usage_mask & TGSI_WRITEMASK_X) -      constant_coef( setup, tri, slot, 2.0f * frontface - 1.0f, 0 ); +      constant_coef( tri, slot, 2.0f * frontface - 1.0f, 0 );     if (usage_mask & TGSI_WRITEMASK_Y) -      constant_coef( setup, tri, slot, 0.0f, 1 ); /* wasted */ +      constant_coef( tri, slot, 0.0f, 1 ); /* wasted */     if (usage_mask & TGSI_WRITEMASK_Z) -      constant_coef( setup, tri, slot, 0.0f, 2 ); /* wasted */ +      constant_coef( tri, slot, 0.0f, 2 ); /* wasted */     if (usage_mask & TGSI_WRITEMASK_W) -      constant_coef( setup, tri, slot, 0.0f, 3 ); /* wasted */ +      constant_coef( tri, slot, 0.0f, 3 ); /* wasted */  } @@ -206,11 +256,7 @@ static void setup_facing_coef( struct lp_setup_context *setup,   */  static void setup_tri_coefficients( struct lp_setup_context *setup,  				    struct lp_rast_triangle *tri, -                                    float oneoverarea, -				    const float (*v1)[4], -				    const float (*v2)[4], -				    const float (*v3)[4], -				    boolean frontface) +                                    const struct tri_info *info)  {     unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;     unsigned slot; @@ -227,25 +273,25 @@ static void setup_tri_coefficients( struct lp_setup_context *setup,           if (setup->flatshade_first) {              for (i = 0; i < NUM_CHANNELS; i++)                 if (usage_mask & (1 << i)) -                  constant_coef(setup, tri, slot+1, v1[vert_attr][i], i); +                  constant_coef(tri, slot+1, info->v0[vert_attr][i], i);           }           else {              for (i = 0; i < 
NUM_CHANNELS; i++)                 if (usage_mask & (1 << i)) -                  constant_coef(setup, tri, slot+1, v3[vert_attr][i], i); +                  constant_coef(tri, slot+1, info->v2[vert_attr][i], i);           }           break;        case LP_INTERP_LINEAR:           for (i = 0; i < NUM_CHANNELS; i++)              if (usage_mask & (1 << i)) -               linear_coef(setup, tri, oneoverarea, slot+1, v1, v2, v3, vert_attr, i); +               linear_coef(tri, info, slot+1, vert_attr, i);           break;        case LP_INTERP_PERSPECTIVE:           for (i = 0; i < NUM_CHANNELS; i++)              if (usage_mask & (1 << i)) -               perspective_coef(setup, tri, oneoverarea, slot+1, v1, v2, v3, vert_attr, i); +               perspective_coef(tri, info, slot+1, vert_attr, i);           fragcoord_usage_mask |= TGSI_WRITEMASK_W;           break; @@ -259,7 +305,7 @@ static void setup_tri_coefficients( struct lp_setup_context *setup,           break;        case LP_INTERP_FACING: -         setup_facing_coef(setup, tri, slot+1, frontface, usage_mask); +         setup_facing_coef(tri, slot+1, info->frontfacing, usage_mask);           break;        default: @@ -269,16 +315,11 @@ static void setup_tri_coefficients( struct lp_setup_context *setup,     /* The internal position input is in slot zero:      */ -   setup_fragcoord_coef(setup, tri, oneoverarea, 0, v1, v2, v3, -                        fragcoord_usage_mask); +   setup_fragcoord_coef(tri, info, 0, fragcoord_usage_mask);  } -static INLINE int subpixel_snap( float a ) -{ -   return util_iround(FIXED_ONE * a - (FIXED_ONE / 2)); -} @@ -291,21 +332,25 @@ static INLINE int subpixel_snap( float a )   * \return pointer to triangle space   */  static INLINE struct lp_rast_triangle * -alloc_triangle(struct lp_scene *scene, unsigned nr_inputs, unsigned *tri_size) +alloc_triangle(struct lp_scene *scene, +               unsigned nr_inputs, +               unsigned nr_planes, +               unsigned *tri_size)  {  
   unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float);     struct lp_rast_triangle *tri; -   unsigned bytes; +   unsigned tri_bytes, bytes;     char *inputs;     assert(sizeof(*tri) % 16 == 0); -   bytes = sizeof(*tri) + (3 * input_array_sz); +   tri_bytes = align(Offset(struct lp_rast_triangle, plane[nr_planes]), 16); +   bytes = tri_bytes + (3 * input_array_sz);     tri = lp_scene_alloc_aligned( scene, bytes, 16 );     if (tri) { -      inputs = (char *) (tri + 1); +      inputs = ((char *)tri) + tri_bytes;        tri->inputs.a0   = (float (*)[4]) inputs;        tri->inputs.dadx = (float (*)[4]) (inputs + input_array_sz);        tri->inputs.dady = (float (*)[4]) (inputs + 2 * input_array_sz); @@ -329,52 +374,71 @@ print_triangle(struct lp_setup_context *setup,     uint i;     debug_printf("llvmpipe triangle\n"); -   for (i = 0; i < setup->fs.nr_inputs; i++) { +   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {        debug_printf("  v1[%d]:  %f %f %f %f\n", i,                     v1[i][0], v1[i][1], v1[i][2], v1[i][3]);     } -   for (i = 0; i < setup->fs.nr_inputs; i++) { +   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {        debug_printf("  v2[%d]:  %f %f %f %f\n", i,                     v2[i][0], v2[i][1], v2[i][2], v2[i][3]);     } -   for (i = 0; i < setup->fs.nr_inputs; i++) { +   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {        debug_printf("  v3[%d]:  %f %f %f %f\n", i,                     v3[i][0], v3[i][1], v3[i][2], v3[i][3]);     }  } +lp_rast_cmd lp_rast_tri_tab[8] = { +   NULL,               /* should be impossible */ +   lp_rast_triangle_1, +   lp_rast_triangle_2, +   lp_rast_triangle_3, +   lp_rast_triangle_4, +   lp_rast_triangle_5, +   lp_rast_triangle_6, +   lp_rast_triangle_7 +}; +  /**   * Do basic setup for triangle rasterization and determine which   * framebuffer tiles are touched.  Put the triangle in the scene's   * bins for the tiles which we overlap.   
*/ -static void  +static void  do_triangle_ccw(struct lp_setup_context *setup,  		const float (*v1)[4],  		const float (*v2)[4],  		const float (*v3)[4],  		boolean frontfacing )  { -   /* x/y positions in fixed point */ -   const int x1 = subpixel_snap(v1[0][0] + 0.5 - setup->pixel_offset); -   const int x2 = subpixel_snap(v2[0][0] + 0.5 - setup->pixel_offset); -   const int x3 = subpixel_snap(v3[0][0] + 0.5 - setup->pixel_offset); -   const int y1 = subpixel_snap(v1[0][1] + 0.5 - setup->pixel_offset); -   const int y2 = subpixel_snap(v2[0][1] + 0.5 - setup->pixel_offset); -   const int y3 = subpixel_snap(v3[0][1] + 0.5 - setup->pixel_offset);     struct lp_scene *scene = lp_setup_get_current_scene(setup); +   struct lp_fragment_shader_variant *variant = setup->fs.current.variant;     struct lp_rast_triangle *tri; +   struct tri_info info;     int area; -   float oneoverarea;     int minx, maxx, miny, maxy; +   int ix0, ix1, iy0, iy1;     unsigned tri_bytes; - +   int i; +   int nr_planes = 3; +           if (0)        print_triangle(setup, v1, v2, v3); -   tri = alloc_triangle(scene, setup->fs.nr_inputs, &tri_bytes); +   if (setup->scissor_test) { +      nr_planes = 7; +   } +   else { +      nr_planes = 3; +   } + + +   tri = alloc_triangle(scene, +                        setup->fs.nr_inputs, +                        nr_planes, +                        &tri_bytes);     if (!tri)        return; @@ -387,15 +451,24 @@ do_triangle_ccw(struct lp_setup_context *setup,     tri->v[2][1] = v3[0][1];  #endif -   tri->dx12 = x1 - x2; -   tri->dx23 = x2 - x3; -   tri->dx31 = x3 - x1; +   /* x/y positions in fixed point */ +   info.x[0] = subpixel_snap(v1[0][0] - setup->pixel_offset); +   info.x[1] = subpixel_snap(v2[0][0] - setup->pixel_offset); +   info.x[2] = subpixel_snap(v3[0][0] - setup->pixel_offset); +   info.y[0] = subpixel_snap(v1[0][1] - setup->pixel_offset); +   info.y[1] = subpixel_snap(v2[0][1] - setup->pixel_offset); +   info.y[2] = subpixel_snap(v3[0][1] - 
setup->pixel_offset); + +   tri->plane[0].dcdy = info.x[0] - info.x[1]; +   tri->plane[1].dcdy = info.x[1] - info.x[2]; +   tri->plane[2].dcdy = info.x[2] - info.x[0]; -   tri->dy12 = y1 - y2; -   tri->dy23 = y2 - y3; -   tri->dy31 = y3 - y1; +   tri->plane[0].dcdx = info.y[0] - info.y[1]; +   tri->plane[1].dcdx = info.y[1] - info.y[2]; +   tri->plane[2].dcdx = info.y[2] - info.y[0]; -   area = (tri->dx12 * tri->dy31 - tri->dx31 * tri->dy12); +   area = (tri->plane[0].dcdy * tri->plane[2].dcdx - +           tri->plane[2].dcdy * tri->plane[0].dcdx);     LP_COUNT(nr_tris); @@ -410,20 +483,35 @@ do_triangle_ccw(struct lp_setup_context *setup,     }     /* Bounding rectangle (in pixels) */ -   minx = (MIN3(x1, x2, x3) + (FIXED_ONE-1)) >> FIXED_ORDER; -   maxx = (MAX3(x1, x2, x3) + (FIXED_ONE-1)) >> FIXED_ORDER; -   miny = (MIN3(y1, y2, y3) + (FIXED_ONE-1)) >> FIXED_ORDER; -   maxy = (MAX3(y1, y2, y3) + (FIXED_ONE-1)) >> FIXED_ORDER; -    +   { +      /* Yes this is necessary to accurately calculate bounding boxes +       * with the two fill-conventions we support.  GL (normally) ends +       * up needing a bottom-left fill convention, which requires +       * slightly different rounding. +       */ +      int adj = (setup->pixel_offset != 0) ? 
1 : 0; + +      minx = (MIN3(info.x[0], info.x[1], info.x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER; +      maxx = (MAX3(info.x[0], info.x[1], info.x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER; +      miny = (MIN3(info.y[0], info.y[1], info.y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER; +      maxy = (MAX3(info.y[0], info.y[1], info.y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER; +   } +     if (setup->scissor_test) {        minx = MAX2(minx, setup->scissor.current.minx);        maxx = MIN2(maxx, setup->scissor.current.maxx);        miny = MAX2(miny, setup->scissor.current.miny);        maxy = MIN2(maxy, setup->scissor.current.maxy);     } +   else { +      minx = MAX2(minx, 0); +      miny = MAX2(miny, 0); +      maxx = MIN2(maxx, scene->fb.width); +      maxy = MIN2(maxy, scene->fb.height); +   } + -   if (miny == maxy ||  -       minx == maxx) { +   if (miny >= maxy || minx >= maxx) {        lp_scene_putback_data( scene, tri_bytes );        LP_COUNT(nr_culled_tris);        return; @@ -431,75 +519,87 @@ do_triangle_ccw(struct lp_setup_context *setup,     /*       */ -   oneoverarea = ((float)FIXED_ONE) / (float)area; +   info.pixel_offset = setup->pixel_offset; +   info.v0 = v1; +   info.v1 = v2; +   info.v2 = v3; +   info.dx01 = info.v0[0][0] - info.v1[0][0]; +   info.dx20 = info.v2[0][0] - info.v0[0][0]; +   info.dy01 = info.v0[0][1] - info.v1[0][1]; +   info.dy20 = info.v2[0][1] - info.v0[0][1]; +   info.oneoverarea = 1.0 / (info.dx01 * info.dy20 - info.dx20 * info.dy01); +   info.frontfacing = frontfacing;     /* Setup parameter interpolants:      */ -   setup_tri_coefficients( setup, tri, oneoverarea, v1, v2, v3, frontfacing ); +   setup_tri_coefficients( setup, tri, &info );     tri->inputs.facing = frontfacing ? 1.0F : -1.0F; -   /* half-edge constants, will be interated over the whole render target. 
-    */ -   tri->c1 = tri->dy12 * x1 - tri->dx12 * y1; -   tri->c2 = tri->dy23 * x2 - tri->dx23 * y2; -   tri->c3 = tri->dy31 * x3 - tri->dx31 * y3; - -   /* correct for top-left fill convention: -    */ -   if (tri->dy12 < 0 || (tri->dy12 == 0 && tri->dx12 > 0)) tri->c1++; -   if (tri->dy23 < 0 || (tri->dy23 == 0 && tri->dx23 > 0)) tri->c2++; -   if (tri->dy31 < 0 || (tri->dy31 == 0 && tri->dx31 > 0)) tri->c3++; - -   tri->dy12 *= FIXED_ONE; -   tri->dy23 *= FIXED_ONE; -   tri->dy31 *= FIXED_ONE; -   tri->dx12 *= FIXED_ONE; -   tri->dx23 *= FIXED_ONE; -   tri->dx31 *= FIXED_ONE; +   +   for (i = 0; i < 3; i++) { +      struct lp_rast_plane *plane = &tri->plane[i]; -   /* find trivial reject offsets for each edge for a single-pixel -    * sized block.  These will be scaled up at each recursive level to -    * match the active blocksize.  Scaling in this way works best if -    * the blocks are square. -    */ -   tri->eo1 = 0; -   if (tri->dy12 < 0) tri->eo1 -= tri->dy12; -   if (tri->dx12 > 0) tri->eo1 += tri->dx12; +      /* half-edge constants, will be interated over the whole render +       * target. +       */ +      plane->c = plane->dcdx * info.x[i] - plane->dcdy * info.y[i]; -   tri->eo2 = 0; -   if (tri->dy23 < 0) tri->eo2 -= tri->dy23; -   if (tri->dx23 > 0) tri->eo2 += tri->dx23; +      /* correct for top-left vs. bottom-left fill convention.   +       * +       * note that we're overloading gl_rasterization_rules to mean +       * both (0.5,0.5) pixel centers *and* bottom-left filling +       * convention. +       * +       * GL actually has a top-left filling convention, but GL's +       * notion of "top" differs from gallium's... +       * +       * Also, sometimes (in FBO cases) GL will render upside down +       * to its usual method, in which case it will probably want +       * to use the opposite, top-left convention. 
+       */          +      if (plane->dcdx < 0) { +         /* both fill conventions want this - adjust for left edges */ +         plane->c++;             +      } +      else if (plane->dcdx == 0) { +         if (setup->pixel_offset == 0) { +            /* correct for top-left fill convention: +             */ +            if (plane->dcdy > 0) plane->c++; +         } +         else { +            /* correct for bottom-left fill convention: +             */ +            if (plane->dcdy < 0) plane->c++; +         } +      } -   tri->eo3 = 0; -   if (tri->dy31 < 0) tri->eo3 -= tri->dy31; -   if (tri->dx31 > 0) tri->eo3 += tri->dx31; +      plane->dcdx *= FIXED_ONE; +      plane->dcdy *= FIXED_ONE; -   /* Calculate trivial accept offsets from the above. -    */ -   tri->ei1 = tri->dx12 - tri->dy12 - tri->eo1; -   tri->ei2 = tri->dx23 - tri->dy23 - tri->eo2; -   tri->ei3 = tri->dx31 - tri->dy31 - tri->eo3; +      /* find trivial reject offsets for each edge for a single-pixel +       * sized block.  These will be scaled up at each recursive level to +       * match the active blocksize.  Scaling in this way works best if +       * the blocks are square. +       */ +      plane->eo = 0; +      if (plane->dcdx < 0) plane->eo -= plane->dcdx; +      if (plane->dcdy > 0) plane->eo += plane->dcdy; -   /* Fill in the inputs.step[][] arrays. -    * We've manually unrolled some loops here. -    */ -   { -      const int xstep1 = -tri->dy12; -      const int xstep2 = -tri->dy23; -      const int xstep3 = -tri->dy31; -      const int ystep1 = tri->dx12; -      const int ystep2 = tri->dx23; -      const int ystep3 = tri->dx31; +      /* Calculate trivial accept offsets from the above. 
+       */ +      plane->ei = plane->dcdy - plane->dcdx - plane->eo; -#define SETUP_STEP(i, x, y)                                \ -      do {                                                 \ -         tri->inputs.step[0][i] = x * xstep1 + y * ystep1; \ -         tri->inputs.step[1][i] = x * xstep2 + y * ystep2; \ -         tri->inputs.step[2][i] = x * xstep3 + y * ystep3; \ -      } while (0) +      plane->step = tri->step[i]; +      /* Fill in the inputs.step[][] arrays. +       * We've manually unrolled some loops here. +       */ +#define SETUP_STEP(j, x, y) \ +      tri->step[i][j] = y * plane->dcdy - x * plane->dcdx +              SETUP_STEP(0, 0, 0);        SETUP_STEP(1, 1, 0);        SETUP_STEP(2, 0, 1); @@ -522,63 +622,106 @@ do_triangle_ccw(struct lp_setup_context *setup,  #undef STEP     } + +   /*  +    * When rasterizing scissored tris, use the intersection of the +    * triangle bounding box and the scissor rect to generate the +    * scissor planes. +    * +    * This permits us to cut off the triangle "tails" that are present +    * in the intermediate recursive levels caused when two of the +    * triangles edges don't diverge quickly enough to trivially reject +    * exterior blocks from the triangle. +    * +    * It's not really clear if it's worth worrying about these tails, +    * but since we generate the planes for each scissored tri, it's +    * free to trim them in this case. +    *  +    * Note that otherwise, the scissor planes only vary in 'C' value, +    * and even then only on state-changes.  Could alternatively store +    * these planes elsewhere. 
+    */ +   if (nr_planes == 7) { +      tri->plane[3].step = step_scissor_minx; +      tri->plane[3].dcdx = -1; +      tri->plane[3].dcdy = 0; +      tri->plane[3].c = 1-minx; +      tri->plane[3].ei = 0; +      tri->plane[3].eo = 1; + +      tri->plane[4].step = step_scissor_maxx; +      tri->plane[4].dcdx = 1; +      tri->plane[4].dcdy = 0; +      tri->plane[4].c = maxx; +      tri->plane[4].ei = -1; +      tri->plane[4].eo = 0; + +      tri->plane[5].step = step_scissor_miny; +      tri->plane[5].dcdx = 0; +      tri->plane[5].dcdy = 1; +      tri->plane[5].c = 1-miny; +      tri->plane[5].ei = 0; +      tri->plane[5].eo = 1; + +      tri->plane[6].step = step_scissor_maxy; +      tri->plane[6].dcdx = 0; +      tri->plane[6].dcdy = -1; +      tri->plane[6].c = maxy; +      tri->plane[6].ei = -1; +      tri->plane[6].eo = 0; +   } + +     /*      * All fields of 'tri' are now set.  The remaining code here is      * concerned with binning.      */ -   /* Convert to tile coordinates: +   /* Convert to tile coordinates, and inclusive ranges:      */ -   minx = minx / TILE_SIZE; -   miny = miny / TILE_SIZE; -   maxx = maxx / TILE_SIZE; -   maxy = maxy / TILE_SIZE; +   ix0 = minx / TILE_SIZE; +   iy0 = miny / TILE_SIZE; +   ix1 = (maxx-1) / TILE_SIZE; +   iy1 = (maxy-1) / TILE_SIZE;     /*      * Clamp to framebuffer size      */ -   minx = MAX2(minx, 0); -   miny = MAX2(miny, 0); -   maxx = MIN2(maxx, scene->tiles_x - 1); -   maxy = MIN2(maxy, scene->tiles_y - 1); +   assert(ix0 == MAX2(ix0, 0)); +   assert(iy0 == MAX2(iy0, 0)); +   assert(ix1 == MIN2(ix1, scene->tiles_x - 1)); +   assert(iy1 == MIN2(iy1, scene->tiles_y - 1));     /* Determine which tile(s) intersect the triangle's bounding box      */ -   if (miny == maxy && minx == maxx) +   if (iy0 == iy1 && ix0 == ix1)     {        /* Triangle is contained in a single tile:         */ -      lp_scene_bin_command( scene, minx, miny, lp_rast_triangle,  -			    lp_rast_arg_triangle(tri) ); +      
lp_scene_bin_command( scene, ix0, iy0, +                            lp_rast_tri_tab[nr_planes],  +			    lp_rast_arg_triangle(tri, (1<<nr_planes)-1) );     } -   else  +   else     { -      int c1 = (tri->c1 +  -                tri->dx12 * miny * TILE_SIZE -  -                tri->dy12 * minx * TILE_SIZE); -      int c2 = (tri->c2 +  -                tri->dx23 * miny * TILE_SIZE - -                tri->dy23 * minx * TILE_SIZE); -      int c3 = (tri->c3 + -                tri->dx31 * miny * TILE_SIZE - -                tri->dy31 * minx * TILE_SIZE); - -      int ei1 = tri->ei1 << TILE_ORDER; -      int ei2 = tri->ei2 << TILE_ORDER; -      int ei3 = tri->ei3 << TILE_ORDER; - -      int eo1 = tri->eo1 << TILE_ORDER; -      int eo2 = tri->eo2 << TILE_ORDER; -      int eo3 = tri->eo3 << TILE_ORDER; +      int c[7]; +      int ei[7]; +      int eo[7]; +      int xstep[7]; +      int ystep[7]; +      int x, y; +       +      for (i = 0; i < nr_planes; i++) { +         c[i] = (tri->plane[i].c +  +                 tri->plane[i].dcdy * iy0 * TILE_SIZE -  +                 tri->plane[i].dcdx * ix0 * TILE_SIZE); -      int xstep1 = -(tri->dy12 << TILE_ORDER); -      int xstep2 = -(tri->dy23 << TILE_ORDER); -      int xstep3 = -(tri->dy31 << TILE_ORDER); +         ei[i] = tri->plane[i].ei << TILE_ORDER; +         eo[i] = tri->plane[i].eo << TILE_ORDER; +         xstep[i] = -(tri->plane[i].dcdx << TILE_ORDER); +         ystep[i] = tri->plane[i].dcdy << TILE_ORDER; +      } -      int ystep1 = tri->dx12 << TILE_ORDER; -      int ystep2 = tri->dx23 << TILE_ORDER; -      int ystep3 = tri->dx31 << TILE_ORDER; -      int x, y;        /* Test tile-sized blocks against the triangle. @@ -586,32 +729,49 @@ do_triangle_ccw(struct lp_setup_context *setup,         * contained inside the tri, bin an lp_rast_shade_tile command.         * Else, bin a lp_rast_triangle command.         
*/ -      for (y = miny; y <= maxy; y++) +      for (y = iy0; y <= iy1; y++)        { -	 int cx1 = c1; -	 int cx2 = c2; -	 int cx3 = c3;  	 boolean in = FALSE;  /* are we inside the triangle? */ +	 int cx[7]; -	 for (x = minx; x <= maxx; x++) +         for (i = 0; i < nr_planes; i++) +            cx[i] = c[i]; + +	 for (x = ix0; x <= ix1; x++)  	 { -	    if (cx1 + eo1 < 0 ||  -		cx2 + eo2 < 0 || -		cx3 + eo3 < 0)  -	    { -	       /* do nothing */ +            int out = 0; +            int partial = 0; + +            for (i = 0; i < nr_planes; i++) { +               int planeout = cx[i] + eo[i]; +               int planepartial = cx[i] + ei[i] - 1; +               out |= (planeout >> 31); +               partial |= (planepartial >> 31) & (1<<i); +            } + +            if (out) { +               /* do nothing */ +               if (in) +                  break;  /* exiting triangle, all done with this row */                 LP_COUNT(nr_empty_64); -	       if (in) -		  break;  /* exiting triangle, all done with this row */ -	    } -	    else if (cx1 + ei1 > 0 && -		     cx2 + ei2 > 0 && -		     cx3 + ei3 > 0)  -	    { +            } +            else if (partial) { +               /* Not trivially accepted by at least one plane -  +                * rasterize/shade partial tile +                */ +               int count = util_bitcount(partial); +               in = TRUE; +               lp_scene_bin_command( scene, x, y, +                                     lp_rast_tri_tab[count],  +                                     lp_rast_arg_triangle(tri, partial) ); + +               LP_COUNT(nr_partially_covered_64); +            } +            else {                 /* triangle covers the whole tile- shade whole tile */                 LP_COUNT(nr_fully_covered_64); -	       in = TRUE; -	       if (setup->fs.current.variant->opaque && +               in = TRUE; +	       if (variant->opaque &&  	           !setup->fb.zsbuf) {  	          lp_scene_bin_reset( scene, 
x, y );  	          lp_scene_bin_command( scene, x, y, @@ -621,29 +781,18 @@ do_triangle_ccw(struct lp_setup_context *setup,                 lp_scene_bin_command( scene, x, y,  				     lp_rast_shade_tile,  				     lp_rast_arg_inputs(&tri->inputs) ); -	    } -	    else  -	    {  -               /* rasterizer/shade partial tile */ -               LP_COUNT(nr_partially_covered_64); -	       in = TRUE; -               lp_scene_bin_command( scene, x, y, -				     lp_rast_triangle,  -				     lp_rast_arg_triangle(tri) ); -	    } +            }  	    /* Iterate cx values across the region:  	     */ -	    cx1 += xstep1; -	    cx2 += xstep2; -	    cx3 += xstep3; +            for (i = 0; i < nr_planes; i++) +               cx[i] += xstep[i];  	 }  	 /* Iterate c values down the region:  	  */ -	 c1 += ystep1; -	 c2 += ystep2; -	 c3 += ystep3;     +         for (i = 0; i < nr_planes; i++) +            c[i] += ystep[i];        }     }  } diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index 65115052cd..5953d690a4 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -31,9 +31,6 @@   * Code generate the whole fragment pipeline.   *   * The fragment pipeline consists of the following stages: - * - triangle edge in/out testing - * - scissor test - * - stipple (TBI)   * - early depth test   * - fragment shader   * - alpha test @@ -97,6 +94,7 @@  #include "lp_state.h"  #include "lp_tex_sample.h"  #include "lp_flush.h" +#include "lp_state_fs.h"  #include <llvm-c/Analysis.h> @@ -170,177 +168,63 @@ generate_depth_stencil(LLVMBuilderRef builder,  /** - * Generate the code to do inside/outside triangle testing for the + * Expand the relevant bits of mask_input to a 4-dword mask for the    * four pixels in a 2x2 quad.  This will set the four elements of the   * quad mask vector to 0 or ~0. 
- * \param i  which quad of the quad group to test, in [0,3] + * + * \param quad  which quad of the quad group to test, in [0,3] + * \param mask_input  bitwise mask for the whole 4x4 stamp   */ -static void -generate_tri_edge_mask(LLVMBuilderRef builder, -                       unsigned i, -                       LLVMValueRef *mask,      /* ivec4, out */ -                       LLVMValueRef c0,         /* int32 */ -                       LLVMValueRef c1,         /* int32 */ -                       LLVMValueRef c2,         /* int32 */ -                       LLVMValueRef step0_ptr,  /* ivec4 */ -                       LLVMValueRef step1_ptr,  /* ivec4 */ -                       LLVMValueRef step2_ptr)  /* ivec4 */ +static LLVMValueRef +generate_quad_mask(LLVMBuilderRef builder, +                   struct lp_type fs_type, +                   unsigned quad, +                   LLVMValueRef mask_input) /* int32 */  { -#define OPTIMIZE_IN_OUT_TEST 0 -#if OPTIMIZE_IN_OUT_TEST -   struct lp_build_if_state ifctx; -   LLVMValueRef not_draw_all; -#endif -   struct lp_build_flow_context *flow; -   struct lp_type i32_type; -   LLVMTypeRef i32vec4_type; -   LLVMValueRef c0_vec, c1_vec, c2_vec; -   LLVMValueRef in_out_mask; - -   assert(i < 4); -    -   /* int32 vector type */ -   memset(&i32_type, 0, sizeof i32_type); -   i32_type.floating = FALSE; /* values are integers */ -   i32_type.sign = TRUE;      /* values are signed */ -   i32_type.norm = FALSE;     /* values are not normalized */ -   i32_type.width = 32;       /* 32-bit int values */ -   i32_type.length = 4;       /* 4 elements per vector */ - -   i32vec4_type = lp_build_int32_vec4_type(); +   struct lp_type mask_type; +   LLVMTypeRef i32t = LLVMInt32Type(); +   LLVMValueRef bits[4]; +   LLVMValueRef mask;     /* -    * Use a conditional here to do detailed pixel in/out testing. -    * We only have to do this if c0 != INT_MIN. 
+    * XXX: We'll need a different path for 16 x u8      */ -   flow = lp_build_flow_create(builder); -   lp_build_flow_scope_begin(flow); - -   { -#if OPTIMIZE_IN_OUT_TEST -      /* not_draw_all = (c0 != INT_MIN) */ -      not_draw_all = LLVMBuildICmp(builder, -                                   LLVMIntNE, -                                   c0, -                                   LLVMConstInt(LLVMInt32Type(), INT_MIN, 0), -                                   ""); - -      in_out_mask = lp_build_const_int_vec(i32_type, ~0); +   assert(fs_type.width == 32); +   assert(fs_type.length == 4); +   mask_type = lp_int_type(fs_type); - -      lp_build_flow_scope_declare(flow, &in_out_mask); - -      /* if (not_draw_all) {... */ -      lp_build_if(&ifctx, flow, builder, not_draw_all); -#endif -      { -         LLVMValueRef step0_vec, step1_vec, step2_vec; -         LLVMValueRef m0_vec, m1_vec, m2_vec; -         LLVMValueRef index, m; - -         /* c0_vec = {c0, c0, c0, c0} -          * Note that we emit this code four times but LLVM optimizes away -          * three instances of it. 
-          */ -         c0_vec = lp_build_broadcast(builder, i32vec4_type, c0); -         c1_vec = lp_build_broadcast(builder, i32vec4_type, c1); -         c2_vec = lp_build_broadcast(builder, i32vec4_type, c2); -         lp_build_name(c0_vec, "edgeconst0vec"); -         lp_build_name(c1_vec, "edgeconst1vec"); -         lp_build_name(c2_vec, "edgeconst2vec"); - -         /* load step0vec, step1, step2 vec from memory */ -         index = LLVMConstInt(LLVMInt32Type(), i, 0); -         step0_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step0_ptr, &index, 1, ""), ""); -         step1_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step1_ptr, &index, 1, ""), ""); -         step2_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step2_ptr, &index, 1, ""), ""); -         lp_build_name(step0_vec, "step0vec"); -         lp_build_name(step1_vec, "step1vec"); -         lp_build_name(step2_vec, "step2vec"); - -         /* m0_vec = step0_ptr[i] > c0_vec */ -         m0_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step0_vec, c0_vec); -         m1_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step1_vec, c1_vec); -         m2_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step2_vec, c2_vec); - -         /* in_out_mask = m0_vec & m1_vec & m2_vec */ -         m = LLVMBuildAnd(builder, m0_vec, m1_vec, ""); -         in_out_mask = LLVMBuildAnd(builder, m, m2_vec, ""); -         lp_build_name(in_out_mask, "inoutmaskvec"); -      } -#if OPTIMIZE_IN_OUT_TEST -      lp_build_endif(&ifctx); -#endif - -   } -   lp_build_flow_scope_end(flow); -   lp_build_flow_destroy(flow); - -   /* This is the initial alive/dead pixel mask for a quad of four pixels. -    * It's an int[4] vector with each word set to 0 or ~0. -    * Words will get cleared when pixels faile the Z test, etc. 
+   /* +    * mask_input >>= (quad * 4)      */ -   *mask = in_out_mask; -} - - -static LLVMValueRef -generate_scissor_test(LLVMBuilderRef builder, -                      LLVMValueRef context_ptr, -                      const struct lp_build_interp_soa_context *interp, -                      struct lp_type type) -{ -   LLVMTypeRef vec_type = lp_build_vec_type(type); -   LLVMValueRef xpos = interp->pos[0], ypos = interp->pos[1]; -   LLVMValueRef xmin, ymin, xmax, ymax; -   LLVMValueRef m0, m1, m2, m3, m; - -   /* xpos, ypos contain the window coords for the four pixels in the quad */ -   assert(xpos); -   assert(ypos); - -   /* get the current scissor bounds, convert to vectors */ -   xmin = lp_jit_context_scissor_xmin_value(builder, context_ptr); -   xmin = lp_build_broadcast(builder, vec_type, xmin); - -   ymin = lp_jit_context_scissor_ymin_value(builder, context_ptr); -   ymin = lp_build_broadcast(builder, vec_type, ymin); -   xmax = lp_jit_context_scissor_xmax_value(builder, context_ptr); -   xmax = lp_build_broadcast(builder, vec_type, xmax); +   mask_input = LLVMBuildLShr(builder, +                              mask_input, +                              LLVMConstInt(i32t, quad * 4, 0), +                              ""); -   ymax = lp_jit_context_scissor_ymax_value(builder, context_ptr); -   ymax = lp_build_broadcast(builder, vec_type, ymax); - -   /* compare the fragment's position coordinates against the scissor bounds */ -   m0 = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, xpos, xmin); -   m1 = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, ypos, ymin); -   m2 = lp_build_compare(builder, type, PIPE_FUNC_LESS, xpos, xmax); -   m3 = lp_build_compare(builder, type, PIPE_FUNC_LESS, ypos, ymax); +   /* +    * mask = { mask_input & (1 << i), for i in [0,3] } +    */ -   /* AND all the masks together */ -   m = LLVMBuildAnd(builder, m0, m1, ""); -   m = LLVMBuildAnd(builder, m, m2, ""); -   m = LLVMBuildAnd(builder, m, m3, ""); +   mask = 
lp_build_broadcast(builder, lp_build_vec_type(mask_type), mask_input); -   lp_build_name(m, "scissormask"); +   bits[0] = LLVMConstInt(i32t, 1 << 0, 0); +   bits[1] = LLVMConstInt(i32t, 1 << 1, 0); +   bits[2] = LLVMConstInt(i32t, 1 << 2, 0); +   bits[3] = LLVMConstInt(i32t, 1 << 3, 0); -   return m; -} +   mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, 4), ""); +   /* +    * mask = mask != 0 ? ~0 : 0 +    */ -static LLVMValueRef -build_int32_vec_const(int value) -{ -   struct lp_type i32_type; +   mask = lp_build_compare(builder, +                           mask_type, PIPE_FUNC_NOTEQUAL, +                           mask, +                           lp_build_const_int_vec(mask_type, 0)); -   memset(&i32_type, 0, sizeof i32_type); -   i32_type.floating = FALSE; /* values are integers */ -   i32_type.sign = TRUE;      /* values are signed */ -   i32_type.norm = FALSE;     /* values are not normalized */ -   i32_type.width = 32;       /* 32-bit int values */ -   i32_type.length = 4;       /* 4 elements per vector */ -   return lp_build_const_int_vec(i32_type, value); +   return mask;  } @@ -348,7 +232,7 @@ build_int32_vec_const(int value)  /**   * Generate the fragment shader, depth/stencil test, and alpha tests.   
* \param i  which quad in the tile, in range [0,3] - * \param do_tri_test  if 1, do triangle edge in/out testing + * \param partial_mask  if 1, do mask_input testing   */  static void  generate_fs(struct llvmpipe_context *lp, @@ -364,13 +248,8 @@ generate_fs(struct llvmpipe_context *lp,              LLVMValueRef (*color)[4],              LLVMValueRef depth_ptr,              LLVMValueRef facing, -            unsigned do_tri_test, -            LLVMValueRef c0, -            LLVMValueRef c1, -            LLVMValueRef c2, -            LLVMValueRef step0_ptr, -            LLVMValueRef step1_ptr, -            LLVMValueRef step2_ptr, +            unsigned partial_mask, +            LLVMValueRef mask_input,              LLVMValueRef counter)  {     const struct tgsi_token *tokens = shader->base.tokens; @@ -411,23 +290,17 @@ generate_fs(struct llvmpipe_context *lp,     lp_build_flow_scope_declare(flow, &z);     /* do triangle edge testing */ -   if (do_tri_test) { -      generate_tri_edge_mask(builder, i, pmask, -                             c0, c1, c2, step0_ptr, step1_ptr, step2_ptr); +   if (partial_mask) { +      *pmask = generate_quad_mask(builder, type, +                                  i, mask_input);     }     else { -      *pmask = build_int32_vec_const(~0); +      *pmask = lp_build_const_int_vec(type, ~0);     }     /* 'mask' will control execution based on quad's pixel alive/killed state */     lp_build_mask_begin(&mask, flow, type, *pmask); -   if (key->scissor) { -      LLVMValueRef smask = -         generate_scissor_test(builder, context_ptr, interp, type); -      lp_build_mask_update(&mask, smask); -   } -     early_depth_stencil_test =        (key->depth.enabled || key->stencil[0].enabled) &&        !key->alpha.enabled && @@ -579,7 +452,7 @@ static void  generate_fragment(struct llvmpipe_context *lp,                    struct lp_fragment_shader *shader,                    struct lp_fragment_shader_variant *variant, -                  unsigned do_tri_test) +  
                unsigned partial_mask)  {     struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);     const struct lp_fragment_shader_variant_key *key = &variant->key; @@ -589,9 +462,8 @@ generate_fragment(struct llvmpipe_context *lp,     LLVMTypeRef fs_elem_type;     LLVMTypeRef fs_int_vec_type;     LLVMTypeRef blend_vec_type; -   LLVMTypeRef arg_types[16]; +   LLVMTypeRef arg_types[11];     LLVMTypeRef func_type; -   LLVMTypeRef int32_vec4_type = lp_build_int32_vec4_type();     LLVMValueRef context_ptr;     LLVMValueRef x;     LLVMValueRef y; @@ -600,7 +472,8 @@ generate_fragment(struct llvmpipe_context *lp,     LLVMValueRef dady_ptr;     LLVMValueRef color_ptr_ptr;     LLVMValueRef depth_ptr; -   LLVMValueRef c0, c1, c2, step0_ptr, step1_ptr, step2_ptr, counter = NULL; +   LLVMValueRef mask_input; +   LLVMValueRef counter = NULL;     LLVMBasicBlockRef block;     LLVMBuilderRef builder;     struct lp_build_sampler_soa *sampler; @@ -645,7 +518,7 @@ generate_fragment(struct llvmpipe_context *lp,     blend_vec_type = lp_build_vec_type(blend_type);     util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s",  -		 shader->no, variant->no, do_tri_test ? "edge" : "whole"); +		 shader->no, variant->no, partial_mask ? 
"partial" : "whole");     arg_types[0] = screen->context_ptr_type;            /* context */     arg_types[1] = LLVMInt32Type();                     /* x */ @@ -656,23 +529,15 @@ generate_fragment(struct llvmpipe_context *lp,     arg_types[6] = LLVMPointerType(fs_elem_type, 0);    /* dady */     arg_types[7] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0);  /* color */     arg_types[8] = LLVMPointerType(fs_int_vec_type, 0); /* depth */ -   arg_types[9] = LLVMInt32Type();                     /* c0 */ -   arg_types[10] = LLVMInt32Type();                    /* c1 */ -   arg_types[11] = LLVMInt32Type();                    /* c2 */ -   /* Note: the step arrays are built as int32[16] but we interpret -    * them here as int32_vec4[4]. -    */ -   arg_types[12] = LLVMPointerType(int32_vec4_type, 0);/* step0 */ -   arg_types[13] = LLVMPointerType(int32_vec4_type, 0);/* step1 */ -   arg_types[14] = LLVMPointerType(int32_vec4_type, 0);/* step2 */ -   arg_types[15] = LLVMPointerType(LLVMInt32Type(), 0);/* counter */ +   arg_types[9] = LLVMInt32Type();                     /* mask_input */ +   arg_types[10] = LLVMPointerType(LLVMInt32Type(), 0);/* counter */     func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);     function = LLVMAddFunction(screen->module, func_name, func_type);     LLVMSetFunctionCallConv(function, LLVMCCallConv); -   variant->function[do_tri_test] = function; +   variant->function[partial_mask] = function;     /* XXX: need to propagate noalias down into color param now we are @@ -691,12 +556,7 @@ generate_fragment(struct llvmpipe_context *lp,     dady_ptr     = LLVMGetParam(function, 6);     color_ptr_ptr = LLVMGetParam(function, 7);     depth_ptr    = LLVMGetParam(function, 8); -   c0           = LLVMGetParam(function, 9); -   c1           = LLVMGetParam(function, 10); -   c2           = LLVMGetParam(function, 11); -   step0_ptr    = LLVMGetParam(function, 12); -   step1_ptr    = LLVMGetParam(function, 13); -   
step2_ptr    = LLVMGetParam(function, 14); +   mask_input   = LLVMGetParam(function, 9);     lp_build_name(context_ptr, "context");     lp_build_name(x, "x"); @@ -706,15 +566,10 @@ generate_fragment(struct llvmpipe_context *lp,     lp_build_name(dady_ptr, "dady");     lp_build_name(color_ptr_ptr, "color_ptr_ptr");     lp_build_name(depth_ptr, "depth"); -   lp_build_name(c0, "c0"); -   lp_build_name(c1, "c1"); -   lp_build_name(c2, "c2"); -   lp_build_name(step0_ptr, "step0"); -   lp_build_name(step1_ptr, "step1"); -   lp_build_name(step2_ptr, "step2"); +   lp_build_name(mask_input, "mask_input");     if (key->occlusion_count) { -      counter = LLVMGetParam(function, 15); +      counter = LLVMGetParam(function, 10);        lp_build_name(counter, "counter");     } @@ -763,9 +618,9 @@ generate_fragment(struct llvmpipe_context *lp,                    out_color,                    depth_ptr_i,                    facing, -                  do_tri_test, -                  c0, c1, c2, -                  step0_ptr, step1_ptr, step2_ptr, counter); +                  partial_mask, +                  mask_input, +                  counter);        for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++)  	 for(chan = 0; chan < NUM_CHANNELS; ++chan) @@ -792,9 +647,13 @@ generate_fragment(struct llvmpipe_context *lp,  	 lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);        } -      lp_build_conv_mask(builder, fs_type, blend_type, -			 fs_mask, num_fs, -			 &blend_mask, 1); +      if (partial_mask || !variant->opaque) { +         lp_build_conv_mask(builder, fs_type, blend_type, +                            fs_mask, num_fs, +                            &blend_mask, 1); +      } else { +         blend_mask = lp_build_const_int_vec(blend_type, ~0); +      }        color_ptr = LLVMBuildLoad(builder,   				LLVMBuildGEP(builder, color_ptr_ptr, &index, 1, ""), @@ -832,8 +691,7 @@ generate_fragment(struct llvmpipe_context *lp,  #endif     /* Apply optimizations to LLVM IR */ 
-   if (1) -      LLVMRunFunctionPassManager(screen->pass, function); +   LLVMRunFunctionPassManager(screen->pass, function);     if (gallivm_debug & GALLIVM_DEBUG_IR) {        /* Print the LLVM IR to stderr */ @@ -847,7 +705,7 @@ generate_fragment(struct llvmpipe_context *lp,     {        void *f = LLVMGetPointerToGlobal(screen->engine, function); -      variant->jit_function[do_tri_test] = (lp_jit_frag_func)pointer_to_func(f); +      variant->jit_function[partial_mask] = (lp_jit_frag_func)pointer_to_func(f);        if (gallivm_debug & GALLIVM_DEBUG_ASM) {           lp_disassemble(f); @@ -963,7 +821,6 @@ generate_variant(struct llvmpipe_context *lp,           !key->stencil[0].enabled &&           !key->alpha.enabled &&           !key->depth.enabled && -         !key->scissor &&           !shader->info.uses_kill           ? TRUE : FALSE; @@ -1182,7 +1039,6 @@ make_variant_key(struct llvmpipe_context *lp,     /* alpha.ref_value is passed in jit_context */     key->flatshade = lp->rasterizer->flatshade; -   key->scissor = lp->rasterizer->scissor;     if (lp->active_query_count) {        key->occlusion_count = TRUE;     } diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h index 593cd4de6b..37900fc544 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.h +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h @@ -54,7 +54,6 @@ struct lp_fragment_shader_variant_key     enum pipe_format zsbuf_format;     unsigned nr_cbufs:8;     unsigned flatshade:1; -   unsigned scissor:1;     unsigned occlusion_count:1;     struct {  | 
