From ab9438193083b7f9a3180cb9cea45e269131048a Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 16 Dec 2009 16:02:59 -0700 Subject: llvmpipe: do final the pixel in/out triangle test in the fragment shader The test to determine which of the pixels in a 2x2 quad is now done in the fragment shader rather than in the calling C code. This is a little faster but there's a few more things to do. Note that the step[] array elements are in a different order now. Rather than being in row-major order for the 4x4 grid, they're in "quad-major" order. The setup of the step arrays is a little more complicated now. So is the course/intermediate tile test code, but some lookup tables help with that. Next steps: - early-cull 2x2 quads which are totally outside the triangle. - skip the in/out test for fully contained quads - make the in/out comparison code tighter/faster. --- src/gallium/drivers/llvmpipe/lp_jit.h | 9 +- src/gallium/drivers/llvmpipe/lp_rast.c | 76 +++------- src/gallium/drivers/llvmpipe/lp_rast.h | 11 +- src/gallium/drivers/llvmpipe/lp_rast_priv.h | 11 +- src/gallium/drivers/llvmpipe/lp_rast_tri.c | 222 +++++++++++++++------------- src/gallium/drivers/llvmpipe/lp_setup_tri.c | 49 +++--- src/gallium/drivers/llvmpipe/lp_state_fs.c | 144 ++++++++++++++++-- 7 files changed, 302 insertions(+), 220 deletions(-) (limited to 'src/gallium/drivers/llvmpipe') diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h index 7eccb5da85..e8fb7d990f 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.h +++ b/src/gallium/drivers/llvmpipe/lp_jit.h @@ -114,9 +114,14 @@ typedef void const void *a0, const void *dadx, const void *dady, - const uint32_t *mask, void *color, - void *depth); + void *depth, + const int32_t c1, + const int32_t c2, + const int32_t c3, + const int32_t *step1, + const int32_t *step2, + const int32_t *step3); diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c index ec87d907b8..b1bd27d340 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/src/gallium/drivers/llvmpipe/lp_rast.c @@ -25,6 +25,7 @@ * **************************************************************************/ +#include #include "util/u_memory.h" #include "util/u_math.h" #include "util/u_cpu_detect.h" @@ -279,6 +280,8 @@ void lp_rast_shade_tile( struct lp_rasterizer *rast, unsigned thread_index, const union lp_rast_cmd_arg arg ) { + /* Set c1,c2,c3 to large values so the in/out test always passes */ + const int32_t c1 = INT_MAX/2, c2 = INT_MAX/2, c3 = INT_MAX/2; const struct lp_rast_shader_inputs *inputs = arg.shade_tile; const unsigned tile_x = rast->tasks[thread_index].x; const unsigned tile_y = rast->tasks[thread_index].y; @@ -296,7 +299,7 @@ void lp_rast_shade_tile( struct lp_rasterizer *rast, inputs, tile_x + x, tile_y + y, - mask); + c1, c2, c3); } @@ -308,58 +311,25 @@ void lp_rast_shade_quads( struct lp_rasterizer *rast, unsigned thread_index, const struct lp_rast_shader_inputs *inputs, unsigned x, unsigned y, - unsigned mask) + int32_t c1, int32_t c2, int32_t c3) { -#if 1 const struct lp_rast_state *state = rast->tasks[thread_index].current_state; struct lp_rast_tile *tile = &rast->tasks[thread_index].tile; void *color; void *depth; - uint32_t ALIGN16_ATTRIB masks[2][2][2][2]; unsigned ix, iy; int block_offset; +#ifdef DEBUG assert(state); /* Sanity checks */ assert(x % TILE_VECTOR_WIDTH == 0); assert(y % TILE_VECTOR_HEIGHT == 0); - /* mask: the rasterizer wants to treat pixels in 4x4 blocks, but - * the pixel shader wants to swizzle them into 4 2x2 quads. - * - * Additionally, the pixel shader wants masks as full dword ~0, - * while the rasterizer wants to pack per-pixel bits tightly. - */ -#if 0 - unsigned qx, qy; - for (qy = 0; qy < 2; ++qy) - for (qx = 0; qx < 2; ++qx) - for (iy = 0; iy < 2; ++iy) - for (ix = 0; ix < 2; ++ix) - masks[qy][qx][iy][ix] = mask & (1 << (qy*8+iy*4+qx*2+ix)) ? ~0 : 0; -#else - masks[0][0][0][0] = mask & (1 << (0*8+0*4+0*2+0)) ? ~0 : 0; - masks[0][0][0][1] = mask & (1 << (0*8+0*4+0*2+1)) ? ~0 : 0; - masks[0][0][1][0] = mask & (1 << (0*8+1*4+0*2+0)) ? ~0 : 0; - masks[0][0][1][1] = mask & (1 << (0*8+1*4+0*2+1)) ? ~0 : 0; - masks[0][1][0][0] = mask & (1 << (0*8+0*4+1*2+0)) ? ~0 : 0; - masks[0][1][0][1] = mask & (1 << (0*8+0*4+1*2+1)) ? ~0 : 0; - masks[0][1][1][0] = mask & (1 << (0*8+1*4+1*2+0)) ? ~0 : 0; - masks[0][1][1][1] = mask & (1 << (0*8+1*4+1*2+1)) ? ~0 : 0; - - masks[1][0][0][0] = mask & (1 << (1*8+0*4+0*2+0)) ? ~0 : 0; - masks[1][0][0][1] = mask & (1 << (1*8+0*4+0*2+1)) ? ~0 : 0; - masks[1][0][1][0] = mask & (1 << (1*8+1*4+0*2+0)) ? ~0 : 0; - masks[1][0][1][1] = mask & (1 << (1*8+1*4+0*2+1)) ? ~0 : 0; - masks[1][1][0][0] = mask & (1 << (1*8+0*4+1*2+0)) ? ~0 : 0; - masks[1][1][0][1] = mask & (1 << (1*8+0*4+1*2+1)) ? ~0 : 0; - masks[1][1][1][0] = mask & (1 << (1*8+1*4+1*2+0)) ? ~0 : 0; - masks[1][1][1][1] = mask & (1 << (1*8+1*4+1*2+1)) ? ~0 : 0; -#endif - assert((x % 4) == 0); assert((y % 4) == 0); +#endif ix = x % TILE_SIZE; iy = y % TILE_SIZE; @@ -373,39 +343,27 @@ void lp_rast_shade_quads( struct lp_rasterizer *rast, /* depth buffer */ depth = tile->depth + block_offset; - /* XXX: This will most likely fail on 32bit x86 without -mstackrealign */ - assert(lp_check_alignment(masks, 16)); - +#ifdef DEBUG assert(lp_check_alignment(depth, 16)); assert(lp_check_alignment(color, 16)); assert(lp_check_alignment(state->jit_context.blend_color, 16)); + assert(lp_check_alignment(inputs->step[0], 16)); + assert(lp_check_alignment(inputs->step[1], 16)); + assert(lp_check_alignment(inputs->step[2], 16)); +#endif + /* run shader */ state->jit_function( &state->jit_context, x, y, inputs->a0, inputs->dadx, inputs->dady, - &masks[0][0][0][0], color, - depth); -#else - struct lp_rast_tile *tile = &rast->tile; - unsigned chan_index; - unsigned q, ix, iy; - - x %= TILE_SIZE; - y %= TILE_SIZE; - - /* mask */ - for (q = 0; q < 4; ++q) - for(iy = 0; iy < 2; ++iy) - for(ix = 0; ix < 2; ++ix) - if(masks[q] & (1 << (iy*2 + ix))) - for (chan_index = 0; chan_index < NUM_CHANNELS; ++chan_index) - TILE_PIXEL(tile->color, x + q*2 + ix, y + iy, chan_index) = 0xff; - -#endif + depth, + c1, c2, c3, + inputs->step[0], inputs->step[1], inputs->step[2] + ); } diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index 2dd0193d8d..46e22f69a6 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -80,6 +80,9 @@ struct lp_rast_shader_inputs { float (*a0)[4]; float (*dadx)[4]; float (*dady)[4]; + + /* edge/step info for 3 edges and 4x4 block of pixels */ + int ALIGN16_ATTRIB step[3][16]; }; @@ -117,14 +120,10 @@ struct lp_rast_triangle { int dx31; /* edge function values at minx,miny ?? */ - int c1; - int c2; - int c3; - - int step[3][16]; + int c1, c2, c3; /* inputs for the shader */ - struct lp_rast_shader_inputs inputs; + struct lp_rast_shader_inputs ALIGN16_ATTRIB inputs; }; diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h index 79a90f6610..cd72d7e69d 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h @@ -61,15 +61,6 @@ struct lp_rasterizer_task unsigned x, y; /**< Pos of this tile in framebuffer, in pixels */ - /* Pixel blocks produced during rasterization - */ - unsigned nr_blocks; - struct { - unsigned x; - unsigned y; - unsigned mask; - } blocks[256]; - const struct lp_rast_state *current_state; /** "back" pointer */ @@ -133,6 +124,6 @@ void lp_rast_shade_quads( struct lp_rasterizer *rast, unsigned thread_index, const struct lp_rast_shader_inputs *inputs, unsigned x, unsigned y, - unsigned masks); + int32_t c1, int32_t c2, int32_t c3); #endif diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index 6c96010c52..9b1861223a 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -29,6 +29,7 @@ * Rasterization for binned triangles within a tile */ +#include #include "util/u_math.h" #include "lp_debug.h" #include "lp_rast_priv.h" @@ -36,42 +37,89 @@ /** - * Add a 4x4 block of pixels to the block list. - * All pixels are known to be inside the triangle's bounds. + * Map an index in [0,15] to an x,y position, multiplied by 4. + * This is used to get the position of each subtile in a 4x4 + * grid of edge step values. + */ +static const int pos_table4[16][2] = { + { 0, 0 }, + { 4, 0 }, + { 0, 4 }, + { 4, 4 }, + { 8, 0 }, + { 12, 0 }, + { 8, 4 }, + { 12, 4 }, + { 0, 8 }, + { 4, 8 }, + { 0, 12 }, + { 4, 12 }, + { 8, 8 }, + { 12, 8 }, + { 8, 12 }, + { 12, 12 } +}; + + +static const int pos_table16[16][2] = { + { 0, 0 }, + { 16, 0 }, + { 0, 16 }, + { 16, 16 }, + { 32, 0 }, + { 48, 0 }, + { 32, 16 }, + { 48, 16 }, + { 0, 32 }, + { 16, 32 }, + { 0, 48 }, + { 16, 48 }, + { 32, 32 }, + { 48, 32 }, + { 32, 48 }, + { 48, 48 } +}; + + +/** + * Shade all pixels in a 4x4 block. */ static void -block_full_4( struct lp_rasterizer_task *rast_task, int x, int y ) +block_full_4( struct lp_rasterizer_task *rast_task, + const struct lp_rast_triangle *tri, + int x, int y ) { - const unsigned i = rast_task->nr_blocks; - assert(x % 4 == 0); - assert(y % 4 == 0); - rast_task->blocks[i].x = x; - rast_task->blocks[i].y = y; - rast_task->blocks[i].mask = ~0; - rast_task->nr_blocks++; + /* Set c1,c2,c3 to large values so the in/out test always passes */ + const int32_t c1 = INT_MAX/2, c2 = INT_MAX/2, c3 = INT_MAX/2; + lp_rast_shade_quads(rast_task->rast, + rast_task->thread_index, + &tri->inputs, + x, y, + c1, c2, c3); } /** - * Add a 16x16 block of pixels to the block list. - * All pixels are known to be inside the triangle's bounds. + * Shade all pixels in a 16x16 block. */ static void -block_full_16( struct lp_rasterizer_task *rast_task, int x, int y ) +block_full_16( struct lp_rasterizer_task *rast_task, + const struct lp_rast_triangle *tri, + int x, int y ) { unsigned ix, iy; assert(x % 16 == 0); assert(y % 16 == 0); for (iy = 0; iy < 16; iy += 4) for (ix = 0; ix < 16; ix += 4) - block_full_4(rast_task, x + ix, y + iy); + block_full_4(rast_task, tri, x + ix, y + iy); } /** - * Evaluate each pixel in a 4x4 block to determine if it lies within - * the triangle's bounds. - * Generate a mask of in/out flags and add the block to the blocks list. + * Pass the 4x4 pixel block to the shader function. + * Determination of which of the 16 pixels lies inside the triangle + * will be done as part of the fragment shader. */ static void do_block_4( struct lp_rasterizer_task *rast_task, @@ -81,28 +129,11 @@ do_block_4( struct lp_rasterizer_task *rast_task, int c2, int c3 ) { - int i; - unsigned mask = 0; - - assert(x % 4 == 0); - assert(y % 4 == 0); - - for (i = 0; i < 16; i++) { - int any_negative = ((c1 + tri->step[0][i]) | - (c2 + tri->step[1][i]) | - (c3 + tri->step[2][i])) >> 31; - mask |= (~any_negative) & (1 << i); - } - - /* As we do trivial reject already, masks should rarely be all zero: - */ - if (mask) { - const unsigned i = rast_task->nr_blocks; - rast_task->blocks[i].x = x; - rast_task->blocks[i].y = y; - rast_task->blocks[i].mask = mask; - rast_task->nr_blocks++; - } + lp_rast_shade_quads(rast_task->rast, + rast_task->thread_index, + &tri->inputs, + x, y, + c1, c2, c3); } @@ -118,40 +149,42 @@ do_block_16( struct lp_rasterizer_task *rast_task, int c2, int c3 ) { - int ix, iy, i = 0; + const int ei1 = tri->ei1 * 4; + const int ei2 = tri->ei2 * 4; + const int ei3 = tri->ei3 * 4; - int ei1 = tri->ei1 * 4; - int ei2 = tri->ei2 * 4; - int ei3 = tri->ei3 * 4; + const int eo1 = tri->eo1 * 4; + const int eo2 = tri->eo2 * 4; + const int eo3 = tri->eo3 * 4; - int eo1 = tri->eo1 * 4; - int eo2 = tri->eo2 * 4; - int eo3 = tri->eo3 * 4; + int i; assert(x % 16 == 0); assert(y % 16 == 0); - for (iy = 0; iy < 16; iy+=4) { - for (ix = 0; ix < 16; ix+=4, i++) { - int cx1 = c1 + (tri->step[0][i] * 4); - int cx2 = c2 + (tri->step[1][i] * 4); - int cx3 = c3 + (tri->step[2][i] * 4); - - if (cx1 + eo1 < 0 || - cx2 + eo2 < 0 || - cx3 + eo3 < 0) { - /* the block is completely outside the triangle - nop */ - } - else if (cx1 + ei1 > 0 && - cx2 + ei2 > 0 && - cx3 + ei3 > 0) { + for (i = 0; i < 16; i++) { + int cx1 = c1 + (tri->inputs.step[0][i] * 4); + int cx2 = c2 + (tri->inputs.step[1][i] * 4); + int cx3 = c3 + (tri->inputs.step[2][i] * 4); + + if (cx1 + eo1 < 0 || + cx2 + eo2 < 0 || + cx3 + eo3 < 0) { + /* the block is completely outside the triangle - nop */ + } + else { + int px = x + pos_table4[i][0]; + int py = y + pos_table4[i][1]; + if (cx1 + ei1 > 0 && + cx2 + ei2 > 0 && + cx3 + ei3 > 0) { /* the block is completely inside the triangle */ - block_full_4(rast_task, x+ix, y+iy); - } - else { + block_full_4(rast_task, tri, px, py); + } + else { /* the block is partially in/out of the triangle */ - do_block_4(rast_task, tri, x+ix, y+iy, cx1, cx2, cx3); - } + do_block_4(rast_task, tri, px, py, cx1, cx2, cx3); + } } } } @@ -171,8 +204,7 @@ lp_rast_triangle( struct lp_rasterizer *rast, int x = rast_task->x; int y = rast_task->y; - int ix, iy; - unsigned i = 0; + unsigned i; int c1 = tri->c1 + tri->dx12 * y - tri->dy12 * x; int c2 = tri->c2 + tri->dx23 * y - tri->dy23 * x; @@ -186,48 +218,36 @@ lp_rast_triangle( struct lp_rasterizer *rast, int eo2 = tri->eo2 * 16; int eo3 = tri->eo3 * 16; - assert(Elements(rast_task->blocks) == (TILE_SIZE * TILE_SIZE) / (4*4)); - LP_DBG(DEBUG_RAST, "lp_rast_triangle\n"); - rast_task->nr_blocks = 0; - /* Walk over the tile to build a list of 4x4 pixel blocks which will * be filled/shaded. We do this at two granularities: 16x16 blocks * and then 4x4 blocks. */ - for (iy = 0; iy < TILE_SIZE; iy += 16) { - for (ix = 0; ix < TILE_SIZE; ix += 16, i++) { - int cx1 = c1 + (tri->step[0][i] * 16); - int cx2 = c2 + (tri->step[1][i] * 16); - int cx3 = c3 + (tri->step[2][i] * 16); - - if (cx1 + eo1 < 0 || - cx2 + eo2 < 0 || - cx3 + eo3 < 0) { - /* the block is completely outside the triangle - nop */ - } - else if (cx1 + ei1 > 0 && - cx2 + ei2 > 0 && - cx3 + ei3 > 0) { + for (i = 0; i < 16; i++) { + int cx1 = c1 + (tri->inputs.step[0][i] * 16); + int cx2 = c2 + (tri->inputs.step[1][i] * 16); + int cx3 = c3 + (tri->inputs.step[2][i] * 16); + + if (cx1 + eo1 < 0 || + cx2 + eo2 < 0 || + cx3 + eo3 < 0) { + /* the block is completely outside the triangle - nop */ + } + else { + int px = x + pos_table16[i][0]; + int py = y + pos_table16[i][1]; + + if (cx1 + ei1 > 0 && + cx2 + ei2 > 0 && + cx3 + ei3 > 0) { /* the block is completely inside the triangle */ - block_full_16(rast_task, x+ix, y+iy); - } - else { + block_full_16(rast_task, tri, px, py); + } + else { /* the block is partially in/out of the triangle */ - do_block_16(rast_task, tri, x+ix, y+iy, cx1, cx2, cx3); - } + do_block_16(rast_task, tri, px, py, cx1, cx2, cx3); + } } } - - assert(rast_task->nr_blocks <= Elements(rast_task->blocks)); - - /* Shade the 4x4 pixel blocks */ - for (i = 0; i < rast_task->nr_blocks; i++) - lp_rast_shade_quads(rast, - thread_index, - &tri->inputs, - rast_task->blocks[i].x, - rast_task->blocks[i].y, - rast_task->blocks[i].mask); } diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index aeaf260af2..e15b987767 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -265,7 +265,7 @@ do_triangle_ccw(struct setup_context *setup, const int y3 = subpixel_snap(v3[0][1]); struct lp_scene *scene = lp_setup_get_current_scene(setup); - struct lp_rast_triangle *tri = lp_scene_alloc( scene, sizeof *tri ); + struct lp_rast_triangle *tri = lp_scene_alloc_aligned( scene, sizeof *tri, 16 ); float area, oneoverarea; int minx, maxx, miny, maxy; @@ -354,38 +354,29 @@ do_triangle_ccw(struct setup_context *setup, tri->ei3 = tri->dx31 - tri->dy31 - tri->eo3; { - int xstep1 = -tri->dy12; - int xstep2 = -tri->dy23; - int xstep3 = -tri->dy31; + const int xstep1 = -tri->dy12; + const int xstep2 = -tri->dy23; + const int xstep3 = -tri->dy31; - int ystep1 = tri->dx12; - int ystep2 = tri->dx23; - int ystep3 = tri->dx31; + const int ystep1 = tri->dx12; + const int ystep2 = tri->dx23; + const int ystep3 = tri->dx31; - int ix, iy; + int qx, qy, ix, iy; int i = 0; - int c1 = 0; - int c2 = 0; - int c3 = 0; - - for (iy = 0; iy < 4; iy++) { - int cx1 = c1; - int cx2 = c2; - int cx3 = c3; - - for (ix = 0; ix < 4; ix++, i++) { - tri->step[0][i] = cx1; - tri->step[1][i] = cx2; - tri->step[2][i] = cx3; - cx1 += xstep1; - cx2 += xstep2; - cx3 += xstep3; - } - - c1 += ystep1; - c2 += ystep2; - c3 += ystep3; + for (qy = 0; qy < 2; qy++) { + for (qx = 0; qx < 2; qx++) { + for (iy = 0; iy < 2; iy++) { + for (ix = 0; ix < 2; ix++, i++) { + int x = qx * 2 + ix; + int y = qy * 2 + iy; + tri->inputs.step[0][i] = x * xstep1 + y * ystep1; + tri->inputs.step[1][i] = x * xstep2 + y * ystep2; + tri->inputs.step[2][i] = x * xstep3 + y * ystep3; + } + } + } } } diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index c0d5a70a55..4af37e365e 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -175,8 +175,93 @@ generate_depth(LLVMBuilderRef builder, } +/** + * Generate the code to do inside/outside triangle testing for the + * four pixels in a 2x2 quad. This will set the four elements of the + * quad mask vector to 0 or ~0. + * \param i which quad of the quad group to test, in [0,3] + */ +static void +generate_tri_edge_mask(LLVMBuilderRef builder, + unsigned i, + LLVMValueRef *mask, /* ivec4, out */ + LLVMValueRef c0, /* int32 */ + LLVMValueRef c1, /* int32 */ + LLVMValueRef c2, /* int32 */ + LLVMValueRef step0_ptr, /* ivec4 */ + LLVMValueRef step1_ptr, /* ivec4 */ + LLVMValueRef step2_ptr) /* ivec4 */ +{ + /* + c0_vec = splat(c0) + c1_vec = splat(c1) + c2_vec = splat(c2) + s0_vec = c0_vec + step0_ptr[i] + s1_vec = c1_vec + step1_ptr[i] + s2_vec = c2_vec + step2_ptr[i] + m0_vec = s0_vec > {0,0,0,0} + m1_vec = s1_vec > {0,0,0,0} + m2_vec = s2_vec > {0,0,0,0} + mask = m0_vec & m1_vec & m2_vec + */ + struct lp_type i32_type; + LLVMTypeRef i32vec4_type; + + LLVMValueRef index; + LLVMValueRef c0_vec, c1_vec, c2_vec; + LLVMValueRef step0_vec, step1_vec, step2_vec; + LLVMValueRef m0_vec, m1_vec, m2_vec; + LLVMValueRef s0_vec, s1_vec, s2_vec; + LLVMValueRef m; + + LLVMValueRef zeros; + + assert(i < 4); + + /* int32 vector type */ + memset(&i32_type, 0, sizeof i32_type); + i32_type.floating = FALSE; /* values are integers */ + i32_type.sign = TRUE; /* values are signed */ + i32_type.norm = FALSE; /* values are not normalized */ + i32_type.width = 32; /* 32-bit int values */ + i32_type.length = 4; /* 4 elements per vector */ + + i32vec4_type = lp_build_int32_vec4_type(); + + /* int32_vec4 zero = {0,0,0,0} */ + zeros = LLVMConstNull(i32vec4_type); + + c0_vec = lp_build_broadcast(builder, i32vec4_type, c0); + c1_vec = lp_build_broadcast(builder, i32vec4_type, c1); + c2_vec = lp_build_broadcast(builder, i32vec4_type, c2); + + index = LLVMConstInt(LLVMInt32Type(), i, 0); + step0_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step0_ptr, &index, 1, ""), ""); + step1_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step1_ptr, &index, 1, ""), ""); + step2_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step2_ptr, &index, 1, ""), ""); + + /** XXX with a little work, we could remove the add here and just + * compare c0_vec > step0_vec. + */ + s0_vec = LLVMBuildAdd(builder, c0_vec, step0_vec, ""); + s1_vec = LLVMBuildAdd(builder, c1_vec, step1_vec, ""); + s2_vec = LLVMBuildAdd(builder, c2_vec, step2_vec, ""); + m0_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, s0_vec, zeros); + m1_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, s1_vec, zeros); + m2_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, s2_vec, zeros); + + m = LLVMBuildAnd(builder, m0_vec, m1_vec, ""); + m = LLVMBuildAnd(builder, m, m2_vec, ""); + + lp_build_name(m, "m"); + + *mask = m; +} + + /** * Generate the fragment shader, depth/stencil test, and alpha tests. + * \param i which quad in the tile, in range [0,3] */ static void generate_fs(struct llvmpipe_context *lp, @@ -190,7 +275,13 @@ generate_fs(struct llvmpipe_context *lp, struct lp_build_sampler_soa *sampler, LLVMValueRef *pmask, LLVMValueRef *color, - LLVMValueRef depth_ptr) + LLVMValueRef depth_ptr, + LLVMValueRef c0, + LLVMValueRef c1, + LLVMValueRef c2, + LLVMValueRef step0_ptr, + LLVMValueRef step1_ptr, + LLVMValueRef step2_ptr) { const struct tgsi_token *tokens = shader->base.tokens; LLVMTypeRef elem_type; @@ -205,6 +296,8 @@ generate_fs(struct llvmpipe_context *lp, unsigned attrib; unsigned chan; + assert(i < 4); + elem_type = lp_build_elem_type(type); vec_type = lp_build_vec_type(type); int_vec_type = lp_build_int_vec_type(type); @@ -224,8 +317,13 @@ generate_fs(struct llvmpipe_context *lp, } lp_build_flow_scope_declare(flow, &z); + /* do triangle edge testing */ + generate_tri_edge_mask(builder, i, pmask, + c0, c1, c2, step0_ptr, step1_ptr, step2_ptr); + lp_build_mask_begin(&mask, flow, type, *pmask); + early_depth_test = key->depth.enabled && !key->alpha.enabled && @@ -376,17 +474,18 @@ generate_fragment(struct llvmpipe_context *lp, LLVMTypeRef fs_int_vec_type; LLVMTypeRef blend_vec_type; LLVMTypeRef blend_int_vec_type; - LLVMTypeRef arg_types[9]; + LLVMTypeRef arg_types[14]; LLVMTypeRef func_type; + LLVMTypeRef int32_vec4_type = lp_build_int32_vec4_type(); LLVMValueRef context_ptr; LLVMValueRef x; LLVMValueRef y; LLVMValueRef a0_ptr; LLVMValueRef dadx_ptr; LLVMValueRef dady_ptr; - LLVMValueRef mask_ptr; LLVMValueRef color_ptr; LLVMValueRef depth_ptr; + LLVMValueRef c0, c1, c2, step0_ptr, step1_ptr, step2_ptr; LLVMBasicBlockRef block; LLVMBuilderRef builder; LLVMValueRef x0; @@ -468,9 +567,17 @@ generate_fragment(struct llvmpipe_context *lp, arg_types[3] = LLVMPointerType(fs_elem_type, 0); /* a0 */ arg_types[4] = LLVMPointerType(fs_elem_type, 0); /* dadx */ arg_types[5] = LLVMPointerType(fs_elem_type, 0); /* dady */ - arg_types[6] = LLVMPointerType(fs_int_vec_type, 0); /* mask */ - arg_types[7] = LLVMPointerType(blend_vec_type, 0); /* color */ - arg_types[8] = LLVMPointerType(fs_int_vec_type, 0); /* depth */ + arg_types[6] = LLVMPointerType(blend_vec_type, 0); /* color */ + arg_types[7] = LLVMPointerType(fs_int_vec_type, 0); /* depth */ + arg_types[8] = LLVMInt32Type(); /* c0 */ + arg_types[9] = LLVMInt32Type(); /* c1 */ + arg_types[10] = LLVMInt32Type(); /* c2 */ + /* Note: the step arrays are built as int32[16] but we interpret + * them here as int32_vec4[4]. + */ + arg_types[11] = LLVMPointerType(int32_vec4_type, 0);/* step0 */ + arg_types[12] = LLVMPointerType(int32_vec4_type, 0);/* step1 */ + arg_types[13] = LLVMPointerType(int32_vec4_type, 0);/* step2 */ func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0); @@ -486,9 +593,14 @@ generate_fragment(struct llvmpipe_context *lp, a0_ptr = LLVMGetParam(variant->function, 3); dadx_ptr = LLVMGetParam(variant->function, 4); dady_ptr = LLVMGetParam(variant->function, 5); - mask_ptr = LLVMGetParam(variant->function, 6); - color_ptr = LLVMGetParam(variant->function, 7); - depth_ptr = LLVMGetParam(variant->function, 8); + color_ptr = LLVMGetParam(variant->function, 6); + depth_ptr = LLVMGetParam(variant->function, 7); + c0 = LLVMGetParam(variant->function, 8); + c1 = LLVMGetParam(variant->function, 9); + c2 = LLVMGetParam(variant->function, 10); + step0_ptr = LLVMGetParam(variant->function, 11); + step1_ptr = LLVMGetParam(variant->function, 12); + step2_ptr = LLVMGetParam(variant->function, 13); lp_build_name(context_ptr, "context"); lp_build_name(x, "x"); @@ -496,9 +608,14 @@ generate_fragment(struct llvmpipe_context *lp, lp_build_name(a0_ptr, "a0"); lp_build_name(dadx_ptr, "dadx"); lp_build_name(dady_ptr, "dady"); - lp_build_name(mask_ptr, "mask"); lp_build_name(color_ptr, "color"); lp_build_name(depth_ptr, "depth"); + lp_build_name(c0, "c0"); + lp_build_name(c1, "c1"); + lp_build_name(c2, "c2"); + lp_build_name(step0_ptr, "step0"); + lp_build_name(step1_ptr, "step1"); + lp_build_name(step2_ptr, "step2"); /* * Function body @@ -526,7 +643,6 @@ generate_fragment(struct llvmpipe_context *lp, if(i != 0) lp_build_interp_soa_update(&interp, i); - fs_mask[i] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, mask_ptr, &index, 1, ""), ""); depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &index, 1, ""); generate_fs(lp, shader, key, @@ -536,9 +652,11 @@ generate_fragment(struct llvmpipe_context *lp, i, &interp, sampler, - &fs_mask[i], + &fs_mask[i], /* output */ out_color, - depth_ptr_i); + depth_ptr_i, + c0, c1, c2, + step0_ptr, step1_ptr, step2_ptr); for(chan = 0; chan < NUM_CHANNELS; ++chan) fs_out_color[chan][i] = out_color[chan]; -- cgit v1.2.3