diff options
Diffstat (limited to 'src/gallium/drivers/llvmpipe')
-rw-r--r-- | src/gallium/drivers/llvmpipe/lp_bld_interp.c | 17 | ||||
-rw-r--r-- | src/gallium/drivers/llvmpipe/lp_context.c | 2 | ||||
-rw-r--r-- | src/gallium/drivers/llvmpipe/lp_rast.h | 9 | ||||
-rw-r--r-- | src/gallium/drivers/llvmpipe/lp_rast_tri.c | 190 | ||||
-rw-r--r-- | src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h | 112 | ||||
-rw-r--r-- | src/gallium/drivers/llvmpipe/lp_setup_tri.c | 115 | ||||
-rw-r--r-- | src/gallium/drivers/llvmpipe/lp_tile_soa.py | 29 |
7 files changed, 261 insertions, 213 deletions
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c index 78744da500..2cf6f38c4b 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c @@ -141,7 +141,7 @@ coeffs_init(struct lp_build_interp_soa_context *bld, else { dadx = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""), ""); dady = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dady_ptr, &index, 1, ""), ""); - dadxy = LLVMBuildAdd(builder, dadx, dady, ""); + dadxy = LLVMBuildFAdd(builder, dadx, dady, ""); attrib_name(dadx, attrib, chan, ".dadx"); attrib_name(dady, attrib, chan, ".dady"); attrib_name(dadxy, attrib, chan, ".dadxy"); @@ -177,7 +177,7 @@ coeffs_init(struct lp_build_interp_soa_context *bld, * dadq2 = 2 * dq */ - dadq2 = LLVMBuildAdd(builder, dadq, dadq, ""); + dadq2 = LLVMBuildFAdd(builder, dadq, dadq, ""); /* * a = a0 + x * dadx + y * dady @@ -193,12 +193,11 @@ coeffs_init(struct lp_build_interp_soa_context *bld, a = a0; if (interp != LP_INTERP_CONSTANT && interp != LP_INTERP_FACING) { - a = LLVMBuildAdd(builder, a, - LLVMBuildMul(builder, bld->x, dadx, ""), - ""); - a = LLVMBuildAdd(builder, a, - LLVMBuildMul(builder, bld->y, dady, ""), - ""); + LLVMValueRef tmp; + tmp = LLVMBuildFMul(builder, bld->x, dadx, ""); + a = LLVMBuildFAdd(builder, a, tmp, ""); + tmp = LLVMBuildFMul(builder, bld->y, dady, ""); + a = LLVMBuildFAdd(builder, a, tmp, ""); } } @@ -212,7 +211,7 @@ coeffs_init(struct lp_build_interp_soa_context *bld, * Compute the attrib values on the upper-left corner of each quad. */ - a = LLVMBuildAdd(builder, a, dadq2, ""); + a = LLVMBuildFAdd(builder, a, dadq2, ""); /* * a *= 1 / w diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c index 28793682ed..7543bd7b2b 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.c +++ b/src/gallium/drivers/llvmpipe/lp_context.c @@ -47,7 +47,7 @@ #include "lp_setup.h" -DEBUG_GET_ONCE_BOOL_OPTION(lp_no_rast, "LP_NO_RAST", FALSE); +DEBUG_GET_ONCE_BOOL_OPTION(lp_no_rast, "LP_NO_RAST", FALSE) static void llvmpipe_destroy( struct pipe_context *pipe ) diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index eaf2a6f334..102e902d02 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -104,9 +104,6 @@ struct lp_rast_plane { int dcdx; int dcdy; - - /* edge/step info for 3 edges and 4x4 block of pixels */ - const int *step; }; /** @@ -119,8 +116,6 @@ struct lp_rast_triangle { /* inputs for the shader */ struct lp_rast_shader_inputs inputs; - int step[3][16]; - #ifdef DEBUG float v[3][2]; #endif @@ -261,5 +256,9 @@ void lp_rast_begin_query(struct lp_rasterizer_task *, void lp_rast_end_query(struct lp_rasterizer_task *, const union lp_rast_cmd_arg ); +void +lp_rast_triangle_3_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg); + #endif diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index ebe9a8e92b..673f67386b 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -37,52 +37,6 @@ #include "lp_tile_soa.h" -/** - * Map an index in [0,15] to an x,y position, multiplied by 4. - * This is used to get the position of each subtile in a 4x4 - * grid of edge step values. - * Note: we can use some bit twiddling to compute these values instead - * of using a look-up table, but there's no measurable performance - * difference. - */ -static const int pos_table4[16][2] = { - { 0, 0 }, - { 4, 0 }, - { 0, 4 }, - { 4, 4 }, - { 8, 0 }, - { 12, 0 }, - { 8, 4 }, - { 12, 4 }, - { 0, 8 }, - { 4, 8 }, - { 0, 12 }, - { 4, 12 }, - { 8, 8 }, - { 12, 8 }, - { 8, 12 }, - { 12, 12 } -}; - - -static const int pos_table16[16][2] = { - { 0, 0 }, - { 16, 0 }, - { 0, 16 }, - { 16, 16 }, - { 32, 0 }, - { 48, 0 }, - { 32, 16 }, - { 48, 16 }, - { 0, 32 }, - { 16, 32 }, - { 0, 48 }, - { 16, 48 }, - { 32, 32 }, - { 48, 32 }, - { 32, 48 }, - { 48, 48 } -}; /** @@ -113,6 +67,68 @@ block_full_16(struct lp_rasterizer_task *task, block_full_4(task, tri, x + ix, y + iy); } + +static INLINE unsigned +build_mask(int c, int dcdx, int dcdy) +{ + int mask = 0; + + int c0 = c; + int c1 = c0 + dcdx; + int c2 = c1 + dcdx; + int c3 = c2 + dcdx; + + mask |= ((c0 + 0 * dcdy) >> 31) & (1 << 0); + mask |= ((c0 + 1 * dcdy) >> 31) & (1 << 2); + mask |= ((c0 + 2 * dcdy) >> 31) & (1 << 8); + mask |= ((c0 + 3 * dcdy) >> 31) & (1 << 10); + mask |= ((c1 + 0 * dcdy) >> 31) & (1 << 1); + mask |= ((c1 + 1 * dcdy) >> 31) & (1 << 3); + mask |= ((c1 + 2 * dcdy) >> 31) & (1 << 9); + mask |= ((c1 + 3 * dcdy) >> 31) & (1 << 11); + mask |= ((c2 + 0 * dcdy) >> 31) & (1 << 4); + mask |= ((c2 + 1 * dcdy) >> 31) & (1 << 6); + mask |= ((c2 + 2 * dcdy) >> 31) & (1 << 12); + mask |= ((c2 + 3 * dcdy) >> 31) & (1 << 14); + mask |= ((c3 + 0 * dcdy) >> 31) & (1 << 5); + mask |= ((c3 + 1 * dcdy) >> 31) & (1 << 7); + mask |= ((c3 + 2 * dcdy) >> 31) & (1 << 13); + mask |= ((c3 + 3 * dcdy) >> 31) & (1 << 15); + + return mask; +} + +static INLINE unsigned +build_mask_linear(int c, int dcdx, int dcdy) +{ + int mask = 0; + + int c0 = c; + int c1 = c0 + dcdy; + int c2 = c1 + dcdy; + int c3 = c2 + dcdy; + + mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0); + mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1); + mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2); + mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3); + mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4); + mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5); + mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6); + mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7); + mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8); + mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9); + mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10); + mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11); + mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12); + mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13); + mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14); + mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15); + + return mask; +} + + #define TAG(x) x##_1 #define NR_PLANES 1 #include "lp_rast_tri_tmp.h" @@ -141,3 +157,85 @@ block_full_16(struct lp_rasterizer_task *task, #define NR_PLANES 7 #include "lp_rast_tri_tmp.h" + +/* Special case for 3 plane triangle which is contained entirely + * within a 16x16 block. + */ +void +lp_rast_triangle_3_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + const struct lp_rast_triangle *tri = arg.triangle.tri; + const struct lp_rast_plane *plane = tri->plane; + unsigned mask = arg.triangle.plane_mask; + const int x = task->x + (mask & 0xf) * 16; + const int y = task->y + (mask >> 4) * 16; + unsigned outmask, inmask, partmask, partial_mask; + unsigned j; + int c[3]; + + outmask = 0; /* outside one or more trivial reject planes */ + partmask = 0; /* outside one or more trivial accept planes */ + + for (j = 0; j < 3; j++) { + c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x; + + { + const int dcdx = -plane[j].dcdx * 4; + const int dcdy = plane[j].dcdy * 4; + const int cox = c[j] + plane[j].eo * 4; + const int cio = c[j] + plane[j].ei * 4 - 1; + + outmask |= build_mask_linear(cox, dcdx, dcdy); + partmask |= build_mask_linear(cio, dcdx, dcdy); + } + } + + if (outmask == 0xffff) + return; + + /* Mask of sub-blocks which are inside all trivial accept planes: + */ + inmask = ~partmask & 0xffff; + + /* Mask of sub-blocks which are inside all trivial reject planes, + * but outside at least one trivial accept plane: + */ + partial_mask = partmask & ~outmask; + + assert((partial_mask & inmask) == 0); + + /* Iterate over partials: + */ + while (partial_mask) { + int i = ffs(partial_mask) - 1; + int ix = (i & 3) * 4; + int iy = (i >> 2) * 4; + int px = x + ix; + int py = y + iy; + int cx[3]; + + partial_mask &= ~(1 << i); + + for (j = 0; j < 3; j++) + cx[j] = (c[j] + - plane[j].dcdx * ix + + plane[j].dcdy * iy); + + do_block_4_3(task, tri, plane, px, py, cx); + } + + /* Iterate over fulls: + */ + while (inmask) { + int i = ffs(inmask) - 1; + int ix = (i & 3) * 4; + int iy = (i >> 2) * 4; + int px = x + ix; + int py = y + iy; + + inmask &= ~(1 << i); + + block_full_4(task, tri, px, py); + } +} diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h index a410c611a3..43f72d8ca8 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h @@ -46,19 +46,13 @@ TAG(do_block_4)(struct lp_rasterizer_task *task, int x, int y, const int *c) { - unsigned mask = 0; - int i; + unsigned mask = 0xffff; + int j; - for (i = 0; i < 16; i++) { - int any_negative = 0; - int j; - - for (j = 0; j < NR_PLANES; j++) - any_negative |= (c[j] - 1 + plane[j].step[i]); - - any_negative >>= 31; - - mask |= (~any_negative) & (1 << i); + for (j = 0; j < NR_PLANES; j++) { + mask &= ~build_mask(c[j] - 1, + -plane[j].dcdx, + plane[j].dcdy); } /* Now pass to the shader: @@ -79,24 +73,19 @@ TAG(do_block_16)(struct lp_rasterizer_task *task, const int *c) { unsigned outmask, inmask, partmask, partial_mask; - unsigned i, j; + unsigned j; outmask = 0; /* outside one or more trivial reject planes */ partmask = 0; /* outside one or more trivial accept planes */ for (j = 0; j < NR_PLANES; j++) { - const int *step = plane[j].step; - const int eo = plane[j].eo * 4; - const int ei = plane[j].ei * 4; - const int cox = c[j] + eo; - const int cio = ei - 1 - eo; - - for (i = 0; i < 16; i++) { - int out = cox + step[i] * 4; - int part = out + cio; - outmask |= (out >> 31) & (1 << i); - partmask |= (part >> 31) & (1 << i); - } + const int dcdx = -plane[j].dcdx * 4; + const int dcdy = plane[j].dcdy * 4; + const int cox = c[j] + plane[j].eo * 4; + const int cio = c[j] + plane[j].ei * 4 - 1; + + outmask |= build_mask_linear(cox, dcdx, dcdy); + partmask |= build_mask_linear(cio, dcdx, dcdy); } if (outmask == 0xffff) @@ -117,15 +106,19 @@ TAG(do_block_16)(struct lp_rasterizer_task *task, */ while (partial_mask) { int i = ffs(partial_mask) - 1; - int px = x + pos_table4[i][0]; - int py = y + pos_table4[i][1]; + int ix = (i & 3) * 4; + int iy = (i >> 2) * 4; + int px = x + ix; + int py = y + iy; int cx[NR_PLANES]; - for (j = 0; j < NR_PLANES; j++) - cx[j] = c[j] + plane[j].step[i] * 4; - partial_mask &= ~(1 << i); + for (j = 0; j < NR_PLANES; j++) + cx[j] = (c[j] + - plane[j].dcdx * ix + + plane[j].dcdy * iy); + TAG(do_block_4)(task, tri, plane, px, py, cx); } @@ -133,8 +126,10 @@ TAG(do_block_16)(struct lp_rasterizer_task *task, */ while (inmask) { int i = ffs(inmask) - 1; - int px = x + pos_table4[i][0]; - int py = y + pos_table4[i][1]; + int ix = (i & 3) * 4; + int iy = (i >> 2) * 4; + int px = x + ix; + int py = y + iy; inmask &= ~(1 << i); @@ -157,35 +152,28 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, struct lp_rast_plane plane[NR_PLANES]; int c[NR_PLANES]; unsigned outmask, inmask, partmask, partial_mask; - unsigned i, j, nr_planes = 0; + unsigned j = 0; + + outmask = 0; /* outside one or more trivial reject planes */ + partmask = 0; /* outside one or more trivial accept planes */ while (plane_mask) { int i = ffs(plane_mask) - 1; - plane[nr_planes] = tri->plane[i]; + plane[j] = tri->plane[i]; plane_mask &= ~(1 << i); - nr_planes++; - }; - - assert(nr_planes == NR_PLANES); - outmask = 0; /* outside one or more trivial reject planes */ - partmask = 0; /* outside one or more trivial accept planes */ + c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x; - for (j = 0; j < NR_PLANES; j++) { - const int *step = plane[j].step; - const int eo = plane[j].eo * 16; - const int ei = plane[j].ei * 16; - int cox, cio; + { + const int dcdx = -plane[j].dcdx * 16; + const int dcdy = plane[j].dcdy * 16; + const int cox = c[j] + plane[j].eo * 16; + const int cio = c[j] + plane[j].ei * 16 - 1; - c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x; - cox = c[j] + eo; - cio = ei - 1 - eo; - - for (i = 0; i < 16; i++) { - int out = cox + step[i] * 16; - int part = out + cio; - outmask |= (out >> 31) & (1 << i); - partmask |= (part >> 31) & (1 << i); + outmask |= build_mask_linear(cox, dcdx, dcdy); + partmask |= build_mask_linear(cio, dcdx, dcdy); } + + j++; } if (outmask == 0xffff) @@ -206,12 +194,16 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, */ while (partial_mask) { int i = ffs(partial_mask) - 1; - int px = x + pos_table16[i][0]; - int py = y + pos_table16[i][1]; + int ix = (i & 3) * 16; + int iy = (i >> 2) * 16; + int px = x + ix; + int py = y + iy; int cx[NR_PLANES]; for (j = 0; j < NR_PLANES; j++) - cx[j] = c[j] + plane[j].step[i] * 16; + cx[j] = (c[j] + - plane[j].dcdx * ix + + plane[j].dcdy * iy); partial_mask &= ~(1 << i); @@ -223,8 +215,10 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, */ while (inmask) { int i = ffs(inmask) - 1; - int px = x + pos_table16[i][0]; - int py = y + pos_table16[i][1]; + int ix = (i & 3) * 16; + int iy = (i >> 2) * 16; + int px = x + ix; + int py = y + iy; inmask &= ~(1 << i); diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index 7e432503c1..614a6372b4 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -61,36 +61,6 @@ struct tri_info { -static const int step_scissor_minx[16] = { - 0, 1, 0, 1, - 2, 3, 2, 3, - 0, 1, 0, 1, - 2, 3, 2, 3 -}; - -static const int step_scissor_maxx[16] = { - 0, -1, 0, -1, - -2, -3, -2, -3, - 0, -1, 0, -1, - -2, -3, -2, -3 -}; - -static const int step_scissor_miny[16] = { - 0, 0, 1, 1, - 0, 0, 1, 1, - 2, 2, 3, 3, - 2, 2, 3, 3 -}; - -static const int step_scissor_maxy[16] = { - 0, 0, -1, -1, - 0, 0, -1, -1, - -2, -2, -3, -3, - -2, -2, -3, -3 -}; - - - static INLINE int subpixel_snap(float a) @@ -260,13 +230,13 @@ static void setup_tri_coefficients( struct lp_setup_context *setup, { unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; unsigned slot; + unsigned i; /* setup interpolation for all the remaining attributes: */ for (slot = 0; slot < setup->fs.nr_inputs; slot++) { unsigned vert_attr = setup->fs.input[slot].src_index; unsigned usage_mask = setup->fs.input[slot].usage_mask; - unsigned i; switch (setup->fs.input[slot].interp) { case LP_INTERP_CONSTANT: @@ -316,6 +286,34 @@ static void setup_tri_coefficients( struct lp_setup_context *setup, /* The internal position input is in slot zero: */ setup_fragcoord_coef(tri, info, 0, fragcoord_usage_mask); + + if (0) { + for (i = 0; i < NUM_CHANNELS; i++) { + float a0 = tri->inputs.a0 [0][i]; + float dadx = tri->inputs.dadx[0][i]; + float dady = tri->inputs.dady[0][i]; + + debug_printf("POS.%c: a0 = %f, dadx = %f, dady = %f\n", + "xyzw"[i], + a0, dadx, dady); + } + + for (slot = 0; slot < setup->fs.nr_inputs; slot++) { + unsigned usage_mask = setup->fs.input[slot].usage_mask; + for (i = 0; i < NUM_CHANNELS; i++) { + if (usage_mask & (1 << i)) { + float a0 = tri->inputs.a0 [1 + slot][i]; + float dadx = tri->inputs.dadx[1 + slot][i]; + float dady = tri->inputs.dady[1 + slot][i]; + + debug_printf("IN[%u].%c: a0 = %f, dadx = %f, dady = %f\n", + slot, + "xyzw"[i], + a0, dadx, dady); + } + } + } + } } @@ -525,7 +523,7 @@ do_triangle_ccw(struct lp_setup_context *setup, info.dx20 = info.v2[0][0] - info.v0[0][0]; info.dy01 = info.v0[0][1] - info.v1[0][1]; info.dy20 = info.v2[0][1] - info.v0[0][1]; - info.oneoverarea = 1.0 / (info.dx01 * info.dy20 - info.dx20 * info.dy01); + info.oneoverarea = 1.0f / (info.dx01 * info.dy20 - info.dx20 * info.dy01); info.frontfacing = frontfacing; /* Setup parameter interpolants: @@ -590,35 +588,6 @@ do_triangle_ccw(struct lp_setup_context *setup, /* Calculate trivial accept offsets from the above. */ plane->ei = plane->dcdy - plane->dcdx - plane->eo; - - plane->step = tri->step[i]; - - /* Fill in the inputs.step[][] arrays. - * We've manually unrolled some loops here. - */ -#define SETUP_STEP(j, x, y) \ - tri->step[i][j] = y * plane->dcdy - x * plane->dcdx - - SETUP_STEP(0, 0, 0); - SETUP_STEP(1, 1, 0); - SETUP_STEP(2, 0, 1); - SETUP_STEP(3, 1, 1); - - SETUP_STEP(4, 2, 0); - SETUP_STEP(5, 3, 0); - SETUP_STEP(6, 2, 1); - SETUP_STEP(7, 3, 1); - - SETUP_STEP(8, 0, 2); - SETUP_STEP(9, 1, 2); - SETUP_STEP(10, 0, 3); - SETUP_STEP(11, 1, 3); - - SETUP_STEP(12, 2, 2); - SETUP_STEP(13, 3, 2); - SETUP_STEP(14, 2, 3); - SETUP_STEP(15, 3, 3); -#undef STEP } @@ -641,28 +610,24 @@ do_triangle_ccw(struct lp_setup_context *setup, * these planes elsewhere. */ if (nr_planes == 7) { - tri->plane[3].step = step_scissor_minx; tri->plane[3].dcdx = -1; tri->plane[3].dcdy = 0; tri->plane[3].c = 1-minx; tri->plane[3].ei = 0; tri->plane[3].eo = 1; - tri->plane[4].step = step_scissor_maxx; tri->plane[4].dcdx = 1; tri->plane[4].dcdy = 0; tri->plane[4].c = maxx; tri->plane[4].ei = -1; tri->plane[4].eo = 0; - tri->plane[5].step = step_scissor_miny; tri->plane[5].dcdx = 0; tri->plane[5].dcdy = 1; tri->plane[5].c = 1-miny; tri->plane[5].ei = 0; tri->plane[5].eo = 1; - tri->plane[6].step = step_scissor_maxy; tri->plane[6].dcdx = 0; tri->plane[6].dcdy = -1; tri->plane[6].c = maxy; @@ -678,6 +643,26 @@ do_triangle_ccw(struct lp_setup_context *setup, /* Convert to tile coordinates, and inclusive ranges: */ + if (nr_planes == 3) { + int ix0 = minx / 16; + int iy0 = miny / 16; + int ix1 = (maxx-1) / 16; + int iy1 = (maxy-1) / 16; + + if (iy0 == iy1 && ix0 == ix1) + { + + /* Triangle is contained in a single 16x16 block: + */ + int mask = (ix0 & 3) | ((iy0 & 3) << 4); + + lp_scene_bin_command( scene, ix0/4, iy0/4, + lp_rast_triangle_3_16, + lp_rast_arg_triangle(tri, mask) ); + return; + } + } + ix0 = minx / TILE_SIZE; iy0 = miny / TILE_SIZE; ix1 = (maxx-1) / TILE_SIZE; diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.py b/src/gallium/drivers/llvmpipe/lp_tile_soa.py index c71ec8066c..2ba39052ab 100644 --- a/src/gallium/drivers/llvmpipe/lp_tile_soa.py +++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.py @@ -293,34 +293,7 @@ def generate_ssse3(): print ''' #if defined(PIPE_ARCH_SSE) - -#if defined(PIPE_ARCH_SSSE3) - -#include <tmmintrin.h> - -#else - -#include <emmintrin.h> - -/** - * Describe _mm_shuffle_epi8() with gcc extended inline assembly, for cases - * where -mssse3 is not supported/enabled. - * - * MSVC will never get in here as its intrinsics support do not rely on - * compiler command line options. - */ -static __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_shuffle_epi8(__m128i a, __m128i mask) -{ - __m128i result; - __asm__("pshufb %1, %0" - : "=x" (result) - : "xm" (mask), "0" (a)); - return result; -} - -#endif - +#include "util/u_sse.h" static void lp_tile_b8g8r8a8_unorm_swizzle_4ub_ssse3(uint8_t *dst, |