summaryrefslogtreecommitdiff
path: root/src/gallium/drivers/llvmpipe
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/llvmpipe')
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_interp.c17
-rw-r--r--src/gallium/drivers/llvmpipe/lp_context.c2
-rw-r--r--src/gallium/drivers/llvmpipe/lp_rast.h9
-rw-r--r--src/gallium/drivers/llvmpipe/lp_rast_tri.c190
-rw-r--r--src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h112
-rw-r--r--src/gallium/drivers/llvmpipe/lp_setup_tri.c115
-rw-r--r--src/gallium/drivers/llvmpipe/lp_tile_soa.py29
7 files changed, 261 insertions, 213 deletions
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index 78744da500..2cf6f38c4b 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -141,7 +141,7 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
else {
dadx = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""), "");
dady = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dady_ptr, &index, 1, ""), "");
- dadxy = LLVMBuildAdd(builder, dadx, dady, "");
+ dadxy = LLVMBuildFAdd(builder, dadx, dady, "");
attrib_name(dadx, attrib, chan, ".dadx");
attrib_name(dady, attrib, chan, ".dady");
attrib_name(dadxy, attrib, chan, ".dadxy");
@@ -177,7 +177,7 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
* dadq2 = 2 * dq
*/
- dadq2 = LLVMBuildAdd(builder, dadq, dadq, "");
+ dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
/*
* a = a0 + x * dadx + y * dady
@@ -193,12 +193,11 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
a = a0;
if (interp != LP_INTERP_CONSTANT &&
interp != LP_INTERP_FACING) {
- a = LLVMBuildAdd(builder, a,
- LLVMBuildMul(builder, bld->x, dadx, ""),
- "");
- a = LLVMBuildAdd(builder, a,
- LLVMBuildMul(builder, bld->y, dady, ""),
- "");
+ LLVMValueRef tmp;
+ tmp = LLVMBuildFMul(builder, bld->x, dadx, "");
+ a = LLVMBuildFAdd(builder, a, tmp, "");
+ tmp = LLVMBuildFMul(builder, bld->y, dady, "");
+ a = LLVMBuildFAdd(builder, a, tmp, "");
}
}
@@ -212,7 +211,7 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
* Compute the attrib values on the upper-left corner of each quad.
*/
- a = LLVMBuildAdd(builder, a, dadq2, "");
+ a = LLVMBuildFAdd(builder, a, dadq2, "");
/*
* a *= 1 / w
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 28793682ed..7543bd7b2b 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -47,7 +47,7 @@
#include "lp_setup.h"
-DEBUG_GET_ONCE_BOOL_OPTION(lp_no_rast, "LP_NO_RAST", FALSE);
+DEBUG_GET_ONCE_BOOL_OPTION(lp_no_rast, "LP_NO_RAST", FALSE)
static void llvmpipe_destroy( struct pipe_context *pipe )
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index eaf2a6f334..102e902d02 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -104,9 +104,6 @@ struct lp_rast_plane {
int dcdx;
int dcdy;
-
- /* edge/step info for 3 edges and 4x4 block of pixels */
- const int *step;
};
/**
@@ -119,8 +116,6 @@ struct lp_rast_triangle {
/* inputs for the shader */
struct lp_rast_shader_inputs inputs;
- int step[3][16];
-
#ifdef DEBUG
float v[3][2];
#endif
@@ -261,5 +256,9 @@ void lp_rast_begin_query(struct lp_rasterizer_task *,
void lp_rast_end_query(struct lp_rasterizer_task *,
const union lp_rast_cmd_arg );
+void
+lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
+ const union lp_rast_cmd_arg arg);
+
#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index ebe9a8e92b..673f67386b 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -37,52 +37,6 @@
#include "lp_tile_soa.h"
-/**
- * Map an index in [0,15] to an x,y position, multiplied by 4.
- * This is used to get the position of each subtile in a 4x4
- * grid of edge step values.
- * Note: we can use some bit twiddling to compute these values instead
- * of using a look-up table, but there's no measurable performance
- * difference.
- */
-static const int pos_table4[16][2] = {
- { 0, 0 },
- { 4, 0 },
- { 0, 4 },
- { 4, 4 },
- { 8, 0 },
- { 12, 0 },
- { 8, 4 },
- { 12, 4 },
- { 0, 8 },
- { 4, 8 },
- { 0, 12 },
- { 4, 12 },
- { 8, 8 },
- { 12, 8 },
- { 8, 12 },
- { 12, 12 }
-};
-
-
-static const int pos_table16[16][2] = {
- { 0, 0 },
- { 16, 0 },
- { 0, 16 },
- { 16, 16 },
- { 32, 0 },
- { 48, 0 },
- { 32, 16 },
- { 48, 16 },
- { 0, 32 },
- { 16, 32 },
- { 0, 48 },
- { 16, 48 },
- { 32, 32 },
- { 48, 32 },
- { 32, 48 },
- { 48, 48 }
-};
/**
@@ -113,6 +67,68 @@ block_full_16(struct lp_rasterizer_task *task,
block_full_4(task, tri, x + ix, y + iy);
}
+
+static INLINE unsigned
+build_mask(int c, int dcdx, int dcdy)
+{
+ int mask = 0;
+
+ int c0 = c;
+ int c1 = c0 + dcdx;
+ int c2 = c1 + dcdx;
+ int c3 = c2 + dcdx;
+
+ mask |= ((c0 + 0 * dcdy) >> 31) & (1 << 0);
+ mask |= ((c0 + 1 * dcdy) >> 31) & (1 << 2);
+ mask |= ((c0 + 2 * dcdy) >> 31) & (1 << 8);
+ mask |= ((c0 + 3 * dcdy) >> 31) & (1 << 10);
+ mask |= ((c1 + 0 * dcdy) >> 31) & (1 << 1);
+ mask |= ((c1 + 1 * dcdy) >> 31) & (1 << 3);
+ mask |= ((c1 + 2 * dcdy) >> 31) & (1 << 9);
+ mask |= ((c1 + 3 * dcdy) >> 31) & (1 << 11);
+ mask |= ((c2 + 0 * dcdy) >> 31) & (1 << 4);
+ mask |= ((c2 + 1 * dcdy) >> 31) & (1 << 6);
+ mask |= ((c2 + 2 * dcdy) >> 31) & (1 << 12);
+ mask |= ((c2 + 3 * dcdy) >> 31) & (1 << 14);
+ mask |= ((c3 + 0 * dcdy) >> 31) & (1 << 5);
+ mask |= ((c3 + 1 * dcdy) >> 31) & (1 << 7);
+ mask |= ((c3 + 2 * dcdy) >> 31) & (1 << 13);
+ mask |= ((c3 + 3 * dcdy) >> 31) & (1 << 15);
+
+ return mask;
+}
+
+static INLINE unsigned
+build_mask_linear(int c, int dcdx, int dcdy)
+{
+ int mask = 0;
+
+ int c0 = c;
+ int c1 = c0 + dcdy;
+ int c2 = c1 + dcdy;
+ int c3 = c2 + dcdy;
+
+ mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
+ mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
+ mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
+ mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
+ mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
+ mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
+ mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
+ mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
+ mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
+ mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
+ mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
+ mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
+ mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
+ mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
+ mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
+ mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);
+
+ return mask;
+}
+
+
#define TAG(x) x##_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"
@@ -141,3 +157,85 @@ block_full_16(struct lp_rasterizer_task *task,
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"
+
+/* Special case for 3 plane triangle which is contained entirely
+ * within a 16x16 block.
+ */
+void
+lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
+ const union lp_rast_cmd_arg arg)
+{
+ const struct lp_rast_triangle *tri = arg.triangle.tri;
+ const struct lp_rast_plane *plane = tri->plane;
+ unsigned mask = arg.triangle.plane_mask;
+ const int x = task->x + (mask & 0xf) * 16;
+ const int y = task->y + (mask >> 4) * 16;
+ unsigned outmask, inmask, partmask, partial_mask;
+ unsigned j;
+ int c[3];
+
+ outmask = 0; /* outside one or more trivial reject planes */
+ partmask = 0; /* outside one or more trivial accept planes */
+
+ for (j = 0; j < 3; j++) {
+ c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
+
+ {
+ const int dcdx = -plane[j].dcdx * 4;
+ const int dcdy = plane[j].dcdy * 4;
+ const int cox = c[j] + plane[j].eo * 4;
+ const int cio = c[j] + plane[j].ei * 4 - 1;
+
+ outmask |= build_mask_linear(cox, dcdx, dcdy);
+ partmask |= build_mask_linear(cio, dcdx, dcdy);
+ }
+ }
+
+ if (outmask == 0xffff)
+ return;
+
+ /* Mask of sub-blocks which are inside all trivial accept planes:
+ */
+ inmask = ~partmask & 0xffff;
+
+ /* Mask of sub-blocks which are inside all trivial reject planes,
+ * but outside at least one trivial accept plane:
+ */
+ partial_mask = partmask & ~outmask;
+
+ assert((partial_mask & inmask) == 0);
+
+ /* Iterate over partials:
+ */
+ while (partial_mask) {
+ int i = ffs(partial_mask) - 1;
+ int ix = (i & 3) * 4;
+ int iy = (i >> 2) * 4;
+ int px = x + ix;
+ int py = y + iy;
+ int cx[3];
+
+ partial_mask &= ~(1 << i);
+
+ for (j = 0; j < 3; j++)
+ cx[j] = (c[j]
+ - plane[j].dcdx * ix
+ + plane[j].dcdy * iy);
+
+ do_block_4_3(task, tri, plane, px, py, cx);
+ }
+
+ /* Iterate over fulls:
+ */
+ while (inmask) {
+ int i = ffs(inmask) - 1;
+ int ix = (i & 3) * 4;
+ int iy = (i >> 2) * 4;
+ int px = x + ix;
+ int py = y + iy;
+
+ inmask &= ~(1 << i);
+
+ block_full_4(task, tri, px, py);
+ }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index a410c611a3..43f72d8ca8 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -46,19 +46,13 @@ TAG(do_block_4)(struct lp_rasterizer_task *task,
int x, int y,
const int *c)
{
- unsigned mask = 0;
- int i;
+ unsigned mask = 0xffff;
+ int j;
- for (i = 0; i < 16; i++) {
- int any_negative = 0;
- int j;
-
- for (j = 0; j < NR_PLANES; j++)
- any_negative |= (c[j] - 1 + plane[j].step[i]);
-
- any_negative >>= 31;
-
- mask |= (~any_negative) & (1 << i);
+ for (j = 0; j < NR_PLANES; j++) {
+ mask &= ~build_mask(c[j] - 1,
+ -plane[j].dcdx,
+ plane[j].dcdy);
}
/* Now pass to the shader:
@@ -79,24 +73,19 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
const int *c)
{
unsigned outmask, inmask, partmask, partial_mask;
- unsigned i, j;
+ unsigned j;
outmask = 0; /* outside one or more trivial reject planes */
partmask = 0; /* outside one or more trivial accept planes */
for (j = 0; j < NR_PLANES; j++) {
- const int *step = plane[j].step;
- const int eo = plane[j].eo * 4;
- const int ei = plane[j].ei * 4;
- const int cox = c[j] + eo;
- const int cio = ei - 1 - eo;
-
- for (i = 0; i < 16; i++) {
- int out = cox + step[i] * 4;
- int part = out + cio;
- outmask |= (out >> 31) & (1 << i);
- partmask |= (part >> 31) & (1 << i);
- }
+ const int dcdx = -plane[j].dcdx * 4;
+ const int dcdy = plane[j].dcdy * 4;
+ const int cox = c[j] + plane[j].eo * 4;
+ const int cio = c[j] + plane[j].ei * 4 - 1;
+
+ outmask |= build_mask_linear(cox, dcdx, dcdy);
+ partmask |= build_mask_linear(cio, dcdx, dcdy);
}
if (outmask == 0xffff)
@@ -117,15 +106,19 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
*/
while (partial_mask) {
int i = ffs(partial_mask) - 1;
- int px = x + pos_table4[i][0];
- int py = y + pos_table4[i][1];
+ int ix = (i & 3) * 4;
+ int iy = (i >> 2) * 4;
+ int px = x + ix;
+ int py = y + iy;
int cx[NR_PLANES];
- for (j = 0; j < NR_PLANES; j++)
- cx[j] = c[j] + plane[j].step[i] * 4;
-
partial_mask &= ~(1 << i);
+ for (j = 0; j < NR_PLANES; j++)
+ cx[j] = (c[j]
+ - plane[j].dcdx * ix
+ + plane[j].dcdy * iy);
+
TAG(do_block_4)(task, tri, plane, px, py, cx);
}
@@ -133,8 +126,10 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
*/
while (inmask) {
int i = ffs(inmask) - 1;
- int px = x + pos_table4[i][0];
- int py = y + pos_table4[i][1];
+ int ix = (i & 3) * 4;
+ int iy = (i >> 2) * 4;
+ int px = x + ix;
+ int py = y + iy;
inmask &= ~(1 << i);
@@ -157,35 +152,28 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
struct lp_rast_plane plane[NR_PLANES];
int c[NR_PLANES];
unsigned outmask, inmask, partmask, partial_mask;
- unsigned i, j, nr_planes = 0;
+ unsigned j = 0;
+
+ outmask = 0; /* outside one or more trivial reject planes */
+ partmask = 0; /* outside one or more trivial accept planes */
while (plane_mask) {
int i = ffs(plane_mask) - 1;
- plane[nr_planes] = tri->plane[i];
+ plane[j] = tri->plane[i];
plane_mask &= ~(1 << i);
- nr_planes++;
- };
-
- assert(nr_planes == NR_PLANES);
- outmask = 0; /* outside one or more trivial reject planes */
- partmask = 0; /* outside one or more trivial accept planes */
+ c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
- for (j = 0; j < NR_PLANES; j++) {
- const int *step = plane[j].step;
- const int eo = plane[j].eo * 16;
- const int ei = plane[j].ei * 16;
- int cox, cio;
+ {
+ const int dcdx = -plane[j].dcdx * 16;
+ const int dcdy = plane[j].dcdy * 16;
+ const int cox = c[j] + plane[j].eo * 16;
+ const int cio = c[j] + plane[j].ei * 16 - 1;
- c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
- cox = c[j] + eo;
- cio = ei - 1 - eo;
-
- for (i = 0; i < 16; i++) {
- int out = cox + step[i] * 16;
- int part = out + cio;
- outmask |= (out >> 31) & (1 << i);
- partmask |= (part >> 31) & (1 << i);
+ outmask |= build_mask_linear(cox, dcdx, dcdy);
+ partmask |= build_mask_linear(cio, dcdx, dcdy);
}
+
+ j++;
}
if (outmask == 0xffff)
@@ -206,12 +194,16 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
*/
while (partial_mask) {
int i = ffs(partial_mask) - 1;
- int px = x + pos_table16[i][0];
- int py = y + pos_table16[i][1];
+ int ix = (i & 3) * 16;
+ int iy = (i >> 2) * 16;
+ int px = x + ix;
+ int py = y + iy;
int cx[NR_PLANES];
for (j = 0; j < NR_PLANES; j++)
- cx[j] = c[j] + plane[j].step[i] * 16;
+ cx[j] = (c[j]
+ - plane[j].dcdx * ix
+ + plane[j].dcdy * iy);
partial_mask &= ~(1 << i);
@@ -223,8 +215,10 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
*/
while (inmask) {
int i = ffs(inmask) - 1;
- int px = x + pos_table16[i][0];
- int py = y + pos_table16[i][1];
+ int ix = (i & 3) * 16;
+ int iy = (i >> 2) * 16;
+ int px = x + ix;
+ int py = y + iy;
inmask &= ~(1 << i);
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 7e432503c1..614a6372b4 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -61,36 +61,6 @@ struct tri_info {
-static const int step_scissor_minx[16] = {
- 0, 1, 0, 1,
- 2, 3, 2, 3,
- 0, 1, 0, 1,
- 2, 3, 2, 3
-};
-
-static const int step_scissor_maxx[16] = {
- 0, -1, 0, -1,
- -2, -3, -2, -3,
- 0, -1, 0, -1,
- -2, -3, -2, -3
-};
-
-static const int step_scissor_miny[16] = {
- 0, 0, 1, 1,
- 0, 0, 1, 1,
- 2, 2, 3, 3,
- 2, 2, 3, 3
-};
-
-static const int step_scissor_maxy[16] = {
- 0, 0, -1, -1,
- 0, 0, -1, -1,
- -2, -2, -3, -3,
- -2, -2, -3, -3
-};
-
-
-
static INLINE int
subpixel_snap(float a)
@@ -260,13 +230,13 @@ static void setup_tri_coefficients( struct lp_setup_context *setup,
{
unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
unsigned slot;
+ unsigned i;
/* setup interpolation for all the remaining attributes:
*/
for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
unsigned vert_attr = setup->fs.input[slot].src_index;
unsigned usage_mask = setup->fs.input[slot].usage_mask;
- unsigned i;
switch (setup->fs.input[slot].interp) {
case LP_INTERP_CONSTANT:
@@ -316,6 +286,34 @@ static void setup_tri_coefficients( struct lp_setup_context *setup,
/* The internal position input is in slot zero:
*/
setup_fragcoord_coef(tri, info, 0, fragcoord_usage_mask);
+
+ if (0) {
+ for (i = 0; i < NUM_CHANNELS; i++) {
+ float a0 = tri->inputs.a0 [0][i];
+ float dadx = tri->inputs.dadx[0][i];
+ float dady = tri->inputs.dady[0][i];
+
+ debug_printf("POS.%c: a0 = %f, dadx = %f, dady = %f\n",
+ "xyzw"[i],
+ a0, dadx, dady);
+ }
+
+ for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
+ unsigned usage_mask = setup->fs.input[slot].usage_mask;
+ for (i = 0; i < NUM_CHANNELS; i++) {
+ if (usage_mask & (1 << i)) {
+ float a0 = tri->inputs.a0 [1 + slot][i];
+ float dadx = tri->inputs.dadx[1 + slot][i];
+ float dady = tri->inputs.dady[1 + slot][i];
+
+ debug_printf("IN[%u].%c: a0 = %f, dadx = %f, dady = %f\n",
+ slot,
+ "xyzw"[i],
+ a0, dadx, dady);
+ }
+ }
+ }
+ }
}
@@ -525,7 +523,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
info.dx20 = info.v2[0][0] - info.v0[0][0];
info.dy01 = info.v0[0][1] - info.v1[0][1];
info.dy20 = info.v2[0][1] - info.v0[0][1];
- info.oneoverarea = 1.0 / (info.dx01 * info.dy20 - info.dx20 * info.dy01);
+ info.oneoverarea = 1.0f / (info.dx01 * info.dy20 - info.dx20 * info.dy01);
info.frontfacing = frontfacing;
/* Setup parameter interpolants:
@@ -590,35 +588,6 @@ do_triangle_ccw(struct lp_setup_context *setup,
/* Calculate trivial accept offsets from the above.
*/
plane->ei = plane->dcdy - plane->dcdx - plane->eo;
-
- plane->step = tri->step[i];
-
- /* Fill in the inputs.step[][] arrays.
- * We've manually unrolled some loops here.
- */
-#define SETUP_STEP(j, x, y) \
- tri->step[i][j] = y * plane->dcdy - x * plane->dcdx
-
- SETUP_STEP(0, 0, 0);
- SETUP_STEP(1, 1, 0);
- SETUP_STEP(2, 0, 1);
- SETUP_STEP(3, 1, 1);
-
- SETUP_STEP(4, 2, 0);
- SETUP_STEP(5, 3, 0);
- SETUP_STEP(6, 2, 1);
- SETUP_STEP(7, 3, 1);
-
- SETUP_STEP(8, 0, 2);
- SETUP_STEP(9, 1, 2);
- SETUP_STEP(10, 0, 3);
- SETUP_STEP(11, 1, 3);
-
- SETUP_STEP(12, 2, 2);
- SETUP_STEP(13, 3, 2);
- SETUP_STEP(14, 2, 3);
- SETUP_STEP(15, 3, 3);
-#undef STEP
}
@@ -641,28 +610,24 @@ do_triangle_ccw(struct lp_setup_context *setup,
* these planes elsewhere.
*/
if (nr_planes == 7) {
- tri->plane[3].step = step_scissor_minx;
tri->plane[3].dcdx = -1;
tri->plane[3].dcdy = 0;
tri->plane[3].c = 1-minx;
tri->plane[3].ei = 0;
tri->plane[3].eo = 1;
- tri->plane[4].step = step_scissor_maxx;
tri->plane[4].dcdx = 1;
tri->plane[4].dcdy = 0;
tri->plane[4].c = maxx;
tri->plane[4].ei = -1;
tri->plane[4].eo = 0;
- tri->plane[5].step = step_scissor_miny;
tri->plane[5].dcdx = 0;
tri->plane[5].dcdy = 1;
tri->plane[5].c = 1-miny;
tri->plane[5].ei = 0;
tri->plane[5].eo = 1;
- tri->plane[6].step = step_scissor_maxy;
tri->plane[6].dcdx = 0;
tri->plane[6].dcdy = -1;
tri->plane[6].c = maxy;
@@ -678,6 +643,26 @@ do_triangle_ccw(struct lp_setup_context *setup,
/* Convert to tile coordinates, and inclusive ranges:
*/
+ if (nr_planes == 3) {
+ int ix0 = minx / 16;
+ int iy0 = miny / 16;
+ int ix1 = (maxx-1) / 16;
+ int iy1 = (maxy-1) / 16;
+
+ if (iy0 == iy1 && ix0 == ix1)
+ {
+
+ /* Triangle is contained in a single 16x16 block:
+ */
+ int mask = (ix0 & 3) | ((iy0 & 3) << 4);
+
+ lp_scene_bin_command( scene, ix0/4, iy0/4,
+ lp_rast_triangle_3_16,
+ lp_rast_arg_triangle(tri, mask) );
+ return;
+ }
+ }
+
ix0 = minx / TILE_SIZE;
iy0 = miny / TILE_SIZE;
ix1 = (maxx-1) / TILE_SIZE;
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.py b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
index c71ec8066c..2ba39052ab 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_soa.py
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
@@ -293,34 +293,7 @@ def generate_ssse3():
print '''
#if defined(PIPE_ARCH_SSE)
-
-#if defined(PIPE_ARCH_SSSE3)
-
-#include <tmmintrin.h>
-
-#else
-
-#include <emmintrin.h>
-
-/**
- * Describe _mm_shuffle_epi8() with gcc extended inline assembly, for cases
- * where -mssse3 is not supported/enabled.
- *
- * MSVC will never get in here as its intrinsics support do not rely on
- * compiler command line options.
- */
-static __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_shuffle_epi8(__m128i a, __m128i mask)
-{
- __m128i result;
- __asm__("pshufb %1, %0"
- : "=x" (result)
- : "xm" (mask), "0" (a));
- return result;
-}
-
-#endif
-
+#include "util/u_sse.h"
static void
lp_tile_b8g8r8a8_unorm_swizzle_4ub_ssse3(uint8_t *dst,