| author | Keith Whitwell <keithw@vmware.com> | 2010-10-11 16:30:14 +0100 | 
|---|---|---|
| committer | Keith Whitwell <keithw@vmware.com> | 2010-10-12 11:50:07 +0100 | 
| commit | 2cf98d5a6dccba3fd69b8469e67f66dfb5fc9651 (patch) | |
| tree | 1165147c536688689eaf994975cdba5836a3b63a /src/gallium/drivers | |
| parent | 4cb3b4ced80891ce8760cf5a0c06db9dbee36b76 (diff) | |
llvmpipe: try to do more of rast_tri_3_16 with intrinsics
There was actually a large quantity of scalar code in these functions
previously.  This tries to move more into intrinsics.
Introduce an sse2 mm_mullo_epi32 replacement to avoid sse4 dependency
in the new rasterization code.
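The emulation relies on `_mm_mul_epu32`, which only multiplies dwords 0 and 2: the odd dwords are shifted down into the even lanes, multiplied separately, and the low 32 bits of the two results are interleaved back together. As a standalone illustration of that technique only (not part of this patch — the helper name `emulate_mullo_epi32` and the test values are invented for this sketch), a minimal SSE2-only check against scalar multiplication might look like this:

```c
/* Standalone sketch: SSE2 emulation of a packed 32-bit multiply using the
 * same _mm_mul_epu32 trick as the patch, checked against scalar results.
 * Build with e.g. "gcc -msse2 -O2" (hypothetical usage, not part of Mesa).
 */
#include <emmintrin.h>
#include <stdio.h>

static __m128i
emulate_mullo_epi32(__m128i a, __m128i b)
{
   __m128i a13    = _mm_srli_si128(a, 4);    /* move dwords 1,3 into even lanes */
   __m128i b13    = _mm_srli_si128(b, 4);
   __m128i prod02 = _mm_mul_epu32(a, b);     /* 64-bit products of dwords 0,2 */
   __m128i prod13 = _mm_mul_epu32(a13, b13); /* 64-bit products of dwords 1,3 */
   __m128i lo32   = _mm_setr_epi32(~0, 0, ~0, 0);

   /* Keep the low 32 bits of each product and interleave them back. */
   return _mm_or_si128(_mm_and_si128(prod02, lo32),
                       _mm_slli_si128(_mm_and_si128(prod13, lo32), 4));
}

int
main(void)
{
   int a[4] = { 3, -7, 11, 100000 };  /* dcdx/dcdy-like step values (made up) */
   int b[4] = { 64, 64, 64, 64 };     /* positive operand, as in the patch */
   int r[4];
   int i;

   __m128i va = _mm_loadu_si128((const __m128i *)a);
   __m128i vb = _mm_loadu_si128((const __m128i *)b);
   _mm_storeu_si128((__m128i *)r, emulate_mullo_epi32(va, vb));

   for (i = 0; i < 4; i++)
      printf("%d * %d = %d (scalar: %d)\n", a[i], b[i], r[i], a[i] * b[i]);

   return 0;
}
```

Since only the low 32 bits of each product are kept, the result matches the scalar multiply for the value ranges used here; the commit comment below hedges the same way about general signed inputs.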
Diffstat (limited to 'src/gallium/drivers')
| -rw-r--r-- | src/gallium/drivers/llvmpipe/lp_rast.h | 16 |
| -rw-r--r-- | src/gallium/drivers/llvmpipe/lp_rast_tri.c | 264 |
2 files changed, 271 insertions, 9 deletions
```diff
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index df0bea04b9..e2bcc45016 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -89,19 +89,21 @@ struct lp_rast_shader_inputs {
 
    const struct lp_rast_state *state;
 };
 
-
+/* Note: the order of these values is important as they are loaded by
+ * sse code in rasterization:
+ */
 struct lp_rast_plane {
-   /* one-pixel sized trivial accept offsets for each plane */
-   int ei;
-
-   /* one-pixel sized trivial reject offsets for each plane */
-   int eo;
-
    /* edge function values at minx,miny ?? */
    int c;
    int dcdx;
    int dcdy;
+
+   /* one-pixel sized trivial reject offsets for each plane */
+   int eo;
+
+   /* one-pixel sized trivial accept offsets for each plane */
+   int ei;
 };
 
 /**
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index f870a187db..7a6cbb8b63 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -32,6 +32,7 @@
 #include <limits.h>
 #include "util/u_math.h"
 #include "lp_debug.h"
+#include "lp_debug_intrin.h"
 #include "lp_perf.h"
 #include "lp_rast_priv.h"
 #include "lp_tile_soa.h"
@@ -254,8 +255,8 @@ sign_bits4(const __m128i *cstep, int cdiff)
 
 #define TAG(x) x##_3
 #define NR_PLANES 3
-#define TRI_4 lp_rast_triangle_3_4
-#define TRI_16 lp_rast_triangle_3_16
+/*#define TRI_4 lp_rast_triangle_3_4*/
+/*#define TRI_16 lp_rast_triangle_3_16*/
 #include "lp_rast_tri_tmp.h"
 
 #define TAG(x) x##_4
@@ -279,3 +280,262 @@ sign_bits4(const __m128i *cstep, int cdiff)
 
 #define NR_PLANES 8
 #include "lp_rast_tri_tmp.h"
+
+static INLINE void
+transpose4_epi32(__m128i a,
+                 __m128i b,
+                 __m128i c,
+                 __m128i d,
+                 __m128i *o,
+                 __m128i *p,
+                 __m128i *q,
+                 __m128i *r)
+{
+  __m128i t0 = _mm_unpacklo_epi32(a, b);
+  __m128i t1 = _mm_unpacklo_epi32(c, d);
+  __m128i t2 = _mm_unpackhi_epi32(a, b);
+  __m128i t3 = _mm_unpackhi_epi32(c, d);
+
+  *o = _mm_unpacklo_epi64(t0, t1);
+  *p = _mm_unpackhi_epi64(t0, t1);
+  *q = _mm_unpacklo_epi64(t2, t3);
+  *r = _mm_unpackhi_epi64(t2, t3);
+}
+
+
+#define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))
+
+#define NR_PLANES 3
+
+
+
+/* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of
+ * _mm_mul_epu32().
+ *
+ * I suspect this works fine for us because one of our operands is
+ * always positive, but not sure that this can be used for general
+ * signed integer multiplication.
+ *
+ * This seems close enough to the speed of SSE4 and the real
+ * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
+ * dependency at this point.
+ */
+static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
+{
+   __m128i a4   = _mm_srli_si128(a, 4);  /* shift by one dword */
+   __m128i b4   = _mm_srli_si128(b, 4);  /* shift by one dword */
+   __m128i ba   = _mm_mul_epu32(b, a);   /* multply dwords 0, 2 */
+   __m128i b4a4 = _mm_mul_epu32(b4, a4); /* multiply dwords 1, 3 */
+
+   /* Interleave the results, either with shuffles or (slightly
+    * faster) direct bit operations:
+    */
+#if 0
+   __m128i ba8             = _mm_shuffle_epi32(ba, 8);
+   __m128i b4a48           = _mm_shuffle_epi32(b4a4, 8);
+   __m128i result          = _mm_unpacklo_epi32(ba8, b4a48);
+#else
+   __m128i mask            = _mm_setr_epi32(~0,0,~0,0);
+   __m128i ba_mask         = _mm_and_si128(ba, mask);
+   __m128i b4a4_mask       = _mm_and_si128(b4a4, mask);
+   __m128i b4a4_mask_shift = _mm_slli_si128(b4a4_mask, 4);
+   __m128i result          = _mm_or_si128(ba_mask, b4a4_mask_shift);
+#endif
+
+   return result;
+}
+
+
+
+
+void
+lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = tri->plane;
+   int x = (arg.triangle.plane_mask & 0xff) + task->x;
+   int y = (arg.triangle.plane_mask >> 8) + task->y;
+   unsigned i, j;
+
+   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
+   unsigned nr = 0;
+
+   __m128i p0 = _mm_loadu_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = _mm_loadu_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i zero = _mm_setzero_si128();
+
+   __m128i c;
+   __m128i dcdx;
+   __m128i dcdy;
+   __m128i rej4;
+
+   __m128i dcdx2;
+   __m128i dcdx3;
+
+   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
+   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
+   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
+   __m128i unused;
+
+   transpose4_epi32(p0, p1, p2, zero,
+                   &c, &dcdx, &dcdy, &rej4);
+
+   /* Adjust dcdx;
+    */
+   dcdx = _mm_sub_epi32(zero, dcdx);
+
+   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
+   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
+   rej4 = _mm_slli_epi32(rej4, 2);
+
+   dcdx2 = _mm_add_epi32(dcdx, dcdx);
+   dcdx3 = _mm_add_epi32(dcdx2, dcdx);
+
+   transpose4_epi32(zero, dcdx, dcdx2, dcdx3,
+                   &span_0, &span_1, &span_2, &unused);
+
+   for (i = 0; i < 4; i++) {
+      __m128i cx = c;
+
+      for (j = 0; j < 4; j++) {
+         __m128i c4rej = _mm_add_epi32(cx, rej4);
+         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);
+
+         /* if (is_zero(rej_masks)) */
+         if (_mm_movemask_epi8(rej_masks) == 0) {
+            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
+            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
+            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);
+
+            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);
+
+            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
+            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
+            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));
+
+            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
+            __m128i c_01 = _mm_packs_epi32(c_0, c_1);
+
+            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
+            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
+            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));
+
+            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);
+
+            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
+            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
+            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));
+
+            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
+            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
+            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);
+
+            unsigned mask = _mm_movemask_epi8(c_0123);
+
+            out[nr].i = i;
+            out[nr].j = j;
+            out[nr].mask = mask;
+            if (mask != 0xffff)
+               nr++;
+         }
+         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
+      }
+
+      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
+   }
+
+   for (i = 0; i < nr; i++)
+      lp_rast_shade_quads_mask(task,
+                               &tri->inputs,
+                               x + 4 * out[i].j,
+                               y + 4 * out[i].i,
+                               0xffff & ~out[i].mask);
+}
+
+
+
+
+
+void
+lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
+                     const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = tri->plane;
+   int x = (arg.triangle.plane_mask & 0xff) + task->x;
+   int y = (arg.triangle.plane_mask >> 8) + task->y;
+
+   __m128i p0 = _mm_loadu_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = _mm_loadu_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i zero = _mm_setzero_si128();
+
+   __m128i c;
+   __m128i dcdx;
+   __m128i dcdy;
+
+   __m128i dcdx2;
+   __m128i dcdx3;
+
+   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
+   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
+   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
+   __m128i unused;
+
+   transpose4_epi32(p0, p1, p2, zero,
+                    &c, &dcdx, &dcdy, &unused);
+
+   /* Adjust dcdx;
+    */
+   dcdx = _mm_sub_epi32(zero, dcdx);
+
+   c = _mm_add_epi32(c, _mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
+   c = _mm_add_epi32(c, _mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
+
+   dcdx2 = _mm_add_epi32(dcdx, dcdx);
+   dcdx3 = _mm_add_epi32(dcdx2, dcdx);
+
+   transpose4_epi32(zero, dcdx, dcdx2, dcdx3,
+                    &span_0, &span_1, &span_2, &unused);
+
+
+   {
+      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
+      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
+      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);
+
+      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);
+
+      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
+      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
+      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));
+
+      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
+      __m128i c_01 = _mm_packs_epi32(c_0, c_1);
+
+      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
+      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
+      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));
+
+      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);
+
+      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
+      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
+      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));
+
+      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
+      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
+      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);
+
+      unsigned mask = _mm_movemask_epi8(c_0123);
+
+      if (mask != 0xffff)
+         lp_rast_shade_quads_mask(task,
+                                  &tri->inputs,
+                                  x,
+                                  y,
+                                  0xffff & ~mask);
+   }
+}
```
