diff options
Diffstat (limited to 'src/gallium/drivers/llvmpipe/lp_setup_tri.c')
-rw-r--r-- | src/gallium/drivers/llvmpipe/lp_setup_tri.c | 755 |
1 files changed, 755 insertions, 0 deletions
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c new file mode 100644 index 0000000000..a09e0fa643 --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -0,0 +1,755 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Recursive rasterization for triangles + */ + +#include "lp_context.h" +#include "lp_quad.h" +#include "lp_quad_pipe.h" +#include "lp_setup.h" +#include "lp_state.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_vertex.h" +#include "pipe/p_shader_tokens.h" +#include "pipe/p_thread.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#define BLOCKSIZE 4 + +struct triangle { + /* one-pixel sized trivial accept offsets for each plane */ + float ei1; + float ei2; + float ei3; + + /* one-pixel sized trivial reject offsets for each plane */ + float eo1; + float eo2; + float eo3; + + /* y deltas for vertex pairs */ + float dy12; + float dy23; + float dy31; + + /* x deltas for vertex pairs */ + float dx12; + float dx23; + float dx31; + + /* Attribute interpolation: + */ + float oneoverarea; + float x1; + float y1; + struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS]; + struct tgsi_interp_coef position_coef; + + /* A run of pre-initialized quads: + */ + struct llvmpipe_context *llvmpipe; + struct quad_header quad[4]; +}; + + +/** + * Compute a0 for a constant-valued coefficient (GL_FLAT shading). + */ +static void constant_coef( struct tgsi_interp_coef *coef, + const float (*v3)[4], + unsigned vert_attr, + unsigned i ) +{ + coef->a0[i] = v3[vert_attr][i]; + coef->dadx[i] = 0; + coef->dady[i] = 0; +} + +/** + * Compute a0, dadx and dady for a linearly interpolated coefficient, + * for a triangle. + */ +static void linear_coef( struct triangle *tri, + struct tgsi_interp_coef *coef, + const float (*v1)[4], + const float (*v2)[4], + const float (*v3)[4], + unsigned vert_attr, + unsigned i) +{ + float a1 = v1[vert_attr][i]; + float a2 = v2[vert_attr][i]; + float a3 = v3[vert_attr][i]; + + float da12 = a1 - a2; + float da31 = a3 - a1; + float dadx = (da12 * tri->dy31 - tri->dy12 * da31) * tri->oneoverarea; + float dady = (da31 * tri->dx12 - tri->dx31 * da12) * tri->oneoverarea; + + coef->dadx[i] = dadx; + coef->dady[i] = dady; + + /* calculate a0 as the value which would be sampled for the + * fragment at (0,0), taking into account that we want to sample at + * pixel centers, in other words (0.5, 0.5). + * + * this is neat but unfortunately not a good way to do things for + * triangles with very large values of dadx or dady as it will + * result in the subtraction and re-addition from a0 of a very + * large number, which means we'll end up loosing a lot of the + * fractional bits and precision from a0. the way to fix this is + * to define a0 as the sample at a pixel center somewhere near vmin + * instead - i'll switch to this later. + */ + coef->a0[i] = (v1[vert_attr][i] - + (dadx * (v1[0][0] - 0.5f) + + dady * (v1[0][1] - 0.5f))); +} + + +/** + * Compute a0, dadx and dady for a perspective-corrected interpolant, + * for a triangle. + * We basically multiply the vertex value by 1/w before computing + * the plane coefficients (a0, dadx, dady). + * Later, when we compute the value at a particular fragment position we'll + * divide the interpolated value by the interpolated W at that fragment. + */ +static void perspective_coef( struct triangle *tri, + struct tgsi_interp_coef *coef, + const float (*v1)[4], + const float (*v2)[4], + const float (*v3)[4], + unsigned vert_attr, + unsigned i) +{ + /* premultiply by 1/w (v[0][3] is always 1/w): + */ + float a1 = v1[vert_attr][i] * v1[0][3]; + float a2 = v2[vert_attr][i] * v2[0][3]; + float a3 = v3[vert_attr][i] * v3[0][3]; + float da12 = a1 - a2; + float da31 = a3 - a1; + float dadx = (da12 * tri->dy31 - tri->dy12 * da31) * tri->oneoverarea; + float dady = (da31 * tri->dx12 - tri->dx31 * da12) * tri->oneoverarea; + + + coef->dadx[i] = dadx; + coef->dady[i] = dady; + coef->a0[i] = (a1 - + (dadx * (v1[0][0] - 0.5f) + + dady * (v1[0][1] - 0.5f))); +} + + +/** + * Special coefficient setup for gl_FragCoord. + * X and Y are trivial, though Y has to be inverted for OpenGL. + * Z and W are copied from position_coef which should have already been computed. + * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask. + */ +static void +setup_fragcoord_coef(struct triangle *tri, unsigned slot) +{ + /*X*/ + tri->coef[slot].a0[0] = 0.0; + tri->coef[slot].dadx[0] = 1.0; + tri->coef[slot].dady[0] = 0.0; + /*Y*/ + tri->coef[slot].a0[1] = 0.0; + tri->coef[slot].dadx[1] = 0.0; + tri->coef[slot].dady[1] = 1.0; + /*Z*/ + tri->coef[slot].a0[2] = tri->position_coef.a0[2]; + tri->coef[slot].dadx[2] = tri->position_coef.dadx[2]; + tri->coef[slot].dady[2] = tri->position_coef.dady[2]; + /*W*/ + tri->coef[slot].a0[3] = tri->position_coef.a0[3]; + tri->coef[slot].dadx[3] = tri->position_coef.dadx[3]; + tri->coef[slot].dady[3] = tri->position_coef.dady[3]; +} + + + +/** + * Compute the tri->coef[] array dadx, dady, a0 values. + */ +static void setup_tri_coefficients( struct llvmpipe_context *llvmpipe, + struct triangle *tri, + const float (*v1)[4], + const float (*v2)[4], + const float (*v3)[4], + boolean frontface ) +{ + const struct lp_fragment_shader *fs = llvmpipe->fs; + const struct vertex_info *vinfo = llvmpipe_get_vertex_info(llvmpipe); + unsigned input; + + /* z and w are done by linear interpolation: + */ + linear_coef(tri, &tri->position_coef, v1, v2, v3, 0, 2); + linear_coef(tri, &tri->position_coef, v1, v2, v3, 0, 3); + + /* setup interpolation for all the remaining attributes: + */ + for (input = 0; input < fs->info.num_inputs; input++) { + unsigned vert_attr = vinfo->attrib[input].src_index; + unsigned i; + + switch (vinfo->attrib[input].interp_mode) { + case INTERP_CONSTANT: + for (i = 0; i < NUM_CHANNELS; i++) + constant_coef(&tri->coef[input], v3, vert_attr, i); + break; + + case INTERP_LINEAR: + for (i = 0; i < NUM_CHANNELS; i++) + linear_coef(tri, &tri->coef[input], v1, v2, v3, vert_attr, i); + break; + + case INTERP_PERSPECTIVE: + for (i = 0; i < NUM_CHANNELS; i++) + perspective_coef(tri, &tri->coef[input], v1, v2, v3, vert_attr, i); + break; + + case INTERP_POS: + setup_fragcoord_coef(tri, input); + break; + + default: + assert(0); + } + + if (fs->info.input_semantic_name[input] == TGSI_SEMANTIC_FACE) { + tri->coef[input].a0[0] = 1.0f - frontface; + tri->coef[input].dadx[0] = 0.0; + tri->coef[input].dady[0] = 0.0; + } + } +} + + + +/* XXX: do this by add/subtracting a large floating point number: + */ +static inline float subpixel_snap( float a ) +{ + int i = a * 16; + return (float)i * (1.0/16); +} + + +/* Convert 8x8 block into four runs of quads and render each in turn. + */ +#if (BLOCKSIZE == 8) +static void block_full( struct triangle *tri, int x, int y ) +{ + struct quad_header *ptrs[4]; + int i; + + tri->quad[0].input.x0 = x + 0; + tri->quad[1].input.x0 = x + 2; + tri->quad[2].input.x0 = x + 4; + tri->quad[3].input.x0 = x + 6; + + for (i = 0; i < 4; i++, y += 2) { + tri->quad[0].inout.mask = 0xf; + tri->quad[1].inout.mask = 0xf; + tri->quad[2].inout.mask = 0xf; + tri->quad[3].inout.mask = 0xf; + + tri->quad[0].input.y0 = y; + tri->quad[1].input.y0 = y; + tri->quad[2].input.y0 = y; + tri->quad[3].input.y0 = y; + + /* XXX: don't bother with this ptrs business */ + ptrs[0] = &tri->quad[0]; + ptrs[1] = &tri->quad[1]; + ptrs[2] = &tri->quad[2]; + ptrs[3] = &tri->quad[3]; + + tri->llvmpipe->quad.first->run( tri->llvmpipe->quad.first, ptrs, 4 ); + } +} +#elif (BLOCKSIZE == 4) +static void block_full( struct triangle *tri, int x, int y ) +{ + struct quad_header *ptrs[4]; + int iy; + + tri->quad[0].input.x0 = x + 0; + tri->quad[1].input.x0 = x + 2; + + for (iy = 0; iy < 4; iy += 2) { + tri->quad[0].inout.mask = 0xf; + tri->quad[1].inout.mask = 0xf; + + tri->quad[0].input.y0 = y + iy; + tri->quad[1].input.y0 = y + iy; + + /* XXX: don't bother with this ptrs business */ + ptrs[0] = &tri->quad[0]; + ptrs[1] = &tri->quad[1]; + + tri->llvmpipe->quad.first->run( tri->llvmpipe->quad.first, ptrs, 2 ); + } +} +#else +static void block_full( struct triangle *tri, int x, int y ) +{ + struct quad_header *ptrs[4]; + int iy; + + tri->quad[0].input.x0 = x; + tri->quad[0].input.y0 = y; + tri->quad[0].inout.mask = 0xf; + + ptrs[0] = &tri->quad[0]; + tri->llvmpipe->quad.first->run( tri->llvmpipe->quad.first, ptrs, 1 ); +} +#endif + + +static void +do_quad( struct triangle *tri, + int x, int y, + float c1, float c2, float c3 ) +{ + struct quad_header *quad = &tri->quad[0]; + + float xstep1 = -tri->dy12; + float xstep2 = -tri->dy23; + float xstep3 = -tri->dy31; + + float ystep1 = tri->dx12; + float ystep2 = tri->dx23; + float ystep3 = tri->dx31; + + quad->input.x0 = x; + quad->input.y0 = y; + quad->inout.mask = 0; + + if (c1 > 0 && + c2 > 0 && + c3 > 0) + quad->inout.mask |= 1; + + if (c1 + xstep1 > 0 && + c2 + xstep2 > 0 && + c3 + xstep3 > 0) + quad->inout.mask |= 2; + + if (c1 + ystep1 > 0 && + c2 + ystep2 > 0 && + c3 + ystep3 > 0) + quad->inout.mask |= 4; + + if (c1 + ystep1 + xstep1 > 0 && + c2 + ystep2 + xstep2 > 0 && + c3 + ystep3 + xstep3 > 0) + quad->inout.mask |= 8; + + if (quad->inout.mask) + tri->llvmpipe->quad.first->run( tri->llvmpipe->quad.first, &quad, 1 ); +} + +/* Evaluate each pixel in a block, generate a mask and possibly render + * the quad: + */ +static void +do_block( struct triangle *tri, + int x, int y, + float c1, + float c2, + float c3 ) +{ + const int step = 2; + + float xstep1 = -step * tri->dy12; + float xstep2 = -step * tri->dy23; + float xstep3 = -step * tri->dy31; + + float ystep1 = step * tri->dx12; + float ystep2 = step * tri->dx23; + float ystep3 = step * tri->dx31; + + int ix, iy; + + for (iy = 0; iy < BLOCKSIZE; iy += 2) { + float cx1 = c1; + float cx2 = c2; + float cx3 = c3; + + for (ix = 0; ix < BLOCKSIZE; ix += 2) { + + do_quad(tri, x+ix, y+iy, cx1, cx2, cx3); + + cx1 += xstep1; + cx2 += xstep2; + cx3 += xstep3; + } + + c1 += ystep1; + c2 += ystep2; + c3 += ystep3; + } +} + + + + +/* to avoid having to allocate power-of-four, square render targets, + * end up having a specialized version of the above that runs only at + * the topmost level. + * + * at the topmost level there may be an arbitary number of steps on + * either dimension, so this loop needs to be either separately + * code-generated and unrolled for each render target size, or kept as + * generic looping code: + */ + +#define MIN3(a,b,c) MIN2(MIN2(a,b),c) +#define MAX3(a,b,c) MAX2(MAX2(a,b),c) + +static void +do_triangle_ccw(struct llvmpipe_context *llvmpipe, + const float (*v1)[4], + const float (*v2)[4], + const float (*v3)[4], + boolean frontfacing ) +{ + const int rt_width = llvmpipe->framebuffer.cbufs[0]->width; + const int rt_height = llvmpipe->framebuffer.cbufs[0]->height; + + const float y1 = subpixel_snap(v1[0][1]); + const float y2 = subpixel_snap(v2[0][1]); + const float y3 = subpixel_snap(v3[0][1]); + + const float x1 = subpixel_snap(v1[0][0]); + const float x2 = subpixel_snap(v2[0][0]); + const float x3 = subpixel_snap(v3[0][0]); + + struct triangle tri; + float area; + float c1, c2, c3; + int i; + int minx, maxx, miny, maxy; + + tri.llvmpipe = llvmpipe; + + + tri.dx12 = x1 - x2; + tri.dx23 = x2 - x3; + tri.dx31 = x3 - x1; + + tri.dy12 = y1 - y2; + tri.dy23 = y2 - y3; + tri.dy31 = y3 - y1; + + area = (tri.dx12 * tri.dy31 - + tri.dx31 * tri.dy12); + + /* Cull non-ccw and zero-sized triangles. + */ + if (area <= 0 || util_is_inf_or_nan(area)) + return; + + // Bounding rectangle + minx = util_iround(MIN3(x1, x2, x3) - .5); + maxx = util_iround(MAX3(x1, x2, x3) + .5); + miny = util_iround(MIN3(y1, y2, y3) - .5); + maxy = util_iround(MAX3(y1, y2, y3) + .5); + + /* Clamp to framebuffer (or tile) dimensions: + */ + miny = MAX2(0, miny); + minx = MAX2(0, minx); + maxy = MIN2(rt_height, maxy); + maxx = MIN2(rt_width, maxx); + + if (miny == maxy || minx == maxx) + return; + + /* The only divide in this code. Is it really needed? + */ + tri.oneoverarea = 1.0f / area; + + /* Setup parameter interpolants: + */ + setup_tri_coefficients( llvmpipe, &tri, v1, v2, v3, frontfacing ); + + for (i = 0; i < Elements(tri.quad); i++) { + tri.quad[i].coef = tri.coef; + tri.quad[i].posCoef = &tri.position_coef; + } + + /* half-edge constants, will be interated over the whole + * rendertarget. + */ + c1 = tri.dy12 * x1 - tri.dx12 * y1; + c2 = tri.dy23 * x2 - tri.dx23 * y2; + c3 = tri.dy31 * x3 - tri.dx31 * y3; + + /* correct for top-left fill convention: + */ + if (tri.dy12 < 0 || (tri.dy12 == 0 && tri.dx12 > 0)) c1++; + if (tri.dy23 < 0 || (tri.dy23 == 0 && tri.dx23 > 0)) c2++; + if (tri.dy31 < 0 || (tri.dy31 == 0 && tri.dx31 > 0)) c3++; + + /* find trivial reject offsets for each edge for a single-pixel + * sized block. These will be scaled up at each recursive level to + * match the active blocksize. Scaling in this way works best if + * the blocks are square. + */ + tri.eo1 = 0; + if (tri.dy12 < 0) tri.eo1 -= tri.dy12; + if (tri.dx12 > 0) tri.eo1 += tri.dx12; + + tri.eo2 = 0; + if (tri.dy23 < 0) tri.eo2 -= tri.dy23; + if (tri.dx23 > 0) tri.eo2 += tri.dx23; + + tri.eo3 = 0; + if (tri.dy31 < 0) tri.eo3 -= tri.dy31; + if (tri.dx31 > 0) tri.eo3 += tri.dx31; + + /* Calculate trivial accept offsets from the above. + */ + tri.ei1 = tri.dx12 - tri.dy12 - tri.eo1; + tri.ei2 = tri.dx23 - tri.dy23 - tri.eo2; + tri.ei3 = tri.dx31 - tri.dy31 - tri.eo3; + + minx &= ~(BLOCKSIZE-1); /* aligned blocks */ + miny &= ~(BLOCKSIZE-1); /* aligned blocks */ + + c1 += tri.dx12 * miny - tri.dy12 * minx; + c2 += tri.dx23 * miny - tri.dy23 * minx; + c3 += tri.dx31 * miny - tri.dy31 * minx; + + if ((miny & ~15) == (maxy & ~15) && + (minx & ~15) == (maxx & ~15)) + { + const int step = 2; + + float xstep1 = -step * tri.dy12; + float xstep2 = -step * tri.dy23; + float xstep3 = -step * tri.dy31; + + float ystep1 = step * tri.dx12; + float ystep2 = step * tri.dx23; + float ystep3 = step * tri.dx31; + + float eo1 = tri.eo1 * step; + float eo2 = tri.eo2 * step; + float eo3 = tri.eo3 * step; + + int x, y; + + /* Subdivide space into NxM blocks, where each block is square and + * power-of-four in dimension. + * + * Trivially accept or reject blocks, else jump to per-pixel + * examination above. + */ + for (y = miny; y < maxy; y += step) + { + float cx1 = c1; + float cx2 = c2; + float cx3 = c3; + + for (x = minx; x < maxx; x += step) + { + if (cx1 + eo1 < 0 || + cx2 + eo2 < 0 || + cx3 + eo3 < 0) + { + } + else + { + do_quad(&tri, x, y, cx1, cx2, cx3); + } + + /* Iterate cx values across the region: + */ + cx1 += xstep1; + cx2 += xstep2; + cx3 += xstep3; + } + + /* Iterate c values down the region: + */ + c1 += ystep1; + c2 += ystep2; + c3 += ystep3; + } + } + else + { + const int step = BLOCKSIZE; + + float ei1 = tri.ei1 * step; + float ei2 = tri.ei2 * step; + float ei3 = tri.ei3 * step; + + float eo1 = tri.eo1 * step; + float eo2 = tri.eo2 * step; + float eo3 = tri.eo3 * step; + + float xstep1 = -step * tri.dy12; + float xstep2 = -step * tri.dy23; + float xstep3 = -step * tri.dy31; + + float ystep1 = step * tri.dx12; + float ystep2 = step * tri.dx23; + float ystep3 = step * tri.dx31; + int x, y; + + + /* Subdivide space into NxM blocks, where each block is square and + * power-of-four in dimension. + * + * Trivially accept or reject blocks, else jump to per-pixel + * examination above. + */ + for (y = miny; y < maxy; y += step) + { + float cx1 = c1; + float cx2 = c2; + float cx3 = c3; + boolean in = false; + + for (x = minx; x < maxx; x += step) + { + if (cx1 + eo1 < 0 || + cx2 + eo2 < 0 || + cx3 + eo3 < 0) + { + /* do nothing */ + if (in) + break; + } + else if (cx1 + ei1 > 0 && + cx2 + ei2 > 0 && + cx3 + ei3 > 0) + { + in = TRUE; + block_full(&tri, x, y); /* trivial accept */ + } + else + { + in = TRUE; + // block_full(&tri, x, y); /* trivial accept */ + do_block(&tri, x, y, cx1, cx2, cx3); + } + + /* Iterate cx values across the region: + */ + cx1 += xstep1; + cx2 += xstep2; + cx3 += xstep3; + } + + /* Iterate c values down the region: + */ + c1 += ystep1; + c2 += ystep2; + c3 += ystep3; + } + } +} + +static void triangle_cw( struct llvmpipe_context *llvmpipe, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4] ) +{ + do_triangle_ccw( llvmpipe, v1, v0, v2, !llvmpipe->ccw_is_frontface ); +} + +static void triangle_ccw( struct llvmpipe_context *llvmpipe, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4] ) +{ + do_triangle_ccw( llvmpipe, v0, v1, v2, llvmpipe->ccw_is_frontface ); +} + +static void triangle_both( struct llvmpipe_context *llvmpipe, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4] ) +{ + /* edge vectors e = v0 - v2, f = v1 - v2 */ + const float ex = v0[0][0] - v2[0][0]; + const float ey = v0[0][1] - v2[0][1]; + const float fx = v1[0][0] - v2[0][0]; + const float fy = v1[0][1] - v2[0][1]; + + /* det = cross(e,f).z */ + if (ex * fy - ey * fx < 0) + triangle_ccw( llvmpipe, v0, v1, v2 ); + else + triangle_cw( llvmpipe, v0, v1, v2 ); +} + +static void triangle_nop( struct llvmpipe_context *llvmpipe, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4] ) +{ +} + +/** + * Do setup for triangle rasterization, then render the triangle. + */ +void setup_prepare_tri( struct llvmpipe_context *llvmpipe ) +{ + llvmpipe->ccw_is_frontface = (llvmpipe->rasterizer->front_winding == + PIPE_WINDING_CW); + + switch (llvmpipe->rasterizer->cull_mode) { + case PIPE_WINDING_NONE: + llvmpipe->triangle = triangle_both; + break; + case PIPE_WINDING_CCW: + llvmpipe->triangle = triangle_cw; + break; + case PIPE_WINDING_CW: + llvmpipe->triangle = triangle_ccw; + break; + default: + llvmpipe->triangle = triangle_nop; + break; + } +} + + |