/**************************************************************************
 * 
 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 * 
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 * 
 **************************************************************************/

/**
 * \author Brian Paul
 */


#include <transpose_matrix4x4.h>
#include "pipe/p_format.h"
#include "spu_main.h"
#include "spu_colorpack.h"
#include "spu_per_fragment_op.h"


#define LINEAR_QUAD_LAYOUT 1


static INLINE vector float
spu_min(vector float a, vector float b)
{
   vector unsigned int m;
   m = spu_cmpgt(a, b);    /* m = a > b ? ~0 : 0 */
   return spu_sel(a, b, m);
}


static INLINE vector float
spu_max(vector float a, vector float b)
{
   vector unsigned int m;
   m = spu_cmpgt(a, b);    /* m = a > b ? ~0 : 0 */
   return spu_sel(b, a, m);
}


/**
 * Called by rasterizer for each quad after the shader has run.  Do
 * all the per-fragment operations including alpha test, z test,
 * stencil test, blend, colormask and logicops.  This is a
 * fallback/debug function.  In reality we'll use a generated function
 * produced by the PPU.  But this function is useful for
 * debug/validation.
 */
void
spu_fallback_fragment_ops(uint x, uint y,
                          tile_t *colorTile,
                          tile_t *depthStencilTile,
                          vector float fragZ,
                          vector float fragR,
                          vector float fragG,
                          vector float fragB,
                          vector float fragA,
                          vector unsigned int mask)
{
   vector float frag_aos[4];
   unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
   unsigned int fragc0, fragc1, fragc2, fragc3;  /* fragment colors */

   /*
    * Do alpha test
    */
   if (spu.depth_stencil_alpha.alpha.enabled) {
      vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref_value);
      vector unsigned int amask;

      switch (spu.depth_stencil_alpha.alpha.func) {
      case PIPE_FUNC_LESS:
         amask = spu_cmpgt(ref, fragA);  /* mask = (fragA < ref) */
         break;
      case PIPE_FUNC_GREATER:
         amask = spu_cmpgt(fragA, ref);  /* mask = (fragA > ref) */
         break;
      case PIPE_FUNC_GEQUAL:
         amask = spu_cmpgt(ref, fragA);
         amask = spu_nor(amask, amask);
         break;
      case PIPE_FUNC_LEQUAL:
         amask = spu_cmpgt(fragA, ref);
         amask = spu_nor(amask, amask);
         break;
      case PIPE_FUNC_EQUAL:
         amask = spu_cmpeq(ref, fragA);
         break;
      case PIPE_FUNC_NOTEQUAL:
         amask = spu_cmpeq(ref, fragA);
         amask = spu_nor(amask, amask);
         break;
      case PIPE_FUNC_ALWAYS:
         amask = spu_splats(0xffffffffU);
         break;
      case PIPE_FUNC_NEVER:
         amask = spu_splats( 0x0U);
         break;
      default:
         ;
      }

      mask = spu_and(mask, amask);
   }


   /*
    * Z and/or stencil testing...
    */
   if (spu.depth_stencil_alpha.depth.enabled ||
       spu.depth_stencil_alpha.stencil[0].enabled) {

      /* get four Z/Stencil values from tile */
      vector unsigned int mask24 = spu_splats((unsigned int)0x00ffffffU);
      vector unsigned int ifbZS = depthStencilTile->ui4[y/2][x/2];
      vector unsigned int ifbZ = spu_and(ifbZS, mask24);
      vector unsigned int ifbS = spu_andc(ifbZS, mask24);

      if (spu.depth_stencil_alpha.stencil[0].enabled) {
         /* do stencil test */
         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM);

      }
      else if (spu.depth_stencil_alpha.depth.enabled) {
         /* do depth test */

         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM ||
                spu.fb.depth_format == PIPE_FORMAT_X8Z24_UNORM);

         vector unsigned int ifragZ;
         vector unsigned int zmask;

         /* convert four fragZ from float to uint */
         fragZ = spu_mul(fragZ, spu_splats((float) 0xffffff));
         ifragZ = spu_convtu(fragZ, 0);

         /* do depth comparison, setting zmask with results */
         switch (spu.depth_stencil_alpha.depth.func) {
         case PIPE_FUNC_LESS:
            zmask = spu_cmpgt(ifbZ, ifragZ);  /* mask = (ifragZ < ifbZ) */
            break;
         case PIPE_FUNC_GREATER:
            zmask = spu_cmpgt(ifragZ, ifbZ);  /* mask = (ifbZ > ifragZ) */
            break;
         case PIPE_FUNC_GEQUAL:
            zmask = spu_cmpgt(ifbZ, ifragZ);
            zmask = spu_nor(zmask, zmask);
            break;
         case PIPE_FUNC_LEQUAL:
            zmask = spu_cmpgt(ifragZ, ifbZ);
            zmask = spu_nor(zmask, zmask);
            break;
         case PIPE_FUNC_EQUAL:
            zmask = spu_cmpeq(ifbZ, ifragZ);
            break;
         case PIPE_FUNC_NOTEQUAL:
            zmask = spu_cmpeq(ifbZ, ifragZ);
            zmask = spu_nor(zmask, zmask);
            break;
         case PIPE_FUNC_ALWAYS:
            zmask = spu_splats(0xffffffffU);
            break;
         case PIPE_FUNC_NEVER:
            zmask = spu_splats( 0x0U);
            break;
         default:
            ;
         }

         mask = spu_and(mask, zmask);

         /* merge framebuffer Z and fragment Z according to the mask */
         ifbZ = spu_or(spu_and(ifragZ, mask),
                       spu_andc(ifbZ, mask));
      }

      if (spu_extract(spu_orx(mask), 0)) {
         /* put new fragment Z/Stencil values back into Z/Stencil tile */
         depthStencilTile->ui4[y/2][x/2] = spu_or(ifbZ, ifbS);

         spu.cur_ztile_status = TILE_STATUS_DIRTY;
      }
   }


   /*
    * If we'll need the current framebuffer/tile colors for blending
    * or logicop or colormask, fetch them now.
    */
   if (spu.blend.blend_enable ||
       spu.blend.logicop_enable ||
       spu.blend.colormask != 0xf) {

#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
      fbc0 = colorTile->ui[y][x*2+0];
      fbc1 = colorTile->ui[y][x*2+1];
      fbc2 = colorTile->ui[y][x*2+2];
      fbc3 = colorTile->ui[y][x*2+3];
#else
      fbc0 = colorTile->ui[y+0][x+0];
      fbc1 = colorTile->ui[y+0][x+1];
      fbc2 = colorTile->ui[y+1][x+0];
      fbc3 = colorTile->ui[y+1][x+1];
#endif
   }


   /*
    * Do blending
    */
   if (spu.blend.blend_enable) {
      /* blending terms, misc regs */
      vector float term1r, term1g, term1b, term1a;
      vector float term2r, term2g, term2b, term2a;
      vector float one, tmp;

      vector float fbRGBA[4];  /* current framebuffer colors */

      /* convert framebuffer colors from packed int to vector float */
      {
         vector float temp[4]; /* float colors in AOS form */
         switch (spu.fb.color_format) {
         case PIPE_FORMAT_B8G8R8A8_UNORM:
            temp[0] = spu_unpack_B8G8R8A8(fbc0);
            temp[1] = spu_unpack_B8G8R8A8(fbc1);
            temp[2] = spu_unpack_B8G8R8A8(fbc2);
            temp[3] = spu_unpack_B8G8R8A8(fbc3);
            break;
         case PIPE_FORMAT_A8R8G8B8_UNORM:
            temp[0] = spu_unpack_A8R8G8B8(fbc0);
            temp[1] = spu_unpack_A8R8G8B8(fbc1);
            temp[2] = spu_unpack_A8R8G8B8(fbc2);
            temp[3] = spu_unpack_A8R8G8B8(fbc3);
            break;
         default:
            ASSERT(0);
         }
         _transpose_matrix4x4(fbRGBA, temp); /* fbRGBA = transpose(temp) */
      }

      /*
       * Compute Src RGB terms (fragment color * factor)
       */
      switch (spu.blend.rgb_src_factor) {
      case PIPE_BLENDFACTOR_ONE:
         term1r = fragR;
         term1g = fragG;
         term1b = fragB;
         break;
      case PIPE_BLENDFACTOR_ZERO:
         term1r =
         term1g =
         term1b = spu_splats(0.0f);
         break;
      case PIPE_BLENDFACTOR_SRC_COLOR:
         term1r = spu_mul(fragR, fragR);
         term1g = spu_mul(fragG, fragG);
         term1b = spu_mul(fragB, fragB);
         break;
      case PIPE_BLENDFACTOR_SRC_ALPHA:
         term1r = spu_mul(fragR, fragA);
         term1g = spu_mul(fragG, fragA);
         term1b = spu_mul(fragB, fragA);
         break;
      case PIPE_BLENDFACTOR_DST_COLOR:
         term1r = spu_mul(fragR, fbRGBA[0]);
         term1g = spu_mul(fragG, fbRGBA[1]);
         term1b = spu_mul(fragB, fbRGBA[1]);
         break;
      case PIPE_BLENDFACTOR_DST_ALPHA:
         term1r = spu_mul(fragR, fbRGBA[3]);
         term1g = spu_mul(fragG, fbRGBA[3]);
         term1b = spu_mul(fragB, fbRGBA[3]);
         break;
      case PIPE_BLENDFACTOR_CONST_COLOR:
         term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[0]));
         term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[1]));
         term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[2]));
         break;
      case PIPE_BLENDFACTOR_CONST_ALPHA:
         term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[3]));
         term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[3]));
         term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[3]));
         break;
      /* XXX more cases */
      default:
         ASSERT(0);
      }

      /*
       * Compute Src Alpha term (fragment alpha * factor)
       */
      switch (spu.blend.alpha_src_factor) {
      case PIPE_BLENDFACTOR_ONE:
         term1a = fragA;
         break;
      case PIPE_BLENDFACTOR_SRC_COLOR:
         term1a = spu_splats(0.0f);
         break;
      case PIPE_BLENDFACTOR_SRC_ALPHA:
         term1a = spu_mul(fragA, fragA);
         break;
      case PIPE_BLENDFACTOR_DST_COLOR:
         /* fall-through */
      case PIPE_BLENDFACTOR_DST_ALPHA:
         term1a = spu_mul(fragA, fbRGBA[3]);
         break;
      case PIPE_BLENDFACTOR_CONST_COLOR:
         /* fall-through */
      case PIPE_BLENDFACTOR_CONST_ALPHA:
         term1a = spu_mul(fragR, spu_splats(spu.blend_color.color[3]));
         break;
      /* XXX more cases */
      default:
         ASSERT(0);
      }

      /*
       * Compute Dest RGB terms (framebuffer color * factor)
       */
      switch (spu.blend.rgb_dst_factor) {
      case PIPE_BLENDFACTOR_ONE:
         term2r = fbRGBA[0];
         term2g = fbRGBA[1];
         term2b = fbRGBA[2];
         break;
      case PIPE_BLENDFACTOR_ZERO:
         term2r =
         term2g =
         term2b = spu_splats(0.0f);
         break;
      case PIPE_BLENDFACTOR_SRC_COLOR:
         term2r = spu_mul(fbRGBA[0], fragR);
         term2g = spu_mul(fbRGBA[1], fragG);
         term2b = spu_mul(fbRGBA[2], fragB);
         break;
      case PIPE_BLENDFACTOR_SRC_ALPHA:
         term2r = spu_mul(fbRGBA[0], fragA);
         term2g = spu_mul(fbRGBA[1], fragA);
         term2b = spu_mul(fbRGBA[2], fragA);
         break;
      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
         one = spu_splats(1.0f);
         tmp = spu_sub(one, fragA);
         term2r = spu_mul(fbRGBA[0], tmp);
         term2g = spu_mul(fbRGBA[1], tmp);
         term2b = spu_mul(fbRGBA[2], tmp);
         break;
      case PIPE_BLENDFACTOR_DST_COLOR:
         term2r = spu_mul(fbRGBA[0], fbRGBA[0]);
         term2g = spu_mul(fbRGBA[1], fbRGBA[1]);
         term2b = spu_mul(fbRGBA[2], fbRGBA[2]);
         break;
      case PIPE_BLENDFACTOR_DST_ALPHA:
         term2r = spu_mul(fbRGBA[0], fbRGBA[3]);
         term2g = spu_mul(fbRGBA[1], fbRGBA[3]);
         term2b = spu_mul(fbRGBA[2], fbRGBA[3]);
         break;
      case PIPE_BLENDFACTOR_CONST_COLOR:
         term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[0]));
         term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[1]));
         term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[2]));
         break;
      case PIPE_BLENDFACTOR_CONST_ALPHA:
         term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[3]));
         term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[3]));
         term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[3]));
         break;
       /* XXX more cases */
      default:
         ASSERT(0);
      }

      /*
       * Compute Dest Alpha term (framebuffer alpha * factor)
       */
      switch (spu.blend.alpha_dst_factor) {
      case PIPE_BLENDFACTOR_ONE:
         term2a = fbRGBA[3];
         break;
      case PIPE_BLENDFACTOR_SRC_COLOR:
         term2a = spu_splats(0.0f);
         break;
      case PIPE_BLENDFACTOR_SRC_ALPHA:
         term2a = spu_mul(fbRGBA[3], fragA);
         break;
      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
         one = spu_splats(1.0f);
         tmp = spu_sub(one, fragA);
         term2a = spu_mul(fbRGBA[3], tmp);
         break;
      case PIPE_BLENDFACTOR_DST_COLOR:
         /* fall-through */
      case PIPE_BLENDFACTOR_DST_ALPHA:
         term2a = spu_mul(fbRGBA[3], fbRGBA[3]);
         break;
      case PIPE_BLENDFACTOR_CONST_COLOR:
         /* fall-through */
      case PIPE_BLENDFACTOR_CONST_ALPHA:
         term2a = spu_mul(fbRGBA[3], spu_splats(spu.blend_color.color[3]));
         break;
      /* XXX more cases */
      default:
         ASSERT(0);
      }

      /*
       * Combine Src/Dest RGB terms
       */
      switch (spu.blend.rgb_func) {
      case PIPE_BLEND_ADD:
         fragR = spu_add(term1r, term2r);
         fragG = spu_add(term1g, term2g);
         fragB = spu_add(term1b, term2b);
         break;
      case PIPE_BLEND_SUBTRACT:
         fragR = spu_sub(term1r, term2r);
         fragG = spu_sub(term1g, term2g);
         fragB = spu_sub(term1b, term2b);
         break;
      case PIPE_BLEND_REVERSE_SUBTRACT:
         fragR = spu_sub(term2r, term1r);
         fragG = spu_sub(term2g, term1g);
         fragB = spu_sub(term2b, term1b);
         break;
      case PIPE_BLEND_MIN:
         fragR = spu_min(term1r, term2r);
         fragG = spu_min(term1g, term2g);
         fragB = spu_min(term1b, term2b);
         break;
      case PIPE_BLEND_MAX:
         fragR = spu_max(term1r, term2r);
         fragG = spu_max(term1g, term2g);
         fragB = spu_max(term1b, term2b);
         break;
      default:
         ASSERT(0);
      }

      /*
       * Combine Src/Dest A term
       */
      switch (spu.blend.alpha_func) {
      case PIPE_BLEND_ADD:
         fragA = spu_add(term1a, term2a);
         break;
      case PIPE_BLEND_SUBTRACT:
         fragA = spu_sub(term1a, term2a);
         break;
      case PIPE_BLEND_REVERSE_SUBTRACT:
         fragA = spu_sub(term2a, term1a);
         break;
      case PIPE_BLEND_MIN:
         fragA = spu_min(term1a, term2a);
         break;
      case PIPE_BLEND_MAX:
         fragA = spu_max(term1a, term2a);
         break;
      default:
         ASSERT(0);
      }
   }


   /*
    * Convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA.
    */
#if 0
   /* original code */
   {
      vector float frag_soa[4];
      frag_soa[0] = fragR;
      frag_soa[1] = fragG;
      frag_soa[2] = fragB;
      frag_soa[3] = fragA;
      _transpose_matrix4x4(frag_aos, frag_soa);
   }
#else
   /* short-cut relying on function parameter layout: */
   _transpose_matrix4x4(frag_aos, &fragR);
   (void) fragG;
   (void) fragB;
#endif

   /*
    * Pack fragment float colors into 32-bit RGBA words.
    */
   switch (spu.fb.color_format) {
   case PIPE_FORMAT_A8R8G8B8_UNORM:
      fragc0 = spu_pack_A8R8G8B8(frag_aos[0]);
      fragc1 = spu_pack_A8R8G8B8(frag_aos[1]);
      fragc2 = spu_pack_A8R8G8B8(frag_aos[2]);
      fragc3 = spu_pack_A8R8G8B8(frag_aos[3]);
      break;
   case PIPE_FORMAT_B8G8R8A8_UNORM:
      fragc0 = spu_pack_B8G8R8A8(frag_aos[0]);
      fragc1 = spu_pack_B8G8R8A8(frag_aos[1]);
      fragc2 = spu_pack_B8G8R8A8(frag_aos[2]);
      fragc3 = spu_pack_B8G8R8A8(frag_aos[3]);
      break;
   default:
      fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n");
      ASSERT(0);
   }


   /*
    * Do color masking
    */
   if (spu.blend.colormask != 0xf) {
      uint cmask = 0x0; /* each byte corresponds to a color channel */

      /* Form bitmask depending on color buffer format and colormask bits */
      switch (spu.fb.color_format) {
      case PIPE_FORMAT_A8R8G8B8_UNORM:
         if (spu.blend.colormask & PIPE_MASK_R)
            cmask |= 0x00ff0000; /* red */
         if (spu.blend.colormask & PIPE_MASK_G)
            cmask |= 0x0000ff00; /* green */
         if (spu.blend.colormask & PIPE_MASK_B)
            cmask |= 0x000000ff; /* blue */
         if (spu.blend.colormask & PIPE_MASK_A)
            cmask |= 0xff000000; /* alpha */
         break;
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (spu.blend.colormask & PIPE_MASK_R)
            cmask |= 0x0000ff00; /* red */
         if (spu.blend.colormask & PIPE_MASK_G)
            cmask |= 0x00ff0000; /* green */
         if (spu.blend.colormask & PIPE_MASK_B)
            cmask |= 0xff000000; /* blue */
         if (spu.blend.colormask & PIPE_MASK_A)
            cmask |= 0x000000ff; /* alpha */
         break;
      default:
         ASSERT(0);
      }

      /*
       * Apply color mask to the 32-bit packed colors.
       * if (cmask[i])
       *    frag color[i] = frag color[i];
       * else
       *    frag color[i] = framebuffer color[i];
       */
      fragc0 = (fragc0 & cmask) | (fbc0 & ~cmask);
      fragc1 = (fragc1 & cmask) | (fbc1 & ~cmask);
      fragc2 = (fragc2 & cmask) | (fbc2 & ~cmask);
      fragc3 = (fragc3 & cmask) | (fbc3 & ~cmask);
   }


   /*
    * Do logic ops
    */
   if (spu.blend.logicop_enable) {
      /* XXX to do */
      /* apply logicop to 32-bit packed colors (fragcx and fbcx) */
   }


   /*
    * If mask is non-zero, mark tile as dirty.
    */
   if (spu_extract(spu_orx(mask), 0)) {
      spu.cur_ctile_status = TILE_STATUS_DIRTY;
   }
   else {
      /* write no fragments */
      return;
   }


   /*
    * Write new fragment/quad colors to the framebuffer/tile.
    * Only write pixels where the corresponding mask word is set.
    */
#if LINEAR_QUAD_LAYOUT
   /*
    * Quad layout:
    *  +--+--+--+--+
    *  |p0|p1|p2|p3|...
    *  +--+--+--+--+
    */
   if (spu_extract(mask, 0))
      colorTile->ui[y][x*2] = fragc0;
   if (spu_extract(mask, 1))
      colorTile->ui[y][x*2+1] = fragc1;
   if (spu_extract(mask, 2))
      colorTile->ui[y][x*2+2] = fragc2;
   if (spu_extract(mask, 3))
      colorTile->ui[y][x*2+3] = fragc3;
#else
   /*
    * Quad layout:
    *  +--+--+
    *  |p0|p1|...
    *  +--+--+
    *  |p2|p3|...
    *  +--+--+
    */
   if (spu_extract(mask, 0))
      colorTile->ui[y+0][x+0] = fragc0;
   if (spu_extract(mask, 1))
      colorTile->ui[y+0][x+1] = fragc1;
   if (spu_extract(mask, 2))
      colorTile->ui[y+1][x+0] = fragc2;
   if (spu_extract(mask, 3))
      colorTile->ui[y+1][x+1] = fragc3;
#endif
}