/*
 * (C) Copyright IBM Corporation 2008
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file
 * Generate code to perform all per-fragment operations.
 *
 * Code generated by these functions performs alpha, depth, and stencil
 * testing as well as alpha blending.
 *
 * \note
 * Occlusion query is not supported, but this is the right place to add that
 * support.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

#include "pipe/p_defines.h"
#include "pipe/p_state.h"

#include "cell_context.h"

#include "rtasm/rtasm_ppc_spe.h"


/**
 * Generate code to perform alpha testing.
 *
 * The code generated by this function uses the register specified by
 * \c mask as both an input and an output.
 *
 * \param dsa    Current alpha-test state
 * \param f      Function to which code should be appended
 * \param mask   Index of register containing active fragment mask
 * \param alphas Index of register containing per-fragment alpha values
 *
 * \note Emits a maximum of 6 instructions.
 */
static void
emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa,
                struct spe_function *f, int mask, int alphas)
{
   /* If the alpha function is either NEVER or ALWAYS, there is no need to
    * load the reference value into a register.  ALWAYS is a fairly common
    * case, and this optimization saves 2 instructions.
    */
   if (dsa->alpha.enabled
       && (dsa->alpha.func != PIPE_FUNC_NEVER)
       && (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
      int ref = spe_allocate_available_register(f);
      int tmp_a = spe_allocate_available_register(f);
      int tmp_b = spe_allocate_available_register(f);
      union {
         float f;
         unsigned u;
      } ref_val;
      boolean complement = FALSE;

      ref_val.f = dsa->alpha.ref;

      /* Assemble the 32-bit float pattern from two 16-bit immediates:
       * "ilhu" writes the upper halfword of every word (zeroing the lower
       * one) and "iohl" ORs in the lower halfword.  "ilh" must not be used
       * here because it replicates its immediate into every halfword.
       */
      spe_ilhu(f, ref, ref_val.u >> 16);
      spe_iohl(f, ref, ref_val.u & 0x0000ffff);

      /* The alpha function reads "alpha FUNC ref", so the fragment alpha is
       * always the left-hand operand of the generated compare.  Functions
       * without a direct SPE compare are produced as the complement of the
       * opposite function.
       */
      switch (dsa->alpha.func) {
      case PIPE_FUNC_NOTEQUAL:
         complement = TRUE;
         /* FALLTHROUGH */

      case PIPE_FUNC_EQUAL:
         spe_fceq(f, tmp_a, alphas, ref);
         break;

      case PIPE_FUNC_LEQUAL:
         complement = TRUE;
         /* FALLTHROUGH */

      case PIPE_FUNC_GREATER:
         /* tmp_a = (alpha > ref) */
         spe_fcgt(f, tmp_a, alphas, ref);
         break;

      case PIPE_FUNC_LESS:
         complement = TRUE;
         /* FALLTHROUGH */

      case PIPE_FUNC_GEQUAL:
         /* tmp_a = (alpha > ref) | (alpha == ref) */
         spe_fcgt(f, tmp_a, alphas, ref);
         spe_fceq(f, tmp_b, alphas, ref);
         spe_or(f, tmp_a, tmp_b, tmp_a);
         break;

      case PIPE_FUNC_ALWAYS:
      case PIPE_FUNC_NEVER:
      default:
         /* NEVER / ALWAYS were filtered out by the enclosing condition. */
         assert(0);
         break;
      }

      /* Drop the fragments that fail the test from the active mask. */
      if (complement) {
         spe_andc(f, mask, mask, tmp_a);
      } else {
         spe_and(f, mask, mask, tmp_a);
      }

      spe_release_register(f, ref);
      spe_release_register(f, tmp_a);
      spe_release_register(f, tmp_b);
   } else if (dsa->alpha.enabled && (dsa->alpha.func == PIPE_FUNC_NEVER)) {
      /* Nothing ever passes: clear the whole fragment mask. */
      spe_il(f, mask, 0);
   }
}


/**
 * Generate code comparing per-fragment depth values against the values read
 * from the depth buffer.
 *
 * \param dsa        Current depth-test state
 * \param f          Function to which code should be appended
 * \param mask       Index of register to contain depth-pass mask
 * \param stored     Index of register containing values from depth buffer
 * \param calculated Index of register containing per-fragment depth values
 *
 * \return
 * \c FALSE when the generated code leaves the depth-pass mask itself in
 * \c mask, \c TRUE when it leaves the complement of the depth-pass mask
 * there.
 *
 * \note Emits a maximum of 3 instructions.
 */
static boolean
emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa,
                struct spe_function *f, int mask, int stored, int calculated)
{
   /* A disabled depth test is handled exactly like ALWAYS. */
   const unsigned func = (dsa->depth.enabled)
       ? dsa->depth.func : PIPE_FUNC_ALWAYS;
   const int eq_reg = spe_allocate_available_register(f);

   /* NOTEQUAL, LEQUAL and LESS are generated as EQUAL, GREATER and GEQUAL
    * respectively; the caller is told to invert the resulting mask.
    */
   const boolean inverted = (func == PIPE_FUNC_NOTEQUAL)
       || (func == PIPE_FUNC_LEQUAL)
       || (func == PIPE_FUNC_LESS);

   switch (func) {
   case PIPE_FUNC_NEVER:
      spe_il(f, mask, 0);
      break;

   case PIPE_FUNC_ALWAYS:
      spe_il(f, mask, ~0);
      break;

   case PIPE_FUNC_EQUAL:
   case PIPE_FUNC_NOTEQUAL:
      spe_ceq(f, mask, calculated, stored);
      break;

   case PIPE_FUNC_GREATER:
   case PIPE_FUNC_LEQUAL:
      spe_clgt(f, mask, calculated, stored);
      break;

   case PIPE_FUNC_GEQUAL:
   case PIPE_FUNC_LESS:
      /* (calculated > stored) | (calculated == stored) */
      spe_clgt(f, mask, calculated, stored);
      spe_ceq(f, eq_reg, calculated, stored);
      spe_or(f, mask, mask, eq_reg);
      break;

   default:
      assert(0);
      break;
   }

   spe_release_register(f, eq_reg);
   return inverted;
}


/**
 * Generate code applying one stencil operation to a subset of fragments.
 *
 * The new stencil value is computed for every fragment and then merged with
 * the incoming value: fragments whose bits are set in \c mask receive the
 * new value, all others keep the value from \c in.
 *
 * \param f    Function to which code should be appended
 * \param out  Index of register to receive the resulting stencil values
 * \param in   Index of register containing the incoming stencil values
 * \param mask Index of register containing the mask of fragments to update
 * \param op   Stencil operation, one of \c PIPE_STENCIL_OP_*
 * \param ref  Stencil reference value (used by \c PIPE_STENCIL_OP_REPLACE)
 *
 * \note Emits a maximum of 5 instructions.
 *
 * \warning
 * Since \c out and \c in might be the same register, this routine cannot
 * generate code that uses \c out as a temporary.
 */
static void
emit_stencil_op(struct spe_function *f,
                int out, int in, int mask, unsigned op, unsigned ref)
{
   const int clamp = spe_allocate_available_register(f);
   const int clamp_mask = spe_allocate_available_register(f);
   const int result = spe_allocate_available_register(f);

   switch(op) {
   case PIPE_STENCIL_OP_ZERO:
      spe_il(f, result, 0);
      break;
   case PIPE_STENCIL_OP_REPLACE:
      spe_il(f, result, ref);
      break;
   case PIPE_STENCIL_OP_INCR:
      /* Saturating increment: anything above 0xff is replaced by 0xff. */
      spe_il(f, clamp, 0x0ff);
      spe_ai(f, result, in, 1);
      spe_clgti(f, clamp_mask, result, 0x0ff);
      spe_selb(f, result, result, clamp, clamp_mask);
      break;
   case PIPE_STENCIL_OP_DECR:
      spe_il(f, clamp, 0);
      spe_ai(f, result, in, -1);

      /* If "(s-1) < 0" in signed arithmetic, then "(s-1) > MAX" in unsigned
       * arithmetic.
       */
      spe_clgti(f, clamp_mask, result, 0x0ff);
      spe_selb(f, result, result, clamp, clamp_mask);
      break;
   case PIPE_STENCIL_OP_INCR_WRAP:
      spe_ai(f, result, in, 1);
      break;
   case PIPE_STENCIL_OP_DECR_WRAP:
      spe_ai(f, result, in, -1);
      break;
   case PIPE_STENCIL_OP_INVERT:
      /* nor(x, x) == ~x */
      spe_nor(f, result, in, in);
      break;
   case PIPE_STENCIL_OP_KEEP:
   default:
      /* Callers are expected to skip KEEP, and unknown operations are a
       * programming error.  In builds without assertions fall back to
       * "keep" (result = in + 0) instead of zeroing the stencil value or
       * selecting from an unwritten register.
       */
      assert(0);
      spe_ai(f, result, in, 0);
      break;
   }

   /* out = mask ? result : in */
   spe_selb(f, out, in, result, mask);

   spe_release_register(f, result);
   spe_release_register(f, clamp_mask);
   spe_release_register(f, clamp);
}


/**
 * Generate code for the stencil test and the stencil updates of one face.
 *
 * \param dsa        Depth / stencil test state
 * \param face       0 for front face, 1 for back face
 * \param f          Function to append instructions to
 * \param mask       Register containing mask of fragments passing the
 *                   alpha test
 * \param depth_mask Register containing mask of fragments passing the
 *                   depth test
 * \param depth_complement  Is \c depth_mask the complement of the actual
 *                   mask?
 * \param stencil    Register containing values from stencil buffer
 * \param depth_pass Register to store mask of fragments passing stencil test
 *                   and depth test
 *
 * \return
 * Index of the register holding the stencil values after the update.  This
 * is \c stencil itself when no update code was generated; otherwise it is a
 * newly allocated register that the caller has to release.
 *
 * \note
 * Emits a maximum of 10 + (3 * 5) = 25 instructions.
 */
static int
emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
                  unsigned face,
                  struct spe_function *f,
                  int mask,
                  int depth_mask,
                  boolean depth_complement,
                  int stencil,
                  int depth_pass)
{
   int stencil_fail = spe_allocate_available_register(f);
   int depth_fail = spe_allocate_available_register(f);
   int stencil_mask = spe_allocate_available_register(f);
   int stencil_pass = spe_allocate_available_register(f);
   int face_stencil = spe_allocate_available_register(f);
   int stencil_src = stencil;
   const unsigned ref = (dsa->stencil[face].ref_value
                         & dsa->stencil[face].value_mask);
   boolean complement = FALSE;
   int stored;
   int tmp = spe_allocate_available_register(f);


   /* Apply the value mask to the stored stencil values, unless the compare
    * does not look at them (NEVER / ALWAYS) or the mask keeps all 8 bits.
    */
   if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
       && (dsa->stencil[face].func != PIPE_FUNC_ALWAYS)
       && (dsa->stencil[face].value_mask != 0x0ff)) {
      stored = spe_allocate_available_register(f);
      spe_andi(f, stored, stencil, dsa->stencil[face].value_mask);
   } else {
      stored = stencil;
   }


   /* The stencil function reads "ref FUNC stored", but the immediate
    * compares take the register on the left: they yield "stored > ref" and
    * "stored == ref".  The ordered functions are therefore built as:
    *
    *   LESS     ref <  stored  ==   (stored > ref)
    *   GEQUAL   ref >= stored  ==  !(stored > ref)
    *   LEQUAL   ref <= stored  ==   (stored > ref) | (stored == ref)
    *   GREATER  ref >  stored  ==  !((stored > ref) | (stored == ref))
    */
   switch (dsa->stencil[face].func) {
   case PIPE_FUNC_NEVER:
      spe_il(f, stencil_mask, 0);
      break;

   case PIPE_FUNC_NOTEQUAL:
      complement = TRUE;
      /* FALLTHROUGH */
   case PIPE_FUNC_EQUAL:
      spe_ceqi(f, stencil_mask, stored, ref);
      break;

   case PIPE_FUNC_GEQUAL:
      complement = TRUE;
      /* FALLTHROUGH */
   case PIPE_FUNC_LESS:
      spe_clgti(f, stencil_mask, stored, ref);
      break;

   case PIPE_FUNC_GREATER:
      complement = TRUE;
      /* FALLTHROUGH */
   case PIPE_FUNC_LEQUAL:
      spe_clgti(f, stencil_mask, stored, ref);
      spe_ceqi(f, tmp, stored, ref);
      spe_or(f, stencil_mask, stencil_mask, tmp);
      break;

   case PIPE_FUNC_ALWAYS:
      /* See comment below. */
      break;

   default:
      assert(0);
      break;
   }

   if (stored != stencil) {
      spe_release_register(f, stored);
   }
   spe_release_register(f, tmp);


   /* ALWAYS is a very common stencil-test, so some effort is applied to
    * optimize that case.  The stencil-pass mask is the same as the input
    * fragment mask.  This makes the stencil-test (above) a no-op, and the
    * input fragment mask can be "renamed" the stencil-pass mask.
    */
   if (dsa->stencil[face].func == PIPE_FUNC_ALWAYS) {
      spe_release_register(f, stencil_pass);
      stencil_pass = mask;
   } else {
      if (complement) {
         spe_andc(f, stencil_pass, mask, stencil_mask);
      } else {
         spe_and(f, stencil_pass, mask, stencil_mask);
      }
   }

   /* depth_pass = fragments passing both the stencil and the depth test */
   if (depth_complement) {
      spe_andc(f, depth_pass, stencil_pass, depth_mask);
   } else {
      spe_and(f, depth_pass, stencil_pass, depth_mask);
   }


   /* Conditionally emit code to update the stencil value under various
    * conditions.  Note that there is no need to generate code under the
    * following circumstances:
    *
    * - Stencil write mask is zero.
    * - For stencil-fail if the stencil test is ALWAYS
    * - For depth-fail if the stencil test is NEVER
    * - For depth-pass if the stencil test is NEVER
    * - Any of the 3 conditions if the operation is KEEP
    */
   if (dsa->stencil[face].write_mask != 0) {
      if ((dsa->stencil[face].func != PIPE_FUNC_ALWAYS)
          && (dsa->stencil[face].fail_op != PIPE_STENCIL_OP_KEEP)) {
         /* stencil_fail = live fragments that failed the stencil test */
         if (complement) {
            spe_and(f, stencil_fail, mask, stencil_mask);
         } else {
            spe_andc(f, stencil_fail, mask, stencil_mask);
         }

         emit_stencil_op(f, face_stencil, stencil_src, stencil_fail,
                         dsa->stencil[face].fail_op,
                         dsa->stencil[face].ref_value);

         stencil_src = face_stencil;
      }

      if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
          && (dsa->stencil[face].zfail_op != PIPE_STENCIL_OP_KEEP)) {
         /* depth_fail = passed the stencil test, failed the depth test */
         if (depth_complement) {
            spe_and(f, depth_fail, stencil_pass, depth_mask);
         } else {
            spe_andc(f, depth_fail, stencil_pass, depth_mask);
         }

         emit_stencil_op(f, face_stencil, stencil_src, depth_fail,
                         dsa->stencil[face].zfail_op,
                         dsa->stencil[face].ref_value);
         stencil_src = face_stencil;
      }

      if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
          && (dsa->stencil[face].zpass_op != PIPE_STENCIL_OP_KEEP)) {
         emit_stencil_op(f, face_stencil, stencil_src, depth_pass,
                         dsa->stencil[face].zpass_op,
                         dsa->stencil[face].ref_value);
         stencil_src = face_stencil;
      }
   }

   spe_release_register(f, stencil_fail);
   spe_release_register(f, depth_fail);
   spe_release_register(f, stencil_mask);
   if (stencil_pass != mask) {
      spe_release_register(f, stencil_pass);
   }

   /* If all of the stencil operations were KEEP or the stencil write mask was
    * zero, "stencil_src" will still be set to "stencil".  In this case
    * release the "face_stencil" register.  Otherwise apply the stencil write
    * mask to select bits from the calculated stencil value and the previous
    * stencil value.
    */
   if (stencil_src == stencil) {
      spe_release_register(f, face_stencil);
   } else if (dsa->stencil[face].write_mask != 0x0ff) {
      int write_mask_reg = spe_allocate_available_register(f);

      spe_il(f, write_mask_reg, dsa->stencil[face].write_mask);
      spe_selb(f, stencil_src, stencil, stencil_src, write_mask_reg);

      spe_release_register(f, write_mask_reg);
   }

   return stencil_src;
}


/**
 * Generate the SPE function performing the alpha, depth and stencil tests.
 *
 * The code is generated into \c cdsa->code from the state found in
 * \c cdsa->base.
 *
 * \param cdsa  Depth / stencil / alpha state object to generate code for
 */
void
cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)
{
   struct pipe_depth_stencil_alpha_state *const dsa = &cdsa->base;
   struct spe_function *const f = &cdsa->code;

   /* This code generates a maximum of 6 (alpha test) + 3 (depth test)
    * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions.  Round
    * up to 64 to make it a happy power-of-two.
    */
   spe_init_func(f, 4 * 64);


   /* Allocate registers for the function's input parameters.  Cleverly (and
    * clever code is usually dangerous, but I couldn't resist) the generated
    * function returns a structure.  Returned structures start with register
    * 3, and the structure fields are ordered to match up exactly with the
    * input parameters.
    */
   int mask = spe_allocate_register(f, 3);
   int depth = spe_allocate_register(f, 4);
   int stencil = spe_allocate_register(f, 5);
   int zvals = spe_allocate_register(f, 6);
   int frag_a = spe_allocate_register(f, 7);
   int facing = spe_allocate_register(f, 8);

   int depth_mask = spe_allocate_available_register(f);

   boolean depth_complement;


   emit_alpha_test(dsa, f, mask, frag_a);

   depth_complement = emit_depth_test(dsa, f, depth_mask, depth, zvals);

   if (dsa->stencil[0].enabled) {
      const int front_depth_pass = spe_allocate_available_register(f);
      int front_stencil = emit_stencil_test(dsa, 0, f, mask,
                                            depth_mask, depth_complement,
                                            stencil, front_depth_pass);

      if (dsa->stencil[1].enabled) {
         const int back_depth_pass = spe_allocate_available_register(f);
         int back_stencil = emit_stencil_test(dsa, 1, f, mask,
                                              depth_mask,  depth_complement,
                                              stencil, back_depth_pass);

         /* If the front facing stencil value and the back facing stencil
          * value are stored in the same register, there is no need to select
          * a value based on the facing.  This can happen if the stencil value
          * was not modified due to the write masks being zero, the stencil
          * operations being KEEP, etc.
          */
         if (front_stencil != back_stencil) {
            spe_selb(f, stencil, back_stencil, front_stencil, facing);
         }

         if (back_stencil != stencil) {
            spe_release_register(f, back_stencil);
         }

         if (front_stencil != stencil) {
            spe_release_register(f, front_stencil);
         }

         /* Pick the per-face pass mask according to the facing. */
         spe_selb(f, mask, back_depth_pass, front_depth_pass, facing);

         spe_release_register(f, back_depth_pass);
      } else {
         /* Only the front state is in use: copy its results ("or x, x" is a
          * register move) into the output registers.
          */
         if (front_stencil != stencil) {
            spe_or(f, stencil, front_stencil, front_stencil);
            spe_release_register(f, front_stencil);
         }
         spe_or(f, mask, front_depth_pass, front_depth_pass);
      }

      spe_release_register(f, front_depth_pass);
   } else if (dsa->depth.enabled) {
      if (depth_complement) {
         spe_andc(f, mask, mask, depth_mask);
      } else {
         spe_and(f, mask, mask, depth_mask);
      }
   }

   /* The depth buffer is only updated while the depth test is enabled; with
    * the test disabled it must stay untouched even if the write mask is set.
    */
   if (dsa->depth.enabled && dsa->depth.writemask) {
      spe_selb(f, depth, depth, zvals, mask);
   }

   spe_bi(f, 0, 0, 0);


#if 0
   {
      const uint32_t *p = f->store;
      unsigned i;

      printf("# alpha (%sabled)\n",
             (dsa->alpha.enabled) ? "en" : "dis");
      printf("#    func: %u\n", dsa->alpha.func);
      printf("#    ref: %.2f\n", dsa->alpha.ref);

      printf("# depth (%sabled)\n",
             (dsa->depth.enabled) ? "en" : "dis");
      printf("#    func: %u\n", dsa->depth.func);

      for (i = 0; i < 2; i++) {
         printf("# %s stencil (%sabled)\n",
                (i == 0) ? "front" : "back",
                (dsa->stencil[i].enabled) ? "en" : "dis");

         printf("#    func: %u\n", dsa->stencil[i].func);
         printf("#    op (sf, zf, zp): %u %u %u\n",
                dsa->stencil[i].fail_op,
                dsa->stencil[i].zfail_op,
                dsa->stencil[i].zpass_op);
         printf("#    ref value / value mask / write mask: %02x %02x %02x\n",
                dsa->stencil[i].ref_value,
                dsa->stencil[i].value_mask,
                dsa->stencil[i].write_mask);
      }

      printf("\t.text\n");
      for (/* empty */; p < f->csr; p++) {
         printf("\t.long\t0x%04x\n", *p);
      }
      fflush(stdout);
   }
#endif
}


/**
 * Generate code producing the blend factor for the alpha channel.
 *
 * \param f           Function to which code should be appended
 * \param factor      Blend factor, one of \c PIPE_BLENDFACTOR_*
 * \param src_alpha   Index of register containing the source alpha
 * \param dst_alpha   Index of register containing the destination alpha
 * \param const_alpha Index of register containing the constant alpha
 *
 * \return
 * Index of the register containing the factor, or -1 when the factor needs
 * no register (ONE, ZERO, SRC_ALPHA_SATURATE and unsupported factors).
 *
 * \note Emits a maximum of 3 instructions
 */
static int
emit_alpha_factor_calculation(struct spe_function *f,
                              unsigned factor,
                              int src_alpha, int dst_alpha, int const_alpha)
{
   switch (factor) {
   case PIPE_BLENDFACTOR_ONE:
   case PIPE_BLENDFACTOR_ZERO:
   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
      /* No register is produced for these factors. */
      return -1;

   case PIPE_BLENDFACTOR_DST_ALPHA:
      return dst_alpha;

   case PIPE_BLENDFACTOR_CONST_ALPHA:
      return const_alpha;

   case PIPE_BLENDFACTOR_SRC_ALPHA: {
      /* Hand back a copy of the source alpha ("or x, x" moves x). */
      const int copy = spe_allocate_available_register(f);

      spe_or(f, copy, src_alpha, src_alpha);
      return copy;
   }

   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: {
      const int inv = spe_allocate_available_register(f);
      const int one = spe_allocate_available_register(f);

      /* inv = 1.0 - const_alpha; 1.0 is made from the integer 1. */
      spe_il(f, one, 1);
      spe_cuflt(f, one, one, 0);
      spe_fs(f, inv, one, const_alpha);
      spe_release_register(f, one);
      return inv;
   }

   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: {
      const int one = spe_allocate_available_register(f);
      const int inv = spe_allocate_available_register(f);

      /* inv = 1.0 - src_alpha */
      spe_il(f, one, 1);
      spe_cuflt(f, one, one, 0);
      spe_fs(f, inv, one, src_alpha);
      spe_release_register(f, one);
      return inv;
   }

   case PIPE_BLENDFACTOR_INV_DST_ALPHA: {
      const int one = spe_allocate_available_register(f);
      const int inv = spe_allocate_available_register(f);

      /* inv = 1.0 - dst_alpha */
      spe_il(f, one, 1);
      spe_cuflt(f, one, one, 0);
      spe_fs(f, inv, one, dst_alpha);
      spe_release_register(f, one);
      return inv;
   }

   case PIPE_BLENDFACTOR_SRC1_ALPHA:
   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
   default:
      /* Not supported. */
      assert(0);
      return -1;
   }
}


/**
 * Generate code producing the blend factors for the color (RGB) channels.
 *
 * \param f           Function to which code should be appended
 * \param sF          Blend factor, one of \c PIPE_BLENDFACTOR_*
 * \param mask        Color write mask; bit \c i is set when channel \c i is
 *                    written
 * \param src         Indices of the registers holding the source color
 * \param dst         Indices of the registers holding the destination color
 * \param const_color Indices of the registers holding the constant color
 * \param factor      Receives, per channel, the index of the register
 *                    holding the factor, or -1 when no register was
 *                    produced (ONE, ZERO, channels outside \c mask and
 *                    unsupported factors).  Element 3 is always set to -1.
 *
 * \note Emits a maximum of 6 instructions
 */
static void
emit_color_factor_calculation(struct spe_function *f,
                              unsigned sF, unsigned mask,
                              const int *src,
                              const int *dst,
                              const int *const_color,
                              int *factor)
{
   int tmp;
   unsigned i;


   factor[0] = -1;
   factor[1] = -1;
   factor[2] = -1;
   factor[3] = -1;

   switch (sF) {
   case PIPE_BLENDFACTOR_ONE:
      break;

   case PIPE_BLENDFACTOR_SRC_COLOR:
      /* Use copies of the source color ("or x, x" moves x). */
      for (i = 0; i < 3; ++i) {
         if ((mask & (1U << i)) != 0) {
            factor[i] = spe_allocate_available_register(f);
            spe_or(f, factor[i], src[i], src[i]);
         }
      }
      break;

   case PIPE_BLENDFACTOR_SRC_ALPHA:
      factor[0] = spe_allocate_available_register(f);
      factor[1] = factor[0];
      factor[2] = factor[0];

      spe_or(f, factor[0], src[3], src[3]);
      break;

   case PIPE_BLENDFACTOR_DST_ALPHA:
      factor[0] = dst[3];
      factor[1] = dst[3];
      factor[2] = dst[3];
      break;

   case PIPE_BLENDFACTOR_DST_COLOR:
      factor[0] = dst[0];
      factor[1] = dst[1];
      factor[2] = dst[2];
      break;

   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
      tmp = spe_allocate_available_register(f);
      factor[0] = spe_allocate_available_register(f);
      factor[1] = factor[0];
      factor[2] = factor[0];

      /* Alpha saturate means min(As, 1-Ad).
       */
      spe_il(f, tmp, 1);
      spe_cuflt(f, tmp, tmp, 0);
      spe_fs(f, tmp, tmp, dst[3]);

      /* factor = ((1-Ad) > As) ? As : (1-Ad).  selb picks its second source
       * operand wherever the selection mask is set.
       */
      spe_fcgt(f, factor[0], tmp, src[3]);
      spe_selb(f, factor[0], tmp, src[3], factor[0]);

      spe_release_register(f, tmp);
      break;

   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
      /* tmp = 1.0, made from the integer 1 */
      tmp = spe_allocate_available_register(f);
      spe_il(f, tmp, 1);
      spe_cuflt(f, tmp, tmp, 0);

      for (i = 0; i < 3; i++) {
         if ((mask & (1U << i)) != 0) {
            factor[i] = spe_allocate_available_register(f);
            spe_fs(f, factor[i], tmp, const_color[i]);
         }
      }
      spe_release_register(f, tmp);
      break;

   case PIPE_BLENDFACTOR_CONST_COLOR:
      for (i = 0; i < 3; i++) {
         factor[i] = const_color[i];
      }
      break;

   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
      factor[0] = spe_allocate_available_register(f);
      factor[1] = factor[0];
      factor[2] = factor[0];

      tmp = spe_allocate_available_register(f);
      spe_il(f, tmp, 1);
      spe_cuflt(f, tmp, tmp, 0);
      spe_fs(f, factor[0], tmp, const_color[3]);
      spe_release_register(f, tmp);
      break;

   case PIPE_BLENDFACTOR_CONST_ALPHA:
      factor[0] = const_color[3];
      factor[1] = factor[0];
      factor[2] = factor[0];
      break;

   case PIPE_BLENDFACTOR_ZERO:
      break;

   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
      tmp = spe_allocate_available_register(f);

      spe_il(f, tmp, 1);
      spe_cuflt(f, tmp, tmp, 0);

      for (i = 0; i < 3; ++i) {
         if ((mask & (1U << i)) != 0) {
            factor[i] = spe_allocate_available_register(f);
            spe_fs(f, factor[i], tmp, src[i]);
         }
      }

      spe_release_register(f, tmp);
      break;

   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
      tmp = spe_allocate_available_register(f);
      factor[0] = spe_allocate_available_register(f);
      factor[1] = factor[0];
      factor[2] = factor[0];

      spe_il(f, tmp, 1);
      spe_cuflt(f, tmp, tmp, 0);
      spe_fs(f, factor[0], tmp, src[3]);

      spe_release_register(f, tmp);
      break;

   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
      tmp = spe_allocate_available_register(f);
      factor[0] = spe_allocate_available_register(f);
      factor[1] = factor[0];
      factor[2] = factor[0];

      spe_il(f, tmp, 1);
      spe_cuflt(f, tmp, tmp, 0);
      spe_fs(f, factor[0], tmp, dst[3]);

      spe_release_register(f, tmp);
      break;

   case PIPE_BLENDFACTOR_INV_DST_COLOR:
      tmp = spe_allocate_available_register(f);

      spe_il(f, tmp, 1);
      spe_cuflt(f, tmp, tmp, 0);

      for (i = 0; i < 3; ++i) {
         if ((mask & (1U << i)) != 0) {
            factor[i] = spe_allocate_available_register(f);
            spe_fs(f, factor[i], tmp, dst[i]);
         }
      }

      spe_release_register(f, tmp);
      break;

   case PIPE_BLENDFACTOR_SRC1_COLOR:
   case PIPE_BLENDFACTOR_SRC1_ALPHA:
   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
   default:
      /* Not supported. */
      assert(0);
   }
}


static void
emit_blend_calculation(struct spe_function *f,
                       unsigned func, unsigned sF, unsigned dF,
                       int src, int src_factor, int dst, int dst_factor)
{
   int tmp = spe_allocate_available_register(f);

   switch (func) {
   case PIPE_BLEND_ADD:
      if (sF == PIPE_BLENDFACTOR_ONE) {
         if (dF == PIPE_BLENDFACTOR_ZERO) {
            /* Do nothing. */
         } else if (dF == PIPE_BLENDFACTOR_ONE) {
            spe_fa(f, src, src, dst);
         }
      } else if (sF == PIPE_BLENDFACTOR_ZERO) {
         if (dF == PIPE_BLENDFACTOR_ZERO) {
            spe_il(f, src, 0);
         } else if (dF == PIPE_BLENDFACTOR_ONE) {
            spe_or(f, src, dst, dst);
         } else {
            spe_fm(f, src, dst, dst_factor);
         }
      } else if (dF == PIPE_BLENDFACTOR_ZERO) {
         spe_fm(f, src, src, src_factor);
      } else {
         spe_fm(f, tmp, dst, dst_factor);
         spe_fma(f, src, src, src_factor, tmp);
      }
      break;

   case PIPE_BLEND_SUBTRACT:
      if (sF == PIPE_BLENDFACTOR_ONE) {
         if (dF == PIPE_BLENDFACTOR_ZERO) {
            /* Do nothing. */
         } else if (dF == PIPE_BLENDFACTOR_ONE) {
            spe_fs(f, src, src, dst);
         }
      } else if (sF == PIPE_BLENDFACTOR_ZERO) {
         if (dF == PIPE_BLENDFACTOR_ZERO) {
            spe_il(f, src, 0);
         } else if (dF == PIPE_BLENDFACTOR_ONE) {
            spe_il(f, tmp, 0);
            spe_fs(f, src, tmp, dst);
         } else {
            spe_fm(f, src, dst, dst_factor);
         }
      } else if (dF == PIPE_BLENDFACTOR_ZERO) {
         spe_fm(f, src, src, src_factor);
      } else {
         spe_fm(f, tmp, dst, dst_factor);
         spe_fms(f, src, src, src_factor, tmp);
      }
      break;

   case PIPE_BLEND_REVERSE_SUBTRACT:
      if (sF == PIPE_BLENDFACTOR_ONE) {
         if (dF == PIPE_BLENDFACTOR_ZERO) {
            spe_il(f, tmp, 0);
            spe_fs(f, src, tmp, src);
         } else if (dF == PIPE_BLENDFACTOR_ONE) {
            spe_fs(f, src, dst, src);
         }
      } else if (sF == PIPE_BLENDFACTOR_ZERO) {
         if (dF == PIPE_BLENDFACTOR_ZERO) {
            spe_il(f, src, 0);
         } else if (dF == PIPE_BLENDFACTOR_ONE) {
            spe_or(f, src, dst, dst);
         } else {
            spe_fm(f, src, dst, dst_factor);
         }
      } else if (dF == PIPE_BLENDFACTOR_ZERO) {
         spe_fm(f, src, src, src_factor);
      } else {
         spe_fm(f, tmp, src, src_factor);
         spe_fms(f, src, src, dst_factor, tmp);
      }
      break;

   case PIPE_BLEND_MIN:
      spe_cgt(f, tmp, src, dst);
      spe_selb(f, src, src, dst, tmp);
      break;

   case PIPE_BLEND_MAX:
      spe_cgt(f, tmp, src, dst);
      spe_selb(f, src, dst, src, tmp);
      break;

   default:
      assert(0);
   }

   spe_release_register(f, tmp);
}


/**
 * Generate code to perform alpha blending on the SPE
 */
void
cell_generate_alpha_blend(struct cell_blend_state *cb)
{
   struct pipe_blend_state *const b = &cb->base;
   struct spe_function *const f = &cb->code;

   /* This code generates a maximum of 3 (source alpha factor)
    * + 3 (destination alpha factor) + (3 * 6) (source color factor)
    * + (3 * 6) (destination color factor) + (4 * 2) (blend equation)
    * + 4 (fragment mask) + 1 (return) = 55 instlructions.  Round up to 64 to
    * make it a happy power-of-two.
    */
   spe_init_func(f, 4 * 64);


   const int frag[4] = {
      spe_allocate_register(f, 3),
      spe_allocate_register(f, 4),
      spe_allocate_register(f, 5),
      spe_allocate_register(f, 6),
   };
   const int pixel[4] = {
      spe_allocate_register(f, 7),
      spe_allocate_register(f, 8),
      spe_allocate_register(f, 9),
      spe_allocate_register(f, 10),
   };
   const int const_color[4] = {
      spe_allocate_register(f, 11),
      spe_allocate_register(f, 12),
      spe_allocate_register(f, 13),
      spe_allocate_register(f, 14),
   };
   unsigned func[4];
   unsigned sF[4];
   unsigned dF[4];
   unsigned i;
   int src_factor[4];
   int dst_factor[4];


   /* Does the selected blend mode make use of the source / destination
    * color (RGB) blend factors?
    */
   boolean need_color_factor = b->blend_enable
       && (b->rgb_func != PIPE_BLEND_MIN)
       && (b->rgb_func != PIPE_BLEND_MAX);

   /* Does the selected blend mode make use of the source / destination
    * alpha blend factors?
    */
   boolean need_alpha_factor = b->blend_enable
       && (b->alpha_func != PIPE_BLEND_MIN)
       && (b->alpha_func != PIPE_BLEND_MAX);


   if (b->blend_enable) {
      sF[0] = b->rgb_src_factor;
      sF[1] = sF[0];
      sF[2] = sF[0];
      switch (b->alpha_src_factor & 0x0f) {
      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
         sF[3] = PIPE_BLENDFACTOR_ONE;
         break;
      case PIPE_BLENDFACTOR_SRC_COLOR:
      case PIPE_BLENDFACTOR_DST_COLOR:
      case PIPE_BLENDFACTOR_CONST_COLOR:
      case PIPE_BLENDFACTOR_SRC1_COLOR:
         sF[3] = b->alpha_src_factor + 1;
         break;
      default:
         sF[3] = b->alpha_src_factor;
      }

      dF[0] = b->rgb_dst_factor;
      dF[1] = dF[0];
      dF[2] = dF[0];
      switch (b->alpha_dst_factor & 0x0f) {
      case PIPE_BLENDFACTOR_SRC_COLOR:
      case PIPE_BLENDFACTOR_DST_COLOR:
      case PIPE_BLENDFACTOR_CONST_COLOR:
      case PIPE_BLENDFACTOR_SRC1_COLOR:
         dF[3] = b->alpha_dst_factor + 1;
         break;
      default:
         dF[3] = b->alpha_dst_factor;
      }

      func[0] = b->rgb_func;
      func[1] = func[0];
      func[2] = func[0];
      func[3] = b->alpha_func;
   } else {
      sF[0] = PIPE_BLENDFACTOR_ONE;
      sF[1] = PIPE_BLENDFACTOR_ONE;
      sF[2] = PIPE_BLENDFACTOR_ONE;
      sF[3] = PIPE_BLENDFACTOR_ONE;
      dF[0] = PIPE_BLENDFACTOR_ZERO;
      dF[1] = PIPE_BLENDFACTOR_ZERO;
      dF[2] = PIPE_BLENDFACTOR_ZERO;
      dF[3] = PIPE_BLENDFACTOR_ZERO;

      func[0] = PIPE_BLEND_ADD;
      func[1] = PIPE_BLEND_ADD;
      func[2] = PIPE_BLEND_ADD;
      func[3] = PIPE_BLEND_ADD;
   }


   /* If alpha writing is enabled and the alpha blend mode requires use of
    * the alpha factor, calculate the alpha factor.
    */
   if (((b->colormask & 8) != 0) && need_alpha_factor) {
      src_factor[3] = emit_alpha_factor_calculation(f, sF[3], const_color[3],
                                                    frag[3], pixel[3]);

      /* If the alpha destination blend factor is the same as the alpha source
       * blend factor, re-use the previously calculated value.
       */
      dst_factor[3] = (dF[3] == sF[3])
          ? src_factor[3]
          : emit_alpha_factor_calculation(f, dF[3], const_color[3],
                                          frag[3], pixel[3]);
   }


   if (sF[0] == sF[3]) {
      src_factor[0] = src_factor[3];
      src_factor[1] = src_factor[3];
      src_factor[2] = src_factor[3];
   } else if (sF[0] == dF[3]) {
      src_factor[0] = dst_factor[3];
      src_factor[1] = dst_factor[3];
      src_factor[2] = dst_factor[3];
   } else if (need_color_factor) {
      emit_color_factor_calculation(f,
                                    b->rgb_src_factor,
                                    b->colormask,
                                    frag, pixel, const_color, src_factor);
   }


   if (dF[0] == sF[3]) {
      dst_factor[0] = src_factor[3];
      dst_factor[1] = src_factor[3];
      dst_factor[2] = src_factor[3];
   } else if (dF[0] == dF[3]) {
      dst_factor[0] = dst_factor[3];
      dst_factor[1] = dst_factor[3];
      dst_factor[2] = dst_factor[3];
   } else if (dF[0] == sF[0]) {
      dst_factor[0] = src_factor[0];
      dst_factor[1] = src_factor[1];
      dst_factor[2] = src_factor[2];
   } else if (need_color_factor) {
      emit_color_factor_calculation(f,
                                    b->rgb_dst_factor,
                                    b->colormask,
                                    frag, pixel, const_color, dst_factor);
   }



   for (i = 0; i < 4; ++i) {
      if ((b->colormask & (1U << i)) != 0) {
         emit_blend_calculation(f,
                                func[i], sF[i], dF[i],
                                frag[i], src_factor[i],
                                pixel[i], dst_factor[i]);
      }
   }

   spe_bi(f, 0, 0, 0);

#if 0
   {
      const uint32_t *p = f->store;

      printf("# %u instructions\n", f->csr - f->store);
      printf("# blend (%sabled)\n",
             (cb->base.blend_enable) ? "en" : "dis");
      printf("#    RGB func / sf / df: %u %u %u\n",
             cb->base.rgb_func,
             cb->base.rgb_src_factor,
             cb->base.rgb_dst_factor);
      printf("#    ALP func / sf / df: %u %u %u\n",
             cb->base.alpha_func,
             cb->base.alpha_src_factor,
             cb->base.alpha_dst_factor);

      printf("\t.text\n");
      for (/* empty */; p < f->csr; p++) {
         printf("\t.long\t0x%04x\n", *p);
      }
      fflush(stdout);
   }
#endif
}


/**
 * Compute a word offset from the function's current position to a data item.
 *
 * The address \c d is first rounded down to a 16-byte boundary; the result
 * is the signed distance from \c f->csr to that boundary, expressed in
 * 4-byte units.  Within this file it supplies the offset operand of
 * \c spe_lqr.
 *
 * \param f  Function whose \c csr pointer is the reference position
 * \param d  Address of the data to be referenced
 *
 * \return Signed offset, in 4-byte units, from \c f->csr to the 16-byte
 *         aligned address at or below \c d.
 */
int PC_OFFSET(const struct spe_function *f, const void *d)
{
   /* Drop the low four address bits: 16-byte boundary at or below d. */
   const intptr_t aligned_addr = (intptr_t) d & ~0x0f;
   const intptr_t current_addr = (intptr_t) f->csr;
   const intptr_t byte_delta = aligned_addr - current_addr;

   /* Convert the byte distance to 4-byte units. */
   return byte_delta >> 2;
}


/**
 * Generate code to perform color conversion and logic op
 *
 * \bug
 * The code generated by this function should also perform dithering.
 *
 * \bug
 * The code generated by this function should also perform color-write
 * masking.
 *
 * \bug
 * Only two framebuffer formats are supported at this time.
 */
void
cell_generate_logic_op(struct spe_function *f,
                       const struct pipe_blend_state *blend,
                       struct pipe_surface *surf)
{
   const unsigned logic_op = (blend->logicop_enable)
       ? blend->logicop_func : PIPE_LOGICOP_COPY;

   /* This code generates a maximum of 37 instructions.  An additional 32
    * bytes (equiv. to 8 instructions) are needed for data storage.  Round up
    * to 64 to make it a happy power-of-two.
    */
   spe_init_func(f, 4 * 64);


   /* Pixel colors in framebuffer format in AoS layout.
    */
   const int pixel[4] = {
      spe_allocate_register(f, 3),
      spe_allocate_register(f, 4),
      spe_allocate_register(f, 5),
      spe_allocate_register(f, 6),
   };

   /* Fragment colors stored as floats in SoA layout.
    */
   const int frag[4] = {
      spe_allocate_register(f, 7),
      spe_allocate_register(f, 8),
      spe_allocate_register(f, 9),
      spe_allocate_register(f, 10),
   };

   const int mask = spe_allocate_register(f, 11);


   /* Short-circuit the noop and invert cases.
    */
   if ((logic_op == PIPE_LOGICOP_NOOP) || (blend->colormask == 0)) {
      spe_bi(f, 0, 0, 0);
      return;
   } else if (logic_op == PIPE_LOGICOP_INVERT) {
      spe_nor(f, pixel[0], pixel[0], pixel[0]);
      spe_nor(f, pixel[1], pixel[1], pixel[1]);
      spe_nor(f, pixel[2], pixel[2], pixel[2]);
      spe_nor(f, pixel[3], pixel[3], pixel[3]);
      spe_bi(f, 0, 0, 0);
      return;
   }


   const int tmp[4] = {
      spe_allocate_available_register(f),
      spe_allocate_available_register(f),
      spe_allocate_available_register(f),
      spe_allocate_available_register(f),
   };

   const int shuf_xpose_hi = spe_allocate_available_register(f);
   const int shuf_xpose_lo = spe_allocate_available_register(f);
   const int shuf_color = spe_allocate_available_register(f);


   /* Pointer to the begining of the function's private data area.
    */
   uint32_t *const data = ((uint32_t *) f->store) + (64 - 8);


   /* Convert fragment colors to framebuffer format in AoS layout.
    */
   switch (surf->format) {
   case PIPE_FORMAT_A8R8G8B8_UNORM:
      data[0] = 0x00010203;
      data[1] = 0x10111213;
      data[2] = 0x04050607;
      data[3] = 0x14151617;
      data[4] = 0x0c000408;
      data[5] = 0x80808080;
      data[6] = 0x80808080;
      data[7] = 0x80808080;
      break;
   case PIPE_FORMAT_B8G8R8A8_UNORM:
      data[0] = 0x03020100;
      data[1] = 0x13121110;
      data[2] = 0x07060504;
      data[3] = 0x17161514;
      data[4] = 0x0804000c;
      data[5] = 0x80808080;
      data[6] = 0x80808080;
      data[7] = 0x80808080;
      break;
   default:
      fprintf(stderr, "CELL: Bad pixel format in cell_generate_logic_op()");
      ASSERT(0);
   }

   spe_ilh(f, tmp[0], 0x0808);
   spe_lqr(f, shuf_xpose_hi, PC_OFFSET(f, data+0));
   spe_lqr(f, shuf_color, PC_OFFSET(f, data+4));
   spe_a(f, shuf_xpose_lo, shuf_xpose_hi, tmp[0]);

   spe_shufb(f, tmp[0], frag[0], frag[2], shuf_xpose_hi);
   spe_shufb(f, tmp[1], frag[0], frag[2], shuf_xpose_lo);
   spe_shufb(f, tmp[2], frag[1], frag[3], shuf_xpose_hi);
   spe_shufb(f, tmp[3], frag[1], frag[3], shuf_xpose_lo);

   spe_shufb(f, frag[0], tmp[0], tmp[2], shuf_xpose_hi);
   spe_shufb(f, frag[1], tmp[0], tmp[2], shuf_xpose_lo);
   spe_shufb(f, frag[2], tmp[1], tmp[3], shuf_xpose_hi);
   spe_shufb(f, frag[3], tmp[1], tmp[3], shuf_xpose_lo);

   spe_cfltu(f, frag[0], frag[0], 32);
   spe_cfltu(f, frag[1], frag[1], 32);
   spe_cfltu(f, frag[2], frag[2], 32);
   spe_cfltu(f, frag[3], frag[3], 32);

   spe_shufb(f, frag[0], frag[0], pixel[0], shuf_color);
   spe_shufb(f, frag[1], frag[1], pixel[1], shuf_color);
   spe_shufb(f, frag[2], frag[2], pixel[2], shuf_color);
   spe_shufb(f, frag[3], frag[3], pixel[3], shuf_color);


   /* If logic op is enabled, perform the requested logical operation on the
    * converted fragment colors and the pixel colors.
    */
   switch (logic_op) {
   case PIPE_LOGICOP_CLEAR:
      spe_il(f, frag[0], 0);
      spe_il(f, frag[1], 0);
      spe_il(f, frag[2], 0);
      spe_il(f, frag[3], 0);
      break;
   case PIPE_LOGICOP_NOR:
      spe_nor(f, frag[0], frag[0], pixel[0]);
      spe_nor(f, frag[1], frag[1], pixel[1]);
      spe_nor(f, frag[2], frag[2], pixel[2]);
      spe_nor(f, frag[3], frag[3], pixel[3]);
      break;
   case PIPE_LOGICOP_AND_INVERTED:
      spe_andc(f, frag[0], pixel[0], frag[0]);
      spe_andc(f, frag[1], pixel[1], frag[1]);
      spe_andc(f, frag[2], pixel[2], frag[2]);
      spe_andc(f, frag[3], pixel[3], frag[3]);
      break;
   case PIPE_LOGICOP_COPY_INVERTED:
      spe_nor(f, frag[0], frag[0], frag[0]);
      spe_nor(f, frag[1], frag[1], frag[1]);
      spe_nor(f, frag[2], frag[2], frag[2]);
      spe_nor(f, frag[3], frag[3], frag[3]);
      break;
   case PIPE_LOGICOP_AND_REVERSE:
      spe_andc(f, frag[0], frag[0], pixel[0]);
      spe_andc(f, frag[1], frag[1], pixel[1]);
      spe_andc(f, frag[2], frag[2], pixel[2]);
      spe_andc(f, frag[3], frag[3], pixel[3]);
      break;
   case PIPE_LOGICOP_XOR:
      spe_xor(f, frag[0], frag[0], pixel[0]);
      spe_xor(f, frag[1], frag[1], pixel[1]);
      spe_xor(f, frag[2], frag[2], pixel[2]);
      spe_xor(f, frag[3], frag[3], pixel[3]);
      break;
   case PIPE_LOGICOP_NAND:
      spe_nand(f, frag[0], frag[0], pixel[0]);
      spe_nand(f, frag[1], frag[1], pixel[1]);
      spe_nand(f, frag[2], frag[2], pixel[2]);
      spe_nand(f, frag[3], frag[3], pixel[3]);
      break;
   case PIPE_LOGICOP_AND:
      spe_and(f, frag[0], frag[0], pixel[0]);
      spe_and(f, frag[1], frag[1], pixel[1]);
      spe_and(f, frag[2], frag[2], pixel[2]);
      spe_and(f, frag[3], frag[3], pixel[3]);
      break;
   case PIPE_LOGICOP_EQUIV:
      spe_eqv(f, frag[0], frag[0], pixel[0]);
      spe_eqv(f, frag[1], frag[1], pixel[1]);
      spe_eqv(f, frag[2], frag[2], pixel[2]);
      spe_eqv(f, frag[3], frag[3], pixel[3]);
      break;
   case PIPE_LOGICOP_OR_INVERTED:
      spe_orc(f, frag[0], pixel[0], frag[0]);
      spe_orc(f, frag[1], pixel[1], frag[1]);
      spe_orc(f, frag[2], pixel[2], frag[2]);
      spe_orc(f, frag[3], pixel[3], frag[3]);
      break;
   case PIPE_LOGICOP_COPY:
      break;
   case PIPE_LOGICOP_OR_REVERSE:
      spe_orc(f, frag[0], frag[0], pixel[0]);
      spe_orc(f, frag[1], frag[1], pixel[1]);
      spe_orc(f, frag[2], frag[2], pixel[2]);
      spe_orc(f, frag[3], frag[3], pixel[3]);
      break;
   case PIPE_LOGICOP_OR:
      spe_or(f, frag[0], frag[0], pixel[0]);
      spe_or(f, frag[1], frag[1], pixel[1]);
      spe_or(f, frag[2], frag[2], pixel[2]);
      spe_or(f, frag[3], frag[3], pixel[3]);
      break;
   case PIPE_LOGICOP_SET:
      spe_il(f, frag[0], ~0);
      spe_il(f, frag[1], ~0);
      spe_il(f, frag[2], ~0);
      spe_il(f, frag[3], ~0);
      break;

   /* These two cases are short-circuited above.
    */
   case PIPE_LOGICOP_INVERT:
   case PIPE_LOGICOP_NOOP:
   default:
      assert(0);
   }


   /* Apply fragment mask.
    */
   spe_ilh(f, tmp[0], 0x0000);
   spe_ilh(f, tmp[1], 0x0404);
   spe_ilh(f, tmp[2], 0x0808);
   spe_ilh(f, tmp[3], 0x0c0c);

   spe_shufb(f, tmp[0], mask, mask, tmp[0]);
   spe_shufb(f, tmp[1], mask, mask, tmp[1]);
   spe_shufb(f, tmp[2], mask, mask, tmp[2]);
   spe_shufb(f, tmp[3], mask, mask, tmp[3]);

   spe_selb(f, pixel[0], pixel[0], frag[0], tmp[0]);
   spe_selb(f, pixel[1], pixel[1], frag[1], tmp[1]);
   spe_selb(f, pixel[2], pixel[2], frag[2], tmp[2]);
   spe_selb(f, pixel[3], pixel[3], frag[3], tmp[3]);

   spe_bi(f, 0, 0, 0);

#if 0
   {
      const uint32_t *p = f->store;
      unsigned i;

      printf("# %u instructions\n", f->csr - f->store);

      printf("\t.text\n");
      for (i = 0; i < 64; i++) {
         printf("\t.long\t0x%04x\n", p[i]);
      }
      fflush(stdout);
   }
#endif
}