/* * (C) Copyright IBM Corporation 2008 * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * on the rights to use, copy, modify, merge, publish, distribute, sub * license, and/or sell copies of the Software, and to permit persons to whom * the Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include "pipe/p_defines.h" #include "pipe/p_context.h" #include "pipe/p_format.h" #include "../auxiliary/draw/draw_context.h" #include "../auxiliary/draw/draw_private.h" #include "cell_context.h" #include "rtasm/rtasm_ppc_spe.h" /** * Emit a 4x4 matrix transpose operation * * \param p Function that the transpose operation is to be appended to * \param row0 Register containing row 0 of the source matrix * \param row1 Register containing row 1 of the source matrix * \param row2 Register containing row 2 of the source matrix * \param row3 Register containing row 3 of the source matrix * \param dest_ptr Register containing the address of the destination matrix * \param shuf_ptr Register containing the address of the shuffled data * \param count Number of colums to actually be written to the destination * * \note * This function assumes that the registers named by \c row0, \c row1, * \c row2, and \c row3 are scratch and can be modified by the generated code. * Furthermore, these registers will be released, via calls to * \c release_register, by this function. * * \note * This function requires that four temporary are available on entry. */ static void emit_matrix_transpose(struct spe_function *p, unsigned row0, unsigned row1, unsigned row2, unsigned row3, unsigned dest_ptr, unsigned shuf_ptr, unsigned count) { int shuf_hi = spe_allocate_available_register(p); int shuf_lo = spe_allocate_available_register(p); int t1 = spe_allocate_available_register(p); int t2 = spe_allocate_available_register(p); int t3; int t4; int col0; int col1; int col2; int col3; spe_lqd(p, shuf_hi, shuf_ptr, 3*16); spe_lqd(p, shuf_lo, shuf_ptr, 4*16); spe_shufb(p, t1, row0, row2, shuf_hi); spe_shufb(p, t2, row0, row2, shuf_lo); /* row0 and row2 are now no longer needed. Re-use those registers as * temporaries. */ t3 = row0; t4 = row2; spe_shufb(p, t3, row1, row3, shuf_hi); spe_shufb(p, t4, row1, row3, shuf_lo); /* row1 and row3 are now no longer needed. Re-use those registers as * temporaries. */ col0 = row1; col1 = row3; spe_shufb(p, col0, t1, t3, shuf_hi); if (count > 1) { spe_shufb(p, col1, t1, t3, shuf_lo); } /* t1 and t3 are now no longer needed. Re-use those registers as * temporaries. */ col2 = t1; col3 = t3; if (count > 2) { spe_shufb(p, col2, t2, t4, shuf_hi); } if (count > 3) { spe_shufb(p, col3, t2, t4, shuf_lo); } /* Store the results. Remember that the stqd instruction is encoded using * the qword offset (stand-alone assemblers to the byte-offset to * qword-offset conversion for you), so the byte-offset needs be divided by * 16. */ switch (count) { case 4: spe_stqd(p, col3, dest_ptr, 3 * 16); case 3: spe_stqd(p, col2, dest_ptr, 2 * 16); case 2: spe_stqd(p, col1, dest_ptr, 1 * 16); case 1: spe_stqd(p, col0, dest_ptr, 0 * 16); } /* Release all of the temporary registers used. */ spe_release_register(p, col0); spe_release_register(p, col1); spe_release_register(p, col2); spe_release_register(p, col3); spe_release_register(p, shuf_hi); spe_release_register(p, shuf_lo); spe_release_register(p, t2); spe_release_register(p, t4); } #if 0 /* This appears to not be used currently */ static void emit_fetch(struct spe_function *p, unsigned in_ptr, unsigned *offset, unsigned out_ptr, unsigned shuf_ptr, enum pipe_format format) { const unsigned count = (pf_size_x(format) != 0) + (pf_size_y(format) != 0) + (pf_size_z(format) != 0) + (pf_size_w(format) != 0); const unsigned type = pf_type(format); const unsigned bytes = pf_size_x(format); int v0 = spe_allocate_available_register(p); int v1 = spe_allocate_available_register(p); int v2 = spe_allocate_available_register(p); int v3 = spe_allocate_available_register(p); int tmp = spe_allocate_available_register(p); int float_zero = -1; int float_one = -1; float scale_signed = 0.0; float scale_unsigned = 0.0; spe_lqd(p, v0, in_ptr, (0 + offset[0]) * 16); spe_lqd(p, v1, in_ptr, (1 + offset[0]) * 16); spe_lqd(p, v2, in_ptr, (2 + offset[0]) * 16); spe_lqd(p, v3, in_ptr, (3 + offset[0]) * 16); offset[0] += 4; switch (bytes) { case 1: scale_signed = 1.0f / 127.0f; scale_unsigned = 1.0f / 255.0f; spe_lqd(p, tmp, shuf_ptr, 1 * 16); spe_shufb(p, v0, v0, v0, tmp); spe_shufb(p, v1, v1, v1, tmp); spe_shufb(p, v2, v2, v2, tmp); spe_shufb(p, v3, v3, v3, tmp); break; case 2: scale_signed = 1.0f / 32767.0f; scale_unsigned = 1.0f / 65535.0f; spe_lqd(p, tmp, shuf_ptr, 2 * 16); spe_shufb(p, v0, v0, v0, tmp); spe_shufb(p, v1, v1, v1, tmp); spe_shufb(p, v2, v2, v2, tmp); spe_shufb(p, v3, v3, v3, tmp); break; case 4: scale_signed = 1.0f / 2147483647.0f; scale_unsigned = 1.0f / 4294967295.0f; break; default: assert(0); break; } switch (type) { case PIPE_FORMAT_TYPE_FLOAT: break; case PIPE_FORMAT_TYPE_UNORM: spe_ilhu(p, tmp, ((unsigned) scale_unsigned) >> 16); spe_iohl(p, tmp, ((unsigned) scale_unsigned) & 0x0ffff); spe_cuflt(p, v0, v0, 0); spe_fm(p, v0, v0, tmp); break; case PIPE_FORMAT_TYPE_SNORM: spe_ilhu(p, tmp, ((unsigned) scale_signed) >> 16); spe_iohl(p, tmp, ((unsigned) scale_signed) & 0x0ffff); spe_csflt(p, v0, v0, 0); spe_fm(p, v0, v0, tmp); break; case PIPE_FORMAT_TYPE_USCALED: spe_cuflt(p, v0, v0, 0); break; case PIPE_FORMAT_TYPE_SSCALED: spe_csflt(p, v0, v0, 0); break; } if (count < 4) { float_one = spe_allocate_available_register(p); spe_il(p, float_one, 1); spe_cuflt(p, float_one, float_one, 0); if (count < 3) { float_zero = spe_allocate_available_register(p); spe_il(p, float_zero, 0); } } spe_release_register(p, tmp); emit_matrix_transpose(p, v0, v1, v2, v3, out_ptr, shuf_ptr, count); switch (count) { case 1: spe_stqd(p, float_zero, out_ptr, 1 * 16); case 2: spe_stqd(p, float_zero, out_ptr, 2 * 16); case 3: spe_stqd(p, float_one, out_ptr, 3 * 16); } if (float_zero != -1) { spe_release_register(p, float_zero); } if (float_one != -1) { spe_release_register(p, float_one); } } #endif void cell_update_vertex_fetch(struct draw_context *draw) { #if 0 struct cell_context *const cell = (struct cell_context *) draw->driver_private; struct spe_function *p = &cell->attrib_fetch; unsigned function_index[PIPE_MAX_ATTRIBS]; unsigned unique_attr_formats; int out_ptr; int in_ptr; int shuf_ptr; unsigned i; unsigned j; /* Determine how many unique input attribute formats there are. At the * same time, store the index of the lowest numbered attribute that has * the same format as any non-unique format. */ unique_attr_formats = 1; function_index[0] = 0; for (i = 1; i < draw->vertex_fetch.nr_attrs; i++) { const enum pipe_format curr_fmt = draw->vertex_element[i].src_format; for (j = 0; j < i; j++) { if (curr_fmt == draw->vertex_element[j].src_format) { break; } } if (j == i) { unique_attr_formats++; } function_index[i] = j; } /* Each fetch function can be a maximum of 34 instructions (note: this is * actually a slight over-estimate). */ spe_init_func(p, 34 * SPE_INST_SIZE * unique_attr_formats); /* Allocate registers for the function's input parameters. */ out_ptr = spe_allocate_register(p, 3); in_ptr = spe_allocate_register(p, 4); shuf_ptr = spe_allocate_register(p, 5); /* Generate code for the individual attribute fetch functions. */ for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) { unsigned offset; if (function_index[i] == i) { cell->attrib_fetch_offsets[i] = (unsigned) ((void *) p->csr - (void *) p->store); offset = 0; emit_fetch(p, in_ptr, &offset, out_ptr, shuf_ptr, draw->vertex_element[i].src_format); spe_bi(p, 0, 0, 0); /* Round up to the next 16-byte boundary. */ if ((((unsigned) p->store) & 0x0f) != 0) { const unsigned align = ((unsigned) p->store) & 0x0f; p->store = (uint32_t *) (((void *) p->store) + align); } } else { /* Use the same function entry-point as a previously seen attribute * with the same format. */ cell->attrib_fetch_offsets[i] = cell->attrib_fetch_offsets[function_index[i]]; } } #else assert(0); #endif }