diff options
Diffstat (limited to 'src/gallium/drivers/cell/spu')
-rw-r--r-- | src/gallium/drivers/cell/spu/Makefile | 1 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_dcache.c | 125 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_dcache.h | 34 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_exec.c | 64 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_main.c | 22 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_main.h | 8 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_texture.c | 184 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_vertex_fetch.c | 76 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_vertex_shader.c | 68 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_vertex_shader.h | 5 |
10 files changed, 322 insertions, 265 deletions
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile index 30ef2450ec..c071de1900 100644 --- a/src/gallium/drivers/cell/spu/Makefile +++ b/src/gallium/drivers/cell/spu/Makefile @@ -18,6 +18,7 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o SOURCES = \ spu_main.c \ spu_blend.c \ + spu_dcache.c \ spu_render.c \ spu_texture.c \ spu_tile.c \ diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c new file mode 100644 index 0000000000..a1701d80d1 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_dcache.c @@ -0,0 +1,125 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "cell/common.h" +#include "spu_main.h" +#include "spu_dcache.h" + +#define CACHELINE_LOG2SIZE 7 +#define LINE_SIZE (1U << 7) +#define ALIGN_MASK (~(LINE_SIZE - 1)) + +#define CACHE_NAME data +#define CACHED_TYPE qword +#define CACHE_TYPE CACHE_TYPE_RO +#define CACHE_SET_TAGID(set) (((set) & 0x03) + TAG_DCACHE0) +#define CACHE_LOG2NNWAY 2 +#define CACHE_LOG2NSETS 6 +#include <cache-api.h> + +/* Yes folks, this is ugly. + */ +#undef CACHE_NWAY +#undef CACHE_NSETS +#define CACHE_NAME data +#define CACHE_NWAY 4 +#define CACHE_NSETS (1U << 6) + + +/** + * Fetch between arbitrary number of bytes from an unaligned address + * + * \param dst Destination data buffer + * \param ea Main memory effective address of source data + * \param size Number of bytes to read + * + * \warning + * As is hinted by the type of the \c dst pointer, this function writes + * multiples of 16-bytes. + */ +void +spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size) +{ + const int shift = ea & 0x0f; + const unsigned read_size = ROUNDUP16(size + shift); + const unsigned last_read = ROUNDUP16(ea + size); + const qword *const last_write = dst + (ROUNDUP16(size) / 16); + unsigned i; + + + if (shift == 0) { + /* Data is already aligned. Fetch directly into the destination buffer. + */ + for (i = 0; i < size; i += 16) { + *(dst++) = cache_rd(data, ea + i); + } + } else { + qword hi; + + + /* Please exercise extreme caution when modifying this code. This code + * must not read past the end of the page containing the source data, + * and it must not write more than ((size + 15) / 16) qwords to the + * destination buffer. + */ + ea &= ~0x0f; + hi = cache_rd(data, ea); + for (i = 16; i < read_size; i += 16) { + qword lo = cache_rd(data, ea + i); + + *(dst++) = si_or((qword) spu_slqwbyte(hi, shift), + (qword) spu_rlmaskqwbyte(lo, shift - 16)); + hi = lo; + } + + if (dst != last_write) { + *(dst++) = si_or((qword) spu_slqwbyte(hi, shift), si_il(0)); + } + } + + ASSERT((ea + i) == last_read); + ASSERT(dst == last_write); +} + + +/** + * Notify the cache that a range of main memory may have been modified + */ +void +spu_dcache_mark_dirty(unsigned ea, unsigned size) +{ + unsigned i; + const unsigned aligned_start = (ea & ALIGN_MASK); + const unsigned aligned_end = (ea + size + (LINE_SIZE - 1)) + & ALIGN_MASK; + + + for (i = 0; i < (CACHE_NWAY * CACHE_NSETS); i++) { + const unsigned entry = __cache_dir[i]; + const unsigned addr = entry & ~0x0f; + + __cache_dir[i] = ((addr >= aligned_start) && (addr < aligned_end)) + ? (entry & ~CACHELINE_VALID) : entry; + } +} diff --git a/src/gallium/drivers/cell/spu/spu_dcache.h b/src/gallium/drivers/cell/spu/spu_dcache.h new file mode 100644 index 0000000000..7a06b8c25a --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_dcache.h @@ -0,0 +1,34 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef SPU_DCACHE_H +#define SPU_DCACHE_H + +extern void +spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size); + +extern void +spu_dcache_mark_dirty(unsigned ea, unsigned size); + +#endif /* SPU_DCACHE_H */ diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c index 109540b1f7..1560c0f157 100644 --- a/src/gallium/drivers/cell/spu/spu_exec.c +++ b/src/gallium/drivers/cell/spu/spu_exec.c @@ -50,8 +50,6 @@ * Brian Paul */ -#include <libmisc.h> -#include <spu_mfcio.h> #include <transpose_matrix4x4.h> #include <simdmath/ceilf4.h> #include <simdmath/cosf4.h> @@ -72,6 +70,8 @@ #include "spu_exec.h" #include "spu_main.h" #include "spu_vertex_shader.h" +#include "spu_dcache.h" +#include "cell/common.h" #define TILE_TOP_LEFT 0 #define TILE_TOP_RIGHT 1 @@ -146,17 +146,14 @@ spu_exec_machine_init(struct spu_exec_machine *mach, struct spu_sampler *samplers, unsigned processor) { - qword zero; - qword not_zero; - uint i; + const qword zero = si_il(0); + const qword not_zero = si_il(~0); + (void) numSamplers; mach->Samplers = samplers; mach->Processor = processor; mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS]; - zero = si_xor(zero, zero); - not_zero = si_xori(zero, 0xff); - /* Setup constants. */ mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero; mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero; @@ -356,19 +353,17 @@ fetch_src_file_channel( case TGSI_EXTSWIZZLE_W: switch( file ) { case TGSI_FILE_CONSTANT: { - unsigned char buffer[32] ALIGN16_ATTRIB; unsigned i; for (i = 0; i < 4; i++) { const float *ptr = mach->Consts[index->i[i]]; - const uint64_t addr = (uint64_t)(uintptr_t) ptr; - const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32; + float tmp[4]; - mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0); - wait_on_mask(1 << TAG_VERTEX_BUFFER); + spu_dcache_fetch_unaligned((qword *) tmp, + (uintptr_t)(ptr + swizzle), + sizeof(float)); - (void) memcpy(& chan->f[i], &buffer[(addr & 0x0f) - + (sizeof(float) * swizzle)], sizeof(float)); + chan->f[i] = tmp[0]; } break; } @@ -663,9 +658,10 @@ fetch_texel( struct spu_sampler *sampler, qword rgba[4]; qword out[4]; - sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, (float *) rgba); + sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, + (float (*)[4]) rgba); - _transpose_matrix4x4(out, rgba); + _transpose_matrix4x4((vec_float4 *) out, (vec_float4 *) rgba); r->q = out[0]; g->q = out[1]; b->q = out[2]; @@ -1903,32 +1899,28 @@ spu_exec_machine_run( struct spu_exec_machine *mach ) /* execute declarations (interpolants) */ if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) { for (i = 0; i < mach->NumDeclarations; i++) { - uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB; - struct tgsi_full_declaration decl; - unsigned long decl_addr = (unsigned long) (mach->Declarations+i); - unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f); + union { + struct tgsi_full_declaration decl; + qword buffer[ROUNDUP16(sizeof(struct tgsi_full_declaration)) / 16]; + } d ALIGN16_ATTRIB; + unsigned ea = (unsigned) (mach->Declarations + pc); - mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0); - wait_on_mask(1 << TAG_INSTRUCTION_FETCH); + spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl)); - memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl)); - exec_declaration( mach, &decl ); + exec_declaration( mach, &d.decl ); } } /* execute instructions, until pc is set to -1 */ while (pc != -1) { - uint8_t buffer[sizeof(struct tgsi_full_instruction) + 32] ALIGN16_ATTRIB; - struct tgsi_full_instruction inst; - unsigned long inst_addr = (unsigned long) (mach->Instructions + pc); - unsigned size = ((sizeof(inst) + (inst_addr & 0x0f) + 0x0f) & ~0x0f); - - assert(pc < mach->NumInstructions); - mfc_get(buffer, inst_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0); - wait_on_mask(1 << TAG_INSTRUCTION_FETCH); - - memcpy(& inst, buffer + (inst_addr & 0x0f), sizeof(inst)); - exec_instruction( mach, & inst, &pc ); + union { + struct tgsi_full_instruction inst; + qword buffer[ROUNDUP16(sizeof(struct tgsi_full_instruction)) / 16]; + } i ALIGN16_ATTRIB; + unsigned ea = (unsigned) (mach->Instructions + pc); + + spu_dcache_fetch_unaligned(i.buffer, ea, sizeof(i.inst)); + exec_instruction( mach, & i.inst, &pc ); } #if 0 diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index fcbf0f841e..59300028d4 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -38,6 +38,7 @@ #include "spu_tile.h" //#include "spu_test.h" #include "spu_vertex_shader.h" +#include "spu_dcache.h" #include "cell/common.h" #include "pipe/p_defines.h" @@ -285,6 +286,8 @@ cmd_state_texture(const struct cell_command_texture *texture) { spu.texture.width, spu.texture.height, 0.0, 0.0}; spu.tex_size_mask = (vector unsigned int) { spu.texture.width - 1, spu.texture.height - 1, 0, 0 }; + spu.tex_size_x_mask = spu_splats(spu.texture.width - 1); + spu.tex_size_y_mask = spu_splats(spu.texture.height - 1); } @@ -433,10 +436,19 @@ cmd_batch(uint opcode) sizeof(struct pipe_viewport_state)); pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8); break; + case CELL_CMD_STATE_UNIFORMS: + draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1]; + pos += 2; + break; case CELL_CMD_STATE_VS_ARRAY_INFO: cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]); pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8); break; + case CELL_CMD_STATE_BIND_VS: + spu_bind_vertex_shader(&draw, + (struct cell_shader_info *) &buffer[pos+1]); + pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8); + break; case CELL_CMD_STATE_ATTRIB_FETCH: { struct cell_attribute_fetch_code *code = (struct cell_attribute_fetch_code *) &buffer[pos+1]; @@ -453,6 +465,14 @@ cmd_batch(uint opcode) pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8); break; } + case CELL_CMD_FLUSH_BUFFER_RANGE: { + struct cell_buffer_range *br = (struct cell_buffer_range *) + &buffer[pos+1]; + + spu_dcache_mark_dirty((unsigned) br->base, br->size); + pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8); + break; + } default: printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]); ASSERT(0); @@ -566,7 +586,7 @@ main(main_param_t speid, main_param_t argp) one_time_init(); if (Debug) - printf("SPU: main() speid=%lu\n", speid); + printf("SPU: main() speid=%lu\n", (unsigned long) speid); mfc_get(&spu.init, /* dest */ (unsigned int) argp, /* src */ diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 5c95d112ac..a13edd1702 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -107,6 +107,8 @@ struct spu_global vector float tex_size; vector unsigned int tex_size_mask; /**< == int(size - 1) */ + vector unsigned int tex_size_x_mask; /**< == int(size - 1) */ + vector unsigned int tex_size_y_mask; /**< == int(size - 1) */ vector float (*sample_texture)(vector float texcoord); @@ -130,8 +132,10 @@ extern boolean Debug; #define TAG_INDEX_BUFFER 16 #define TAG_BATCH_BUFFER 17 #define TAG_MISC 18 -#define TAG_TEXTURE_TILE 19 -#define TAG_INSTRUCTION_FETCH 20 +#define TAG_DCACHE0 20 +#define TAG_DCACHE1 21 +#define TAG_DCACHE2 22 +#define TAG_DCACHE3 23 diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c index 3962aaa4a9..67eb08196a 100644 --- a/src/gallium/drivers/cell/spu/spu_texture.c +++ b/src/gallium/drivers/cell/spu/spu_texture.c @@ -31,19 +31,7 @@ #include "spu_texture.h" #include "spu_tile.h" #include "spu_colorpack.h" - - -/** - * Number of texture tiles to cache. - * Note that this will probably be the largest consumer of SPU local store/ - * memory for this driver! - */ -#define CACHE_SIZE 16 - -static tile_t tex_tiles[CACHE_SIZE] ALIGN16_ATTRIB; - -static vector unsigned int tex_tile_xy[CACHE_SIZE]; - +#include "spu_dcache.h" /** @@ -52,78 +40,60 @@ static vector unsigned int tex_tile_xy[CACHE_SIZE]; void invalidate_tex_cache(void) { - /* XXX memset? */ - uint i; - for (i = 0; i < CACHE_SIZE; i++) { - tex_tile_xy[i] = ((vector unsigned int) { ~0U, ~0U, ~0U, ~0U }); - } + spu_dcache_mark_dirty((unsigned) spu.texture.start, + 4 * spu.texture.width * spu.texture.height); } -/** - * Return the cache pos/index which corresponds to tile (tx,ty) - */ -static INLINE uint -cache_pos(vector unsigned int txty) +static uint +get_texel(vec_uint4 coordinate) { - uint pos = (spu_extract(txty,0) + spu_extract(txty,1) * 4) % CACHE_SIZE; - return pos; + vec_uint4 tmp; + unsigned x = spu_extract(coordinate, 0); + unsigned y = spu_extract(coordinate, 1); + const unsigned tiles_per_row = spu.texture.width / TILE_SIZE; + unsigned tile_offset = sizeof(tile_t) * ((y / TILE_SIZE * tiles_per_row) + + (x / TILE_SIZE)); + unsigned texel_offset = 4 * (((y % TILE_SIZE) * TILE_SIZE) + + (x % TILE_SIZE)); + + spu_dcache_fetch_unaligned((qword *) & tmp, + spu.texture.start + tile_offset + texel_offset, + 4); + return spu_extract(tmp, 0); } -/** - * Make sure the tile for texel (i,j) is present, return its position/index - * in the cache. - */ -static uint -get_tex_tile(vector unsigned int ij) +static void +get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels) { - /* tile address: tx,ty */ - const vector unsigned int txty = spu_rlmask(ij, -5); /* divide by 32 */ - const uint pos = cache_pos(txty); - - if ((spu_extract(tex_tile_xy[pos], 0) != spu_extract(txty, 0)) || - (spu_extract(tex_tile_xy[pos], 1) != spu_extract(txty, 1))) { - - /* texture cache miss, fetch tile from main memory */ - const uint tiles_per_row = spu.texture.width / TILE_SIZE; - const uint bytes_per_tile = sizeof(tile_t); - const void *src = (const ubyte *) spu.texture.start - + (spu_extract(txty,1) * tiles_per_row + spu_extract(txty,0)) * bytes_per_tile; - - printf("SPU %u: tex cache miss at %d, %d pos=%u old=%d,%d\n", - spu.init.id, - spu_extract(txty,0), - spu_extract(txty,1), - pos, - spu_extract(tex_tile_xy[pos],0), - spu_extract(tex_tile_xy[pos],1)); - - ASSERT_ALIGN16(tex_tiles[pos].ui); - ASSERT_ALIGN16(src); - - mfc_get(tex_tiles[pos].ui, /* dest */ - (unsigned int) src, - bytes_per_tile, /* size */ - TAG_TEXTURE_TILE, - 0, /* tid */ - 0 /* rid */); - - wait_on_mask(1 << TAG_TEXTURE_TILE); - - tex_tile_xy[pos] = txty; - } - else { -#if 0 - printf("SPU %u: tex cache HIT at %d, %d\n", - spu.init.id, tx, ty); -#endif - } - - return pos; + const unsigned texture_ea = (uintptr_t) spu.texture.start; + vec_uint4 tile_x = spu_rlmask(x, -5); + vec_uint4 tile_y = spu_rlmask(y, -5); + const qword offset_x = si_andi((qword) x, 0x1f); + const qword offset_y = si_andi((qword) y, 0x1f); + + const qword tiles_per_row = (qword) spu_splats(spu.texture.width / TILE_SIZE); + const qword tile_size = (qword) spu_splats(sizeof(tile_t)); + + qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x); + tile_offset = si_mpy((qword) tile_offset, tile_size); + + qword texel_offset = si_a(si_mpyui(offset_y, 32), offset_x); + texel_offset = si_mpyui(texel_offset, 4); + + vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset); + + spu_dcache_fetch_unaligned((qword *) & texels[0], + texture_ea + spu_extract(offset, 0), 4); + spu_dcache_fetch_unaligned((qword *) & texels[1], + texture_ea + spu_extract(offset, 1), 4); + spu_dcache_fetch_unaligned((qword *) & texels[2], + texture_ea + spu_extract(offset, 2), 4); + spu_dcache_fetch_unaligned((qword *) & texels[3], + texture_ea + spu_extract(offset, 3), 4); } - /** * Get texture sample at texcoord. * XXX this is extremely primitive for now. @@ -134,9 +104,7 @@ sample_texture_nearest(vector float texcoord) vector float tc = spu_mul(texcoord, spu.tex_size); vector unsigned int itc = spu_convtu(tc, 0); /* convert to int */ itc = spu_and(itc, spu.tex_size_mask); /* mask (GL_REPEAT) */ - vector unsigned int ij = spu_and(itc, TILE_SIZE-1); /* intra tile addr */ - uint pos = get_tex_tile(itc); - uint texel = tex_tiles[pos].ui[spu_extract(ij, 1)][spu_extract(ij, 0)]; + uint texel = get_texel(itc); return spu_unpack_A8R8G8B8(texel); } @@ -144,49 +112,33 @@ sample_texture_nearest(vector float texcoord) vector float sample_texture_bilinear(vector float texcoord) { - static const vector unsigned int offset10 = {1, 0, 0, 0}; - static const vector unsigned int offset01 = {0, 1, 0, 0}; + static const vec_uint4 offset_x = {0, 0, 1, 1}; + static const vec_uint4 offset_y = {0, 1, 0, 1}; vector float tc = spu_mul(texcoord, spu.tex_size); tc = spu_add(tc, spu_splats(-0.5f)); /* half texel bias */ /* integer texcoords S,T: */ - vector unsigned int itc00 = spu_convtu(tc, 0); /* convert to int */ - vector unsigned int itc01 = spu_add(itc00, offset01); - vector unsigned int itc10 = spu_add(itc00, offset10); - vector unsigned int itc11 = spu_add(itc10, offset01); - - /* mask (GL_REPEAT) */ - itc00 = spu_and(itc00, spu.tex_size_mask); - itc01 = spu_and(itc01, spu.tex_size_mask); - itc10 = spu_and(itc10, spu.tex_size_mask); - itc11 = spu_and(itc11, spu.tex_size_mask); - - /* intra tile addr */ - vector unsigned int ij00 = spu_and(itc00, TILE_SIZE-1); - vector unsigned int ij01 = spu_and(itc01, TILE_SIZE-1); - vector unsigned int ij10 = spu_and(itc10, TILE_SIZE-1); - vector unsigned int ij11 = spu_and(itc11, TILE_SIZE-1); - - /* get tile cache positions */ - uint pos00 = get_tex_tile(itc00); - uint pos01, pos10, pos11; - if ((spu_extract(ij00, 0) < TILE_SIZE-1) && - (spu_extract(ij00, 1) < TILE_SIZE-1)) { - /* all texels are in the same tile */ - pos01 = pos10 = pos11 = pos00; - } - else { - pos01 = get_tex_tile(itc01); - pos10 = get_tex_tile(itc10); - pos11 = get_tex_tile(itc11); - } - - /* get texels from tiles and convert to float[4] */ - vector float texel00 = spu_unpack_A8R8G8B8(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]); - vector float texel01 = spu_unpack_A8R8G8B8(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]); - vector float texel10 = spu_unpack_A8R8G8B8(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]); - vector float texel11 = spu_unpack_A8R8G8B8(tex_tiles[pos11].ui[spu_extract(ij11, 1)][spu_extract(ij11, 0)]); + vec_uint4 itc = spu_convtu(tc, 0); /* convert to int */ + + vec_uint4 texels[4]; + + vec_uint4 x = spu_splats(spu_extract(itc, 0)); + vec_uint4 y = spu_splats(spu_extract(itc, 1)); + + x = spu_add(x, offset_x); + y = spu_add(y, offset_y); + + x = spu_and(x, spu.tex_size_x_mask); + y = spu_and(y, spu.tex_size_y_mask); + + get_four_texels(x, y, texels); + + vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0)); + vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0)); + vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0)); + vector float texel11 = spu_unpack_A8R8G8B8(spu_extract(texels[3], 0)); + /* Compute weighting factors in [0,1] * Multiply texcoord by 1024, AND with 1023, convert back to float. diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c index 55c6c28717..219fd90cc0 100644 --- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c +++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c @@ -32,39 +32,19 @@ * Ian Romanick <idr@us.ibm.com> */ -#include <spu_mfcio.h> - #include "pipe/p_util.h" #include "pipe/p_state.h" #include "pipe/p_shader_tokens.h" #include "spu_exec.h" #include "spu_vertex_shader.h" #include "spu_main.h" - -#define CACHE_NAME attribute -#define CACHED_TYPE qword -#define CACHE_TYPE CACHE_TYPE_RO -#define CACHE_SET_TAGID(set) TAG_VERTEX_BUFFER -#define CACHE_LOG2NNWAY 2 -#define CACHE_LOG2NSETS 6 -#include <cache-api.h> - -/* Yes folks, this is ugly. - */ -#undef CACHE_NWAY -#undef CACHE_NSETS -#define CACHE_NAME attribute -#define CACHE_NWAY 4 -#define CACHE_NSETS (1U << 6) - - -#define DRAW_DBG 0 +#include "spu_dcache.h" typedef void (*spu_fetch_func)(qword *out, const qword *in, const qword *shuffle_data); -static const qword fetch_shuffle_data[] = { +static const qword fetch_shuffle_data[5] ALIGN16_ATTRIB = { /* Shuffle used by CVT_64_FLOAT */ { @@ -103,44 +83,6 @@ static const qword fetch_shuffle_data[] = { /** - * Fetch between 1 and 32 bytes from an unaligned address - */ -static INLINE void -fetch_unaligned(qword *dst, unsigned ea, unsigned size) -{ - qword tmp[4]; - const int shift = ea & 0x0f; - const unsigned aligned_start_ea = ea & ~0x0f; - const unsigned aligned_end_ea = (ea + size) & ~0x0f; - const unsigned num_entries = ((aligned_end_ea - aligned_start_ea) / 16) + 1; - unsigned i; - - - if (shift == 0) { - /* Data is already aligned. Fetch directly into the destination buffer. - */ - for (i = 0; i < num_entries; i++) { - dst[i] = cache_rd(attribute, (ea & ~0x0f) + (i * 16)); - } - } else { - /* Fetch data from the cache to the local buffer. - */ - for (i = 0; i < num_entries; i++) { - tmp[i] = cache_rd(attribute, (ea & ~0x0f) + (i * 16)); - } - - - /* Fix the alignment of the data and write to the destination buffer. - */ - for (i = 0; i < ((size + 15) / 16); i++) { - dst[i] = si_or((qword) spu_slqwbyte(tmp[i], shift), - (qword) spu_rlmaskqwbyte(tmp[i + 1], shift - 16)); - } - } -} - - -/** * Fetch vertex attributes for 'count' vertices. */ static void generic_vertex_fetch(struct spu_vs_context *draw, @@ -169,7 +111,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw, unsigned idx; const unsigned bytes_per_entry = draw->vertex_fetch.size[attr]; const unsigned quads_per_entry = (bytes_per_entry + 15) / 16; - qword in[2 * 4]; + qword in[2 * 4] ALIGN16_ATTRIB; /* Fetch four attributes for four vertices. @@ -182,7 +124,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw, printf("SPU: fetching = 0x%llx\n", addr); #endif - fetch_unaligned(& in[idx], addr, bytes_per_entry); + spu_dcache_fetch_unaligned(& in[idx], addr, bytes_per_entry); idx += quads_per_entry; } @@ -200,15 +142,5 @@ static void generic_vertex_fetch(struct spu_vs_context *draw, void spu_update_vertex_fetch( struct spu_vs_context *draw ) { - unsigned i; - - - /* Invalidate the vertex cache. - */ - for (i = 0; i < (CACHE_NWAY * CACHE_NSETS); i++) { - CACHELINE_CLEARVALID(i); - } - - draw->vertex_fetch.fetch_func = generic_vertex_fetch; } diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c index 3f5bf41aa2..8363efeeb6 100644 --- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c +++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c @@ -165,63 +165,55 @@ run_vertex_program(struct spu_vs_context *draw, } -static void -spu_bind_vertex_shader(struct spu_vs_context *draw, - void *uniforms, - void *planes, - unsigned nr_planes, - unsigned num_outputs - ) -{ - draw->constants = (float (*)[4]) uniforms; - - (void) memcpy(draw->plane, planes, sizeof(float) * 4 * nr_planes); - draw->nr_planes = nr_planes; - draw->num_vs_outputs = num_outputs; - - /* specify the shader to interpret/execute */ - spu_exec_machine_init(&draw->machine, - PIPE_MAX_SAMPLERS, - NULL /*samplers*/, - PIPE_SHADER_VERTEX); -} - - unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32] ALIGN16_ATTRIB; + void -spu_execute_vertex_shader(struct spu_vs_context *draw, - const struct cell_command_vs *vs) +spu_bind_vertex_shader(struct spu_vs_context *draw, + struct cell_shader_info *vs) { - unsigned i; - - const uint64_t immediate_addr = vs->shader.immediates; + const unsigned immediate_addr = vs->immediates; const unsigned immediate_size = - ROUNDUP16((sizeof(float) * 4 * vs->shader.num_immediates) - + (immediate_addr & 0x0f)); + ROUNDUP16((sizeof(float) * 4 * vs->num_immediates) + + (immediate_addr & 0x0f)); + mfc_get(immediates, immediate_addr & ~0x0f, immediate_size, TAG_VERTEX_BUFFER, 0, 0); draw->machine.Instructions = (struct tgsi_full_instruction *) - vs->shader.instructions; - draw->machine.NumInstructions = vs->shader.num_instructions; + vs->instructions; + draw->machine.NumInstructions = vs->num_instructions; draw->machine.Declarations = (struct tgsi_full_declaration *) - vs->shader.declarations; - draw->machine.NumDeclarations = vs->shader.num_declarations; + vs->declarations; + draw->machine.NumDeclarations = vs->num_declarations; - draw->vertex_fetch.nr_attrs = vs->nr_attrs; + draw->num_vs_outputs = vs->num_outputs; + + /* specify the shader to interpret/execute */ + spu_exec_machine_init(&draw->machine, + PIPE_MAX_SAMPLERS, + NULL /*samplers*/, + PIPE_SHADER_VERTEX); wait_on_mask(1 << TAG_VERTEX_BUFFER); (void) memcpy(& draw->machine.Imms, &immediates[immediate_addr & 0x0f], - sizeof(float) * 4 * vs->shader.num_immediates); + sizeof(float) * 4 * vs->num_immediates); +} - spu_bind_vertex_shader(draw, vs->shader.uniforms, - vs->plane, vs->nr_planes, - vs->shader.num_outputs); + +void +spu_execute_vertex_shader(struct spu_vs_context *draw, + const struct cell_command_vs *vs) +{ + unsigned i; + + (void) memcpy(draw->plane, vs->plane, sizeof(float) * 4 * vs->nr_planes); + draw->nr_planes = vs->nr_planes; + draw->vertex_fetch.nr_attrs = vs->nr_attrs; for (i = 0; i < vs->num_elts; i += 4) { const unsigned batch_size = MIN2(vs->num_elts - i, 4); diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.h b/src/gallium/drivers/cell/spu/spu_vertex_shader.h index 0fb0bc28d0..54a4b8d9b9 100644 --- a/src/gallium/drivers/cell/spu/spu_vertex_shader.h +++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.h @@ -1,6 +1,7 @@ #ifndef SPU_VERTEX_SHADER_H #define SPU_VERTEX_SHADER_H +#include "cell/common.h" #include "pipe/p_format.h" #include "spu_exec.h" @@ -55,6 +56,10 @@ static INLINE void spu_vertex_fetch(struct spu_vs_context *draw, struct cell_command_vs; extern void +spu_bind_vertex_shader(struct spu_vs_context *draw, + struct cell_shader_info *vs); + +extern void spu_execute_vertex_shader(struct spu_vs_context *draw, const struct cell_command_vs *vs); |