From b642730be93149baa7556e5791393168ab396175 Mon Sep 17 00:00:00 2001 From: José Fonseca Date: Fri, 15 Feb 2008 17:35:24 +0900 Subject: Code reorganization: move files into their places. This is in a separate commit to ensure renames are properly preserved. --- src/gallium/drivers/cell/common.h | 220 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 src/gallium/drivers/cell/common.h (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h new file mode 100644 index 0000000000..4de514c358 --- /dev/null +++ b/src/gallium/drivers/cell/common.h @@ -0,0 +1,220 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Types and tokens which are common to the SPU and PPU code. + */ + + +#ifndef CELL_COMMON_H +#define CELL_COMMON_H + +#include "pipe/p_compiler.h" +#include "pipe/p_util.h" +#include "pipe/p_format.h" + + +/** The standard assert macro doesn't seem to work reliably */ +#define ASSERT(x) \ + if (!(x)) { \ + ubyte *p = NULL; \ + fprintf(stderr, "%s:%d: %s(): assertion %s failed.\n", \ + __FILE__, __LINE__, __FUNCTION__, #x); \ + *p = 0; \ + exit(1); \ + } + + +/** for sanity checking */ +#define ASSERT_ALIGN16(ptr) \ + ASSERT((((unsigned long) (ptr)) & 0xf) == 0); + + +/** round up value to next multiple of 4 */ +#define ROUNDUP4(k) (((k) + 0x3) & ~0x3) + +/** round up value to next multiple of 8 */ +#define ROUNDUP8(k) (((k) + 0x7) & ~0x7) + +/** round up value to next multiple of 16 */ +#define ROUNDUP16(k) (((k) + 0xf) & ~0xf) + + +#define CELL_MAX_SPUS 6 + +#define TILE_SIZE 32 + + +/** + * The low byte of a mailbox word contains the command opcode. + * Remaining higher bytes are command specific. + */ +#define CELL_CMD_OPCODE_MASK 0xff + +#define CELL_CMD_EXIT 1 +#define CELL_CMD_CLEAR_SURFACE 2 +#define CELL_CMD_FINISH 3 +#define CELL_CMD_RENDER 4 +#define CELL_CMD_BATCH 5 +#define CELL_CMD_RELEASE_VERTS 6 +#define CELL_CMD_STATE_FRAMEBUFFER 10 +#define CELL_CMD_STATE_DEPTH_STENCIL 11 +#define CELL_CMD_STATE_SAMPLER 12 +#define CELL_CMD_STATE_TEXTURE 13 +#define CELL_CMD_STATE_VERTEX_INFO 14 +#define CELL_CMD_STATE_VIEWPORT 15 +#define CELL_CMD_STATE_VS_ARRAY_INFO 16 +#define CELL_CMD_STATE_BLEND 17 +#define CELL_CMD_VS_EXECUTE 18 + + +#define CELL_NUM_BUFFERS 4 +#define CELL_BUFFER_SIZE (4*1024) /**< 16KB would be the max */ + +#define CELL_BUFFER_STATUS_FREE 10 +#define CELL_BUFFER_STATUS_USED 20 + + + +/** + * Tell SPUs about the framebuffer size, location + */ +struct cell_command_framebuffer +{ + uint64_t opcode; /**< CELL_CMD_FRAMEBUFFER */ + int width, height; + void *color_start, *depth_start; + enum pipe_format color_format, depth_format; +}; + + +/** + * Clear framebuffer to the given value/color. + */ +struct cell_command_clear_surface +{ + uint64_t opcode; /**< CELL_CMD_CLEAR_SURFACE */ + uint surface; /**< Temporary: 0=color, 1=Z */ + uint value; +}; + + +/** + * Array info used by the vertex shader's vertex puller. + */ +struct cell_array_info +{ + uint64_t base; /**< Base address of the 0th element. */ + uint attr; /**< Attribute that this state is for. */ + uint pitch; /**< Byte pitch from one entry to the next. */ + uint format; /**< Pipe format of each entry. */ +} ALIGN16_ATTRIB; + + +struct cell_shader_info +{ + unsigned num_outputs; + + uint64_t declarations; + unsigned num_declarations; + uint64_t instructions; + unsigned num_instructions; + uint64_t uniforms; + uint64_t immediates; + unsigned num_immediates; +} ALIGN16_ATTRIB; + + +#define SPU_VERTS_PER_BATCH 64 +struct cell_command_vs +{ + uint64_t opcode; /**< CELL_CMD_VS_EXECUTE */ + struct cell_shader_info shader; + unsigned num_elts; + unsigned elts[SPU_VERTS_PER_BATCH]; + uint64_t vOut[SPU_VERTS_PER_BATCH]; + float plane[12][4]; + unsigned nr_planes; + unsigned nr_attrs; +} ALIGN16_ATTRIB; + + +struct cell_command_render +{ + uint64_t opcode; /**< CELL_CMD_RENDER */ + uint prim_type; /**< PIPE_PRIM_x */ + uint num_verts; + uint vertex_size; /**< bytes per vertex */ + uint num_indexes; + uint vertex_buf; /**< which cell->buffer[] contains the vertex data */ + float xmin, ymin, xmax, ymax; /* XXX another dummy field */ + uint min_index; + boolean inline_verts; +}; + + +struct cell_command_release_verts +{ + uint64_t opcode; /**< CELL_CMD_RELEASE_VERTS */ + uint vertex_buf; /**< in [0, CELL_NUM_BUFFERS-1] */ +}; + + +struct cell_command_texture +{ + void *start; /**< Address in main memory */ + uint width, height; +}; + + +/** XXX unions don't seem to work */ +/* XXX this should go away; all commands should be placed in batch buffers */ +struct cell_command +{ +#if 0 + struct cell_command_framebuffer fb; + struct cell_command_clear_surface clear; + struct cell_command_render render; +#endif + struct cell_command_vs vs; +} ALIGN16_ATTRIB; + + +/** This is the object passed to spe_create_thread() */ +struct cell_init_info +{ + unsigned id; + unsigned num_spus; + struct cell_command *cmd; + + /** Buffers for command batches, vertex/index data */ + ubyte *buffers[CELL_NUM_BUFFERS]; + uint *buffer_status; /**< points at cell_context->buffer_status */ +} ALIGN16_ATTRIB; + + +#endif /* CELL_COMMON_H */ -- cgit v1.2.3 From 3320b1874e810583f95b93a89697b2955987b84f Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Fri, 15 Feb 2008 11:03:54 -0800 Subject: Cell: Enable code gen for SPE attribute fetch Doubles are still unsupported. --- src/gallium/drivers/cell/common.h | 15 +- src/gallium/drivers/cell/ppu/Makefile | 1 + src/gallium/drivers/cell/ppu/cell_context.h | 4 + src/gallium/drivers/cell/ppu/cell_vertex_fetch.c | 15 +- src/gallium/drivers/cell/ppu/cell_vertex_shader.c | 21 +- src/gallium/drivers/cell/spu/spu_main.c | 22 +- src/gallium/drivers/cell/spu/spu_vertex_fetch.c | 477 +--------------------- src/gallium/drivers/cell/spu/spu_vertex_shader.h | 6 +- 8 files changed, 71 insertions(+), 490 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 4de514c358..74b131fbef 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -90,6 +90,7 @@ #define CELL_CMD_STATE_VS_ARRAY_INFO 16 #define CELL_CMD_STATE_BLEND 17 #define CELL_CMD_VS_EXECUTE 18 +#define CELL_CMD_STATE_ATTRIB_FETCH 19 #define CELL_NUM_BUFFERS 4 @@ -128,13 +129,19 @@ struct cell_command_clear_surface */ struct cell_array_info { - uint64_t base; /**< Base address of the 0th element. */ - uint attr; /**< Attribute that this state is for. */ - uint pitch; /**< Byte pitch from one entry to the next. */ - uint format; /**< Pipe format of each entry. */ + uint64_t base; /**< Base address of the 0th element. */ + uint attr; /**< Attribute that this state is for. */ + uint pitch; /**< Byte pitch from one entry to the next. */ + uint size; + uint function_offset; } ALIGN16_ATTRIB; +struct cell_attribute_fetch_code { + uint64_t base; + uint size; +}; + struct cell_shader_info { unsigned num_outputs; diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile index a4c3f29e8a..196ab777f5 100644 --- a/src/gallium/drivers/cell/ppu/Makefile +++ b/src/gallium/drivers/cell/ppu/Makefile @@ -34,6 +34,7 @@ SOURCES = \ cell_surface.c \ cell_texture.c \ cell_vbuf.c \ + cell_vertex_fetch.c \ cell_vertex_shader.c \ cell_winsys.c diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h index 6196c0c72f..91f8e542a2 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.h +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -36,6 +36,7 @@ #include "draw/draw_vbuf.h" #include "cell_winsys.h" #include "cell/common.h" +#include "ppc/rtasm/spe_asm.h" struct cell_vbuf_render; @@ -111,6 +112,9 @@ struct cell_context /** [4] to ensure 16-byte alignment for each status word */ uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB; + + struct spe_function attrib_fetch; + unsigned attrib_fetch_offsets[PIPE_ATTRIB_MAX]; }; diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c index f2432f4ff5..f10689a959 100644 --- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c +++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c @@ -27,10 +27,10 @@ #include "pipe/p_context.h" #include "pipe/p_format.h" -#include "pipe/draw/draw_context.h" -#include "pipe/draw/draw_private.h" +#include "../auxiliary/draw/draw_context.h" +#include "../auxiliary/draw/draw_private.h" -#include "pipe/cell/ppu/cell_context.h" +#include "cell_context.h" #include "ppc/rtasm/spe_asm.h" typedef uint64_t register_mask; @@ -380,13 +380,4 @@ void cell_update_vertex_fetch(struct draw_context *draw) cell->attrib_fetch_offsets[function_index[i]]; } } - - static first_time = 1; - if (first_time) { - first_time = 0; - const unsigned instructions = p->csr - p->store; - for (i = 0; i < instructions; i++) { - printf("\t.long\t0x%08x\n", p->store[i]); - } - } } diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c index 0ba4506505..6a1d3bc20a 100644 --- a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c +++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c @@ -55,14 +55,32 @@ cell_vertex_shader_queue_flush(struct draw_context *draw) uint64_t *batch; struct cell_array_info *array_info; unsigned i, j; + struct cell_attribute_fetch_code *cf; assert(draw->vs.queue_nr != 0); /* XXX: do this on statechange: */ draw_update_vertex_fetch(draw); + cell_update_vertex_fetch(draw); + + + batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*cf)); + batch[0] = CELL_CMD_STATE_ATTRIB_FETCH; + cf = (struct cell_attribute_fetch_code *) (&batch[1]); + cf->base = cell->attrib_fetch.store; + cf->size = ROUNDUP16((unsigned)((void *) cell->attrib_fetch.csr + - (void *) cell->attrib_fetch.store)); + for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) { + const enum pipe_format format = draw->vertex_element[i].src_format; + const unsigned count = ((pf_size_x(format) != 0) + + (pf_size_y(format) != 0) + + (pf_size_z(format) != 0) + + (pf_size_w(format) != 0)); + const unsigned size = pf_size_x(format) * count; + batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*array_info)); batch[0] = CELL_CMD_STATE_VS_ARRAY_INFO; @@ -72,7 +90,8 @@ cell_vertex_shader_queue_flush(struct draw_context *draw) array_info->base = (uintptr_t) draw->vertex_fetch.src_ptr[i]; array_info->attr = i; array_info->pitch = draw->vertex_fetch.pitch[i]; - array_info->format = draw->vertex_element[i].src_format; + array_info->size = size; + array_info->function_offset = cell->attrib_fetch_offsets[i]; } batch = cell_batch_alloc(cell, sizeof(batch[0]) diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 1e7243b863..fcbf0f841e 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -54,6 +54,9 @@ struct spu_global spu; struct spu_vs_context draw; +static unsigned char attribute_fetch_code_buffer[136 * PIPE_ATTRIB_MAX] + ALIGN16_ATTRIB; + /** * Tell the PPU that this SPU has finished copying a buffer to * local store and that it may be reused by the PPU. @@ -306,7 +309,8 @@ cmd_state_vs_array_info(const struct cell_array_info *vs_info) ASSERT(attr < PIPE_ATTRIB_MAX); draw.vertex_fetch.src_ptr[attr] = vs_info->base; draw.vertex_fetch.pitch[attr] = vs_info->pitch; - draw.vertex_fetch.format[attr] = vs_info->format; + draw.vertex_fetch.size[attr] = vs_info->size; + draw.vertex_fetch.code_offset[attr] = vs_info->function_offset; draw.vertex_fetch.dirty = 1; } @@ -433,6 +437,22 @@ cmd_batch(uint opcode) cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]); pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8); break; + case CELL_CMD_STATE_ATTRIB_FETCH: { + struct cell_attribute_fetch_code *code = + (struct cell_attribute_fetch_code *) &buffer[pos+1]; + + mfc_get(attribute_fetch_code_buffer, + (unsigned int) code->base, /* src */ + code->size, + TAG_BATCH_BUFFER, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << TAG_BATCH_BUFFER); + + draw.vertex_fetch.code = attribute_fetch_code_buffer; + pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8); + break; + } default: printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]); ASSERT(0); diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c index 45e3c26c00..55c6c28717 100644 --- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c +++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c @@ -1,6 +1,7 @@ /************************************************************************** * * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * (C) Copyright IBM Corporation 2008 * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a @@ -28,10 +29,10 @@ /* * Authors: * Keith Whitwell + * Ian Romanick */ #include -#include #include "pipe/p_util.h" #include "pipe/p_state.h" @@ -59,6 +60,10 @@ #define DRAW_DBG 0 +typedef void (*spu_fetch_func)(qword *out, const qword *in, + const qword *shuffle_data); + + static const qword fetch_shuffle_data[] = { /* Shuffle used by CVT_64_FLOAT */ @@ -97,22 +102,6 @@ static const qword fetch_shuffle_data[] = { }; -static INLINE void -trans4x4(qword row0, qword row1, qword row2, qword row3, qword *out, - const qword *shuffle) -{ - qword t1 = si_shufb(row0, row2, shuffle[3]); - qword t2 = si_shufb(row0, row2, shuffle[4]); - qword t3 = si_shufb(row1, row3, shuffle[3]); - qword t4 = si_shufb(row1, row3, shuffle[4]); - - out[0] = si_shufb(t1, t3, shuffle[3]); - out[1] = si_shufb(t1, t3, shuffle[4]); - out[2] = si_shufb(t2, t4, shuffle[3]); - out[3] = si_shufb(t2, t4, shuffle[4]); -} - - /** * Fetch between 1 and 32 bytes from an unaligned address */ @@ -151,446 +140,6 @@ fetch_unaligned(qword *dst, unsigned ea, unsigned size) } -#define CVT_32_FLOAT(q, s) (*(q)) - -static INLINE qword -CVT_64_FLOAT(const qword *qw, const qword *shuffle) -{ - qword a = si_frds(qw[0]); - qword b = si_frds(si_rotqbyi(qw[0], 8)); - qword c = si_frds(qw[1]); - qword d = si_frds(si_rotqbyi(qw[1], 8)); - - qword ab = si_shufb(a, b, shuffle[0]); - qword cd = si_shufb(c, d, si_rotqbyi(shuffle[0], 8)); - - return si_or(ab, cd); -} - - -static INLINE qword -CVT_8_USCALED(const qword *qw, const qword *shuffle) -{ - return si_cuflt(si_shufb(*qw, *qw, shuffle[1]), 0); -} - - -static INLINE qword -CVT_16_USCALED(const qword *qw, const qword *shuffle) -{ - return si_cuflt(si_shufb(*qw, *qw, shuffle[2]), 0); -} - - -static INLINE qword -CVT_32_USCALED(const qword *qw, const qword *shuffle) -{ - (void) shuffle; - return si_cuflt(*qw, 0); -} - -static INLINE qword -CVT_8_SSCALED(const qword *qw, const qword *shuffle) -{ - return si_csflt(si_shufb(*qw, *qw, shuffle[1]), 0); -} - - -static INLINE qword -CVT_16_SSCALED(const qword *qw, const qword *shuffle) -{ - return si_csflt(si_shufb(*qw, *qw, shuffle[2]), 0); -} - - -static INLINE qword -CVT_32_SSCALED(const qword *qw, const qword *shuffle) -{ - (void) shuffle; - return si_csflt(*qw, 0); -} - - -static INLINE qword -CVT_8_UNORM(const qword *qw, const qword *shuffle) -{ - const qword scale = (qword) spu_splats(1.0f / 255.0f); - return si_fm(CVT_8_USCALED(qw, shuffle), scale); -} - - -static INLINE qword -CVT_16_UNORM(const qword *qw, const qword *shuffle) -{ - const qword scale = (qword) spu_splats(1.0f / 65535.0f); - return si_fm(CVT_16_USCALED(qw, shuffle), scale); -} - - -static INLINE qword -CVT_32_UNORM(const qword *qw, const qword *shuffle) -{ - const qword scale = (qword) spu_splats(1.0f / 4294967295.0f); - return si_fm(CVT_32_USCALED(qw, shuffle), scale); -} - - -static INLINE qword -CVT_8_SNORM(const qword *qw, const qword *shuffle) -{ - const qword scale = (qword) spu_splats(1.0f / 127.0f); - return si_fm(CVT_8_SSCALED(qw, shuffle), scale); -} - - -static INLINE qword -CVT_16_SNORM(const qword *qw, const qword *shuffle) -{ - const qword scale = (qword) spu_splats(1.0f / 32767.0f); - return si_fm(CVT_16_SSCALED(qw, shuffle), scale); -} - - -static INLINE qword -CVT_32_SNORM(const qword *qw, const qword *shuffle) -{ - const qword scale = (qword) spu_splats(1.0f / 2147483647.0f); - return si_fm(CVT_32_SSCALED(qw, shuffle), scale); -} - -#define SZ_4 si_il(0U) -#define SZ_3 si_fsmbi(0x000f) -#define SZ_2 si_fsmbi(0x00ff) -#define SZ_1 si_fsmbi(0x0fff) - -/** - * Fetch a float[4] vertex attribute from memory, doing format/type - * conversion as needed. - * - * This is probably needed/dupliocated elsewhere, eg format - * conversion, texture sampling etc. - */ -#define FETCH_ATTRIB( NAME, SZ, CVT, N ) \ -static void \ -fetch_##NAME(qword *out, const qword *in, qword defaults, \ - const qword *shuffle) \ -{ \ - qword tmp[4]; \ - \ - tmp[0] = si_selb(CVT(in + (0 * N), shuffle), defaults, SZ); \ - tmp[1] = si_selb(CVT(in + (1 * N), shuffle), defaults, SZ); \ - tmp[2] = si_selb(CVT(in + (2 * N), shuffle), defaults, SZ); \ - tmp[3] = si_selb(CVT(in + (3 * N), shuffle), defaults, SZ); \ - trans4x4(tmp[0], tmp[1], tmp[2], tmp[3], out, shuffle); \ -} - - -FETCH_ATTRIB( R64G64B64A64_FLOAT, SZ_4, CVT_64_FLOAT, 2 ) -FETCH_ATTRIB( R64G64B64_FLOAT, SZ_3, CVT_64_FLOAT, 2 ) -FETCH_ATTRIB( R64G64_FLOAT, SZ_2, CVT_64_FLOAT, 2 ) -FETCH_ATTRIB( R64_FLOAT, SZ_1, CVT_64_FLOAT, 2 ) - -FETCH_ATTRIB( R32G32B32A32_FLOAT, SZ_4, CVT_32_FLOAT, 1 ) -FETCH_ATTRIB( R32G32B32_FLOAT, SZ_3, CVT_32_FLOAT, 1 ) -FETCH_ATTRIB( R32G32_FLOAT, SZ_2, CVT_32_FLOAT, 1 ) -FETCH_ATTRIB( R32_FLOAT, SZ_1, CVT_32_FLOAT, 1 ) - -FETCH_ATTRIB( R32G32B32A32_USCALED, SZ_4, CVT_32_USCALED, 1 ) -FETCH_ATTRIB( R32G32B32_USCALED, SZ_3, CVT_32_USCALED, 1 ) -FETCH_ATTRIB( R32G32_USCALED, SZ_2, CVT_32_USCALED, 1 ) -FETCH_ATTRIB( R32_USCALED, SZ_1, CVT_32_USCALED, 1 ) - -FETCH_ATTRIB( R32G32B32A32_SSCALED, SZ_4, CVT_32_SSCALED, 1 ) -FETCH_ATTRIB( R32G32B32_SSCALED, SZ_3, CVT_32_SSCALED, 1 ) -FETCH_ATTRIB( R32G32_SSCALED, SZ_2, CVT_32_SSCALED, 1 ) -FETCH_ATTRIB( R32_SSCALED, SZ_1, CVT_32_SSCALED, 1 ) - -FETCH_ATTRIB( R32G32B32A32_UNORM, SZ_4, CVT_32_UNORM, 1 ) -FETCH_ATTRIB( R32G32B32_UNORM, SZ_3, CVT_32_UNORM, 1 ) -FETCH_ATTRIB( R32G32_UNORM, SZ_2, CVT_32_UNORM, 1 ) -FETCH_ATTRIB( R32_UNORM, SZ_1, CVT_32_UNORM, 1 ) - -FETCH_ATTRIB( R32G32B32A32_SNORM, SZ_4, CVT_32_SNORM, 1 ) -FETCH_ATTRIB( R32G32B32_SNORM, SZ_3, CVT_32_SNORM, 1 ) -FETCH_ATTRIB( R32G32_SNORM, SZ_2, CVT_32_SNORM, 1 ) -FETCH_ATTRIB( R32_SNORM, SZ_1, CVT_32_SNORM, 1 ) - -FETCH_ATTRIB( R16G16B16A16_USCALED, SZ_4, CVT_16_USCALED, 1 ) -FETCH_ATTRIB( R16G16B16_USCALED, SZ_3, CVT_16_USCALED, 1 ) -FETCH_ATTRIB( R16G16_USCALED, SZ_2, CVT_16_USCALED, 1 ) -FETCH_ATTRIB( R16_USCALED, SZ_1, CVT_16_USCALED, 1 ) - -FETCH_ATTRIB( R16G16B16A16_SSCALED, SZ_4, CVT_16_SSCALED, 1 ) -FETCH_ATTRIB( R16G16B16_SSCALED, SZ_3, CVT_16_SSCALED, 1 ) -FETCH_ATTRIB( R16G16_SSCALED, SZ_2, CVT_16_SSCALED, 1 ) -FETCH_ATTRIB( R16_SSCALED, SZ_1, CVT_16_SSCALED, 1 ) - -FETCH_ATTRIB( R16G16B16A16_UNORM, SZ_4, CVT_16_UNORM, 1 ) -FETCH_ATTRIB( R16G16B16_UNORM, SZ_3, CVT_16_UNORM, 1 ) -FETCH_ATTRIB( R16G16_UNORM, SZ_2, CVT_16_UNORM, 1 ) -FETCH_ATTRIB( R16_UNORM, SZ_1, CVT_16_UNORM, 1 ) - -FETCH_ATTRIB( R16G16B16A16_SNORM, SZ_4, CVT_16_SNORM, 1 ) -FETCH_ATTRIB( R16G16B16_SNORM, SZ_3, CVT_16_SNORM, 1 ) -FETCH_ATTRIB( R16G16_SNORM, SZ_2, CVT_16_SNORM, 1 ) -FETCH_ATTRIB( R16_SNORM, SZ_1, CVT_16_SNORM, 1 ) - -FETCH_ATTRIB( R8G8B8A8_USCALED, SZ_4, CVT_8_USCALED, 1 ) -FETCH_ATTRIB( R8G8B8_USCALED, SZ_3, CVT_8_USCALED, 1 ) -FETCH_ATTRIB( R8G8_USCALED, SZ_2, CVT_8_USCALED, 1 ) -FETCH_ATTRIB( R8_USCALED, SZ_1, CVT_8_USCALED, 1 ) - -FETCH_ATTRIB( R8G8B8A8_SSCALED, SZ_4, CVT_8_SSCALED, 1 ) -FETCH_ATTRIB( R8G8B8_SSCALED, SZ_3, CVT_8_SSCALED, 1 ) -FETCH_ATTRIB( R8G8_SSCALED, SZ_2, CVT_8_SSCALED, 1 ) -FETCH_ATTRIB( R8_SSCALED, SZ_1, CVT_8_SSCALED, 1 ) - -FETCH_ATTRIB( R8G8B8A8_UNORM, SZ_4, CVT_8_UNORM, 1 ) -FETCH_ATTRIB( R8G8B8_UNORM, SZ_3, CVT_8_UNORM, 1 ) -FETCH_ATTRIB( R8G8_UNORM, SZ_2, CVT_8_UNORM, 1 ) -FETCH_ATTRIB( R8_UNORM, SZ_1, CVT_8_UNORM, 1 ) - -FETCH_ATTRIB( R8G8B8A8_SNORM, SZ_4, CVT_8_SNORM, 1 ) -FETCH_ATTRIB( R8G8B8_SNORM, SZ_3, CVT_8_SNORM, 1 ) -FETCH_ATTRIB( R8G8_SNORM, SZ_2, CVT_8_SNORM, 1 ) -FETCH_ATTRIB( R8_SNORM, SZ_1, CVT_8_SNORM, 1 ) - -FETCH_ATTRIB( A8R8G8B8_UNORM, SZ_4, CVT_8_UNORM, 1 ) - - - -static spu_fetch_func get_fetch_func( enum pipe_format format ) -{ - switch (format) { - case PIPE_FORMAT_R64_FLOAT: - return fetch_R64_FLOAT; - case PIPE_FORMAT_R64G64_FLOAT: - return fetch_R64G64_FLOAT; - case PIPE_FORMAT_R64G64B64_FLOAT: - return fetch_R64G64B64_FLOAT; - case PIPE_FORMAT_R64G64B64A64_FLOAT: - return fetch_R64G64B64A64_FLOAT; - - case PIPE_FORMAT_R32_FLOAT: - return fetch_R32_FLOAT; - case PIPE_FORMAT_R32G32_FLOAT: - return fetch_R32G32_FLOAT; - case PIPE_FORMAT_R32G32B32_FLOAT: - return fetch_R32G32B32_FLOAT; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - return fetch_R32G32B32A32_FLOAT; - - case PIPE_FORMAT_R32_UNORM: - return fetch_R32_UNORM; - case PIPE_FORMAT_R32G32_UNORM: - return fetch_R32G32_UNORM; - case PIPE_FORMAT_R32G32B32_UNORM: - return fetch_R32G32B32_UNORM; - case PIPE_FORMAT_R32G32B32A32_UNORM: - return fetch_R32G32B32A32_UNORM; - - case PIPE_FORMAT_R32_USCALED: - return fetch_R32_USCALED; - case PIPE_FORMAT_R32G32_USCALED: - return fetch_R32G32_USCALED; - case PIPE_FORMAT_R32G32B32_USCALED: - return fetch_R32G32B32_USCALED; - case PIPE_FORMAT_R32G32B32A32_USCALED: - return fetch_R32G32B32A32_USCALED; - - case PIPE_FORMAT_R32_SNORM: - return fetch_R32_SNORM; - case PIPE_FORMAT_R32G32_SNORM: - return fetch_R32G32_SNORM; - case PIPE_FORMAT_R32G32B32_SNORM: - return fetch_R32G32B32_SNORM; - case PIPE_FORMAT_R32G32B32A32_SNORM: - return fetch_R32G32B32A32_SNORM; - - case PIPE_FORMAT_R32_SSCALED: - return fetch_R32_SSCALED; - case PIPE_FORMAT_R32G32_SSCALED: - return fetch_R32G32_SSCALED; - case PIPE_FORMAT_R32G32B32_SSCALED: - return fetch_R32G32B32_SSCALED; - case PIPE_FORMAT_R32G32B32A32_SSCALED: - return fetch_R32G32B32A32_SSCALED; - - case PIPE_FORMAT_R16_UNORM: - return fetch_R16_UNORM; - case PIPE_FORMAT_R16G16_UNORM: - return fetch_R16G16_UNORM; - case PIPE_FORMAT_R16G16B16_UNORM: - return fetch_R16G16B16_UNORM; - case PIPE_FORMAT_R16G16B16A16_UNORM: - return fetch_R16G16B16A16_UNORM; - - case PIPE_FORMAT_R16_USCALED: - return fetch_R16_USCALED; - case PIPE_FORMAT_R16G16_USCALED: - return fetch_R16G16_USCALED; - case PIPE_FORMAT_R16G16B16_USCALED: - return fetch_R16G16B16_USCALED; - case PIPE_FORMAT_R16G16B16A16_USCALED: - return fetch_R16G16B16A16_USCALED; - - case PIPE_FORMAT_R16_SNORM: - return fetch_R16_SNORM; - case PIPE_FORMAT_R16G16_SNORM: - return fetch_R16G16_SNORM; - case PIPE_FORMAT_R16G16B16_SNORM: - return fetch_R16G16B16_SNORM; - case PIPE_FORMAT_R16G16B16A16_SNORM: - return fetch_R16G16B16A16_SNORM; - - case PIPE_FORMAT_R16_SSCALED: - return fetch_R16_SSCALED; - case PIPE_FORMAT_R16G16_SSCALED: - return fetch_R16G16_SSCALED; - case PIPE_FORMAT_R16G16B16_SSCALED: - return fetch_R16G16B16_SSCALED; - case PIPE_FORMAT_R16G16B16A16_SSCALED: - return fetch_R16G16B16A16_SSCALED; - - case PIPE_FORMAT_R8_UNORM: - return fetch_R8_UNORM; - case PIPE_FORMAT_R8G8_UNORM: - return fetch_R8G8_UNORM; - case PIPE_FORMAT_R8G8B8_UNORM: - return fetch_R8G8B8_UNORM; - case PIPE_FORMAT_R8G8B8A8_UNORM: - return fetch_R8G8B8A8_UNORM; - - case PIPE_FORMAT_R8_USCALED: - return fetch_R8_USCALED; - case PIPE_FORMAT_R8G8_USCALED: - return fetch_R8G8_USCALED; - case PIPE_FORMAT_R8G8B8_USCALED: - return fetch_R8G8B8_USCALED; - case PIPE_FORMAT_R8G8B8A8_USCALED: - return fetch_R8G8B8A8_USCALED; - - case PIPE_FORMAT_R8_SNORM: - return fetch_R8_SNORM; - case PIPE_FORMAT_R8G8_SNORM: - return fetch_R8G8_SNORM; - case PIPE_FORMAT_R8G8B8_SNORM: - return fetch_R8G8B8_SNORM; - case PIPE_FORMAT_R8G8B8A8_SNORM: - return fetch_R8G8B8A8_SNORM; - - case PIPE_FORMAT_R8_SSCALED: - return fetch_R8_SSCALED; - case PIPE_FORMAT_R8G8_SSCALED: - return fetch_R8G8_SSCALED; - case PIPE_FORMAT_R8G8B8_SSCALED: - return fetch_R8G8B8_SSCALED; - case PIPE_FORMAT_R8G8B8A8_SSCALED: - return fetch_R8G8B8A8_SSCALED; - - case PIPE_FORMAT_A8R8G8B8_UNORM: - return fetch_A8R8G8B8_UNORM; - - case 0: - return NULL; /* not sure why this is needed */ - - default: - assert(0); - return NULL; - } -} - - -static unsigned get_vertex_size( enum pipe_format format ) -{ - switch (format) { - case PIPE_FORMAT_R64_FLOAT: - return 8; - case PIPE_FORMAT_R64G64_FLOAT: - return 2 * 8; - case PIPE_FORMAT_R64G64B64_FLOAT: - return 3 * 8; - case PIPE_FORMAT_R64G64B64A64_FLOAT: - return 4 * 8; - - case PIPE_FORMAT_R32_SSCALED: - case PIPE_FORMAT_R32_SNORM: - case PIPE_FORMAT_R32_USCALED: - case PIPE_FORMAT_R32_UNORM: - case PIPE_FORMAT_R32_FLOAT: - return 4; - case PIPE_FORMAT_R32G32_SSCALED: - case PIPE_FORMAT_R32G32_SNORM: - case PIPE_FORMAT_R32G32_USCALED: - case PIPE_FORMAT_R32G32_UNORM: - case PIPE_FORMAT_R32G32_FLOAT: - return 2 * 4; - case PIPE_FORMAT_R32G32B32_SSCALED: - case PIPE_FORMAT_R32G32B32_SNORM: - case PIPE_FORMAT_R32G32B32_USCALED: - case PIPE_FORMAT_R32G32B32_UNORM: - case PIPE_FORMAT_R32G32B32_FLOAT: - return 3 * 4; - case PIPE_FORMAT_R32G32B32A32_SSCALED: - case PIPE_FORMAT_R32G32B32A32_SNORM: - case PIPE_FORMAT_R32G32B32A32_USCALED: - case PIPE_FORMAT_R32G32B32A32_UNORM: - case PIPE_FORMAT_R32G32B32A32_FLOAT: - return 4 * 4; - - case PIPE_FORMAT_R16_SSCALED: - case PIPE_FORMAT_R16_SNORM: - case PIPE_FORMAT_R16_UNORM: - case PIPE_FORMAT_R16_USCALED: - return 2; - case PIPE_FORMAT_R16G16_SSCALED: - case PIPE_FORMAT_R16G16_SNORM: - case PIPE_FORMAT_R16G16_USCALED: - case PIPE_FORMAT_R16G16_UNORM: - return 2 * 2; - case PIPE_FORMAT_R16G16B16_SSCALED: - case PIPE_FORMAT_R16G16B16_SNORM: - case PIPE_FORMAT_R16G16B16_USCALED: - case PIPE_FORMAT_R16G16B16_UNORM: - return 3 * 2; - case PIPE_FORMAT_R16G16B16A16_SSCALED: - case PIPE_FORMAT_R16G16B16A16_SNORM: - case PIPE_FORMAT_R16G16B16A16_USCALED: - case PIPE_FORMAT_R16G16B16A16_UNORM: - return 4 * 2; - - case PIPE_FORMAT_R8_SSCALED: - case PIPE_FORMAT_R8_SNORM: - case PIPE_FORMAT_R8_USCALED: - case PIPE_FORMAT_R8_UNORM: - return 1; - case PIPE_FORMAT_R8G8_SSCALED: - case PIPE_FORMAT_R8G8_SNORM: - case PIPE_FORMAT_R8G8_USCALED: - case PIPE_FORMAT_R8G8_UNORM: - return 2 * 1; - case PIPE_FORMAT_R8G8B8_SSCALED: - case PIPE_FORMAT_R8G8B8_SNORM: - case PIPE_FORMAT_R8G8B8_USCALED: - case PIPE_FORMAT_R8G8B8_UNORM: - return 3 * 1; - case PIPE_FORMAT_A8R8G8B8_UNORM: - case PIPE_FORMAT_R8G8B8A8_SSCALED: - case PIPE_FORMAT_R8G8B8A8_SNORM: - case PIPE_FORMAT_R8G8B8A8_USCALED: - case PIPE_FORMAT_R8G8B8A8_UNORM: - return 4 * 1; - - case 0: - return 0; /* not sure why this is needed */ - - default: - assert(0); - return 0; - } -} - - /** * Fetch vertex attributes for 'count' vertices. */ @@ -612,10 +161,10 @@ static void generic_vertex_fetch(struct spu_vs_context *draw, /* loop over vertex attributes (vertex shader inputs) */ for (attr = 0; attr < nr_attrs; attr++) { - const qword default_values = (qword)(vec_float4){ 0.0, 0.0, 0.0, 1.0 }; const unsigned pitch = draw->vertex_fetch.pitch[attr]; const uint64_t src = draw->vertex_fetch.src_ptr[attr]; - const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr]; + const spu_fetch_func fetch = (spu_fetch_func) + (draw->vertex_fetch.code + draw->vertex_fetch.code_offset[attr]); unsigned i; unsigned idx; const unsigned bytes_per_entry = draw->vertex_fetch.size[attr]; @@ -644,8 +193,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw, /* Convert all 4 vertices to vectors of float. */ - (*fetch)(&machine->Inputs[attr].xyzw[0].q, in, default_values, - fetch_shuffle_data); + (*fetch)(&machine->Inputs[attr].xyzw[0].q, in, fetch_shuffle_data); } } @@ -662,12 +210,5 @@ void spu_update_vertex_fetch( struct spu_vs_context *draw ) } - for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) { - draw->vertex_fetch.fetch[i] = - get_fetch_func(draw->vertex_fetch.format[i]); - draw->vertex_fetch.size[i] = - get_vertex_size(draw->vertex_fetch.format[i]); - } - draw->vertex_fetch.fetch_func = generic_vertex_fetch; } diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.h b/src/gallium/drivers/cell/spu/spu_vertex_shader.h index b5bf31e67d..0fb0bc28d0 100644 --- a/src/gallium/drivers/cell/spu/spu_vertex_shader.h +++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.h @@ -6,8 +6,6 @@ struct spu_vs_context; -typedef void (*spu_fetch_func)(qword *out, const qword *in, qword defaults, - const qword *shuffle_data); typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw, struct spu_exec_machine *machine, const unsigned *elts, @@ -20,12 +18,12 @@ struct spu_vs_context { uint64_t src_ptr[PIPE_ATTRIB_MAX]; unsigned pitch[PIPE_ATTRIB_MAX]; unsigned size[PIPE_ATTRIB_MAX]; - enum pipe_format format[PIPE_ATTRIB_MAX]; + unsigned code_offset[PIPE_ATTRIB_MAX]; unsigned nr_attrs; boolean dirty; - spu_fetch_func fetch[PIPE_ATTRIB_MAX]; spu_full_fetch_func fetch_func; + void *code; } vertex_fetch; /* Clip derived state: -- cgit v1.2.3 From 66be2810c3be07dd1ee45a60cfc632725837f2cd Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Mon, 18 Feb 2008 18:55:39 -0800 Subject: Cell: emit vertex shaders and uniforms more intelligently --- src/gallium/drivers/cell/common.h | 22 ++++---- src/gallium/drivers/cell/ppu/cell_state_emit.c | 18 ++++++ src/gallium/drivers/cell/ppu/cell_vertex_shader.c | 17 +++--- src/gallium/drivers/cell/spu/spu_main.c | 9 +++ src/gallium/drivers/cell/spu/spu_vertex_fetch.c | 6 +- src/gallium/drivers/cell/spu/spu_vertex_shader.c | 68 ++++++++++------------- src/gallium/drivers/cell/spu/spu_vertex_shader.h | 5 ++ 7 files changed, 85 insertions(+), 60 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 74b131fbef..cf892206c6 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -87,10 +87,12 @@ #define CELL_CMD_STATE_TEXTURE 13 #define CELL_CMD_STATE_VERTEX_INFO 14 #define CELL_CMD_STATE_VIEWPORT 15 -#define CELL_CMD_STATE_VS_ARRAY_INFO 16 -#define CELL_CMD_STATE_BLEND 17 -#define CELL_CMD_VS_EXECUTE 18 -#define CELL_CMD_STATE_ATTRIB_FETCH 19 +#define CELL_CMD_STATE_UNIFORMS 16 +#define CELL_CMD_STATE_VS_ARRAY_INFO 17 +#define CELL_CMD_STATE_BIND_VS 18 +#define CELL_CMD_STATE_BLEND 19 +#define CELL_CMD_STATE_ATTRIB_FETCH 20 +#define CELL_CMD_VS_EXECUTE 21 #define CELL_NUM_BUFFERS 4 @@ -144,14 +146,13 @@ struct cell_attribute_fetch_code { struct cell_shader_info { - unsigned num_outputs; - uint64_t declarations; - unsigned num_declarations; uint64_t instructions; - unsigned num_instructions; - uint64_t uniforms; uint64_t immediates; + + unsigned num_outputs; + unsigned num_declarations; + unsigned num_instructions; unsigned num_immediates; } ALIGN16_ATTRIB; @@ -160,10 +161,9 @@ struct cell_shader_info struct cell_command_vs { uint64_t opcode; /**< CELL_CMD_VS_EXECUTE */ - struct cell_shader_info shader; + uint64_t vOut[SPU_VERTS_PER_BATCH]; unsigned num_elts; unsigned elts[SPU_VERTS_PER_BATCH]; - uint64_t vOut[SPU_VERTS_PER_BATCH]; float plane[12][4]; unsigned nr_planes; unsigned nr_attrs; diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 5d2a786449..49c0d130c5 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -31,6 +31,8 @@ #include "cell_state_emit.h" #include "cell_batch.h" #include "cell_texture.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" static void @@ -100,4 +102,20 @@ cell_emit_state(struct cell_context *cell) emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO, &cell->vertex_info, sizeof(struct vertex_info)); } + + if (cell->dirty & CELL_NEW_VS) { + const struct draw_context *const draw = cell->draw; + struct cell_shader_info info; + + info.num_outputs = draw->num_vs_outputs; + info.declarations = (uintptr_t) draw->machine.Declarations; + info.num_declarations = draw->machine.NumDeclarations; + info.instructions = (uintptr_t) draw->machine.Instructions; + info.num_instructions = draw->machine.NumInstructions; + info.immediates = (uintptr_t) draw->machine.Imms; + info.num_immediates = draw->machine.ImmLimit / 4; + + emit_state_cmd(cell, CELL_CMD_STATE_BIND_VS, + & info, sizeof(info)); + } } diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c index 6a1d3bc20a..64c7821c19 100644 --- a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c +++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c @@ -54,6 +54,7 @@ cell_vertex_shader_queue_flush(struct draw_context *draw) struct cell_command_vs *const vs = &cell_global.command[0].vs; uint64_t *batch; struct cell_array_info *array_info; + struct cell_shader_info *shader_info; unsigned i, j; struct cell_attribute_fetch_code *cf; @@ -100,17 +101,17 @@ cell_vertex_shader_queue_flush(struct draw_context *draw) (void) memcpy(&batch[1], &draw->viewport, sizeof(struct pipe_viewport_state)); + { + uint64_t uniforms = (uintptr_t) draw->user.constants; + + batch = cell_batch_alloc(cell, 2 *sizeof(batch[0])); + batch[0] = CELL_CMD_STATE_UNIFORMS; + batch[1] = uniforms; + } + cell_batch_flush(cell); vs->opcode = CELL_CMD_VS_EXECUTE; - vs->shader.num_outputs = draw->num_vs_outputs; - vs->shader.declarations = (uintptr_t) draw->machine.Declarations; - vs->shader.num_declarations = draw->machine.NumDeclarations; - vs->shader.instructions = (uintptr_t) draw->machine.Instructions; - vs->shader.num_instructions = draw->machine.NumInstructions; - vs->shader.uniforms = (uintptr_t) draw->user.constants; - vs->shader.immediates = (uintptr_t) draw->machine.Imms; - vs->shader.num_immediates = draw->machine.ImmLimit / 4; vs->nr_attrs = draw->vertex_fetch.nr_attrs; (void) memcpy(vs->plane, draw->plane, sizeof(draw->plane)); diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index fcbf0f841e..dbc3705c24 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -433,10 +433,19 @@ cmd_batch(uint opcode) sizeof(struct pipe_viewport_state)); pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8); break; + case CELL_CMD_STATE_UNIFORMS: + draw.constants = (float (*)[4]) (uintptr_t) buffer[pos + 1]; + pos += 2; + break; case CELL_CMD_STATE_VS_ARRAY_INFO: cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]); pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8); break; + case CELL_CMD_STATE_BIND_VS: + spu_bind_vertex_shader(&draw, + (struct cell_shader_info *) &buffer[pos+1]); + pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8); + break; case CELL_CMD_STATE_ATTRIB_FETCH: { struct cell_attribute_fetch_code *code = (struct cell_attribute_fetch_code *) &buffer[pos+1]; diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c index 55c6c28717..e5d9910ff3 100644 --- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c +++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c @@ -64,7 +64,7 @@ typedef void (*spu_fetch_func)(qword *out, const qword *in, const qword *shuffle_data); -static const qword fetch_shuffle_data[] = { +static const qword fetch_shuffle_data[5] ALIGN16_ATTRIB = { /* Shuffle used by CVT_64_FLOAT */ { @@ -108,7 +108,7 @@ static const qword fetch_shuffle_data[] = { static INLINE void fetch_unaligned(qword *dst, unsigned ea, unsigned size) { - qword tmp[4]; + qword tmp[4] ALIGN16_ATTRIB; const int shift = ea & 0x0f; const unsigned aligned_start_ea = ea & ~0x0f; const unsigned aligned_end_ea = (ea + size) & ~0x0f; @@ -169,7 +169,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw, unsigned idx; const unsigned bytes_per_entry = draw->vertex_fetch.size[attr]; const unsigned quads_per_entry = (bytes_per_entry + 15) / 16; - qword in[2 * 4]; + qword in[2 * 4] ALIGN16_ATTRIB; /* Fetch four attributes for four vertices. diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c index 3f5bf41aa2..8363efeeb6 100644 --- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c +++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c @@ -165,63 +165,55 @@ run_vertex_program(struct spu_vs_context *draw, } -static void -spu_bind_vertex_shader(struct spu_vs_context *draw, - void *uniforms, - void *planes, - unsigned nr_planes, - unsigned num_outputs - ) -{ - draw->constants = (float (*)[4]) uniforms; - - (void) memcpy(draw->plane, planes, sizeof(float) * 4 * nr_planes); - draw->nr_planes = nr_planes; - draw->num_vs_outputs = num_outputs; - - /* specify the shader to interpret/execute */ - spu_exec_machine_init(&draw->machine, - PIPE_MAX_SAMPLERS, - NULL /*samplers*/, - PIPE_SHADER_VERTEX); -} - - unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32] ALIGN16_ATTRIB; + void -spu_execute_vertex_shader(struct spu_vs_context *draw, - const struct cell_command_vs *vs) +spu_bind_vertex_shader(struct spu_vs_context *draw, + struct cell_shader_info *vs) { - unsigned i; - - const uint64_t immediate_addr = vs->shader.immediates; + const unsigned immediate_addr = vs->immediates; const unsigned immediate_size = - ROUNDUP16((sizeof(float) * 4 * vs->shader.num_immediates) - + (immediate_addr & 0x0f)); + ROUNDUP16((sizeof(float) * 4 * vs->num_immediates) + + (immediate_addr & 0x0f)); + mfc_get(immediates, immediate_addr & ~0x0f, immediate_size, TAG_VERTEX_BUFFER, 0, 0); draw->machine.Instructions = (struct tgsi_full_instruction *) - vs->shader.instructions; - draw->machine.NumInstructions = vs->shader.num_instructions; + vs->instructions; + draw->machine.NumInstructions = vs->num_instructions; draw->machine.Declarations = (struct tgsi_full_declaration *) - vs->shader.declarations; - draw->machine.NumDeclarations = vs->shader.num_declarations; + vs->declarations; + draw->machine.NumDeclarations = vs->num_declarations; - draw->vertex_fetch.nr_attrs = vs->nr_attrs; + draw->num_vs_outputs = vs->num_outputs; + + /* specify the shader to interpret/execute */ + spu_exec_machine_init(&draw->machine, + PIPE_MAX_SAMPLERS, + NULL /*samplers*/, + PIPE_SHADER_VERTEX); wait_on_mask(1 << TAG_VERTEX_BUFFER); (void) memcpy(& draw->machine.Imms, &immediates[immediate_addr & 0x0f], - sizeof(float) * 4 * vs->shader.num_immediates); + sizeof(float) * 4 * vs->num_immediates); +} - spu_bind_vertex_shader(draw, vs->shader.uniforms, - vs->plane, vs->nr_planes, - vs->shader.num_outputs); + +void +spu_execute_vertex_shader(struct spu_vs_context *draw, + const struct cell_command_vs *vs) +{ + unsigned i; + + (void) memcpy(draw->plane, vs->plane, sizeof(float) * 4 * vs->nr_planes); + draw->nr_planes = vs->nr_planes; + draw->vertex_fetch.nr_attrs = vs->nr_attrs; for (i = 0; i < vs->num_elts; i += 4) { const unsigned batch_size = MIN2(vs->num_elts - i, 4); diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.h b/src/gallium/drivers/cell/spu/spu_vertex_shader.h index 0fb0bc28d0..54a4b8d9b9 100644 --- a/src/gallium/drivers/cell/spu/spu_vertex_shader.h +++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.h @@ -1,6 +1,7 @@ #ifndef SPU_VERTEX_SHADER_H #define SPU_VERTEX_SHADER_H +#include "cell/common.h" #include "pipe/p_format.h" #include "spu_exec.h" @@ -54,6 +55,10 @@ static INLINE void spu_vertex_fetch(struct spu_vs_context *draw, struct cell_command_vs; +extern void +spu_bind_vertex_shader(struct spu_vs_context *draw, + struct cell_shader_info *vs); + extern void spu_execute_vertex_shader(struct spu_vs_context *draw, const struct cell_command_vs *vs); -- cgit v1.2.3 From 7c74037852a484a8a50e8bc540b954a624de4d33 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Wed, 20 Feb 2008 14:32:25 -0800 Subject: Cell: Initial pass at unified data cache --- src/gallium/drivers/cell/common.h | 8 ++ src/gallium/drivers/cell/ppu/cell_draw_arrays.c | 6 +- src/gallium/drivers/cell/ppu/cell_flush.c | 14 ++++ src/gallium/drivers/cell/spu/spu_dcache.c | 100 ++++++++++++++++++++++++ src/gallium/drivers/cell/spu/spu_dcache.h | 34 ++++++++ src/gallium/drivers/cell/spu/spu_exec.c | 49 ++++++------ src/gallium/drivers/cell/spu/spu_main.c | 8 ++ src/gallium/drivers/cell/spu/spu_vertex_fetch.c | 70 +---------------- 8 files changed, 194 insertions(+), 95 deletions(-) create mode 100644 src/gallium/drivers/cell/spu/spu_dcache.c create mode 100644 src/gallium/drivers/cell/spu/spu_dcache.h (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index cf892206c6..f32ad5bfbe 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -93,6 +93,7 @@ #define CELL_CMD_STATE_BLEND 19 #define CELL_CMD_STATE_ATTRIB_FETCH 20 #define CELL_CMD_VS_EXECUTE 21 +#define CELL_CMD_FLUSH_BUFFER_RANGE 22 #define CELL_NUM_BUFFERS 4 @@ -144,6 +145,13 @@ struct cell_attribute_fetch_code { uint size; }; + +struct cell_buffer_range { + uint64_t base; + unsigned size; +}; + + struct cell_shader_info { uint64_t declarations; diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c index f12613649b..cbd387f014 100644 --- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c +++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c @@ -49,9 +49,12 @@ cell_map_constant_buffers(struct cell_context *sp) struct pipe_winsys *ws = sp->pipe.winsys; uint i; for (i = 0; i < 2; i++) { - if (sp->constants[i].size) + if (sp->constants[i].size) { sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i].buffer, PIPE_BUFFER_USAGE_CPU_READ); + cell_flush_buffer_range(sp, sp->mapped_constants[i], + sp->constants[i].buffer->size); + } } draw_set_mapped_constant_buffer(sp->draw, @@ -124,6 +127,7 @@ cell_draw_elements(struct pipe_context *pipe, void *buf = pipe->winsys->buffer_map(pipe->winsys, sp->vertex_buffer[i].buffer, PIPE_BUFFER_USAGE_CPU_READ); + cell_flush_buffer_range(sp, buf, sp->vertex_buffer[i].buffer->size); draw_set_mapped_vertex_buffer(draw, i, buf); } } diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c index 20f27531fc..66a5627d84 100644 --- a/src/gallium/drivers/cell/ppu/cell_flush.c +++ b/src/gallium/drivers/cell/ppu/cell_flush.c @@ -82,3 +82,17 @@ cell_flush_int(struct pipe_context *pipe, unsigned flags) flushing = FALSE; } + + +void +cell_flush_buffer_range(struct cell_context *cell, void *ptr, + unsigned size) +{ + uint64_t batch[1 + (ROUNDUP8(sizeof(struct cell_buffer_range)) / 8)]; + struct cell_buffer_range *br = (struct cell_buffer_range *) & batch[1]; + + batch[0] = CELL_CMD_FLUSH_BUFFER_RANGE; + br->base = (uintptr_t) ptr; + br->size = size; + cell_batch_append(cell, batch, sizeof(batch)); +} diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c new file mode 100644 index 0000000000..9e30e17880 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_dcache.c @@ -0,0 +1,100 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "spu_main.h" +#include "spu_dcache.h" + +#define CACHE_NAME data +#define CACHED_TYPE qword +#define CACHE_TYPE CACHE_TYPE_RO +#define CACHE_SET_TAGID(set) TAG_VERTEX_BUFFER +#define CACHE_LOG2NNWAY 2 +#define CACHE_LOG2NSETS 6 +#include + +/* Yes folks, this is ugly. + */ +#undef CACHE_NWAY +#undef CACHE_NSETS +#define CACHE_NAME data +#define CACHE_NWAY 4 +#define CACHE_NSETS (1U << 6) + + +/** + * Fetch between arbitrary number of bytes from an unaligned address + */ +void +spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size) +{ + const int shift = ea & 0x0f; + const unsigned aligned_start_ea = ea & ~0x0f; + const unsigned aligned_end_ea = (ea + size) & ~0x0f; + const unsigned num_entries = ((aligned_end_ea - aligned_start_ea) / 16) + 1; + unsigned i; + + + if (shift == 0) { + /* Data is already aligned. Fetch directly into the destination buffer. + */ + for (i = 0; i < num_entries; i++) { + dst[i] = cache_rd(data, (ea & ~0x0f) + (i * 16)); + } + } else { + qword tmp[2] ALIGN16_ATTRIB; + + + tmp[0] = cache_rd(data, (ea & ~0x0f)); + for (i = 0; i < (num_entries & ~1); i++) { + const unsigned curr = i & 1; + const unsigned next = curr ^ 1; + + tmp[next] = cache_rd(data, (ea & ~0x0f) + (next * 16)); + + dst[i] = si_or((qword) spu_slqwbyte(tmp[curr], shift), + (qword) spu_rlmaskqwbyte(tmp[next], shift - 16)); + } + + if (i < num_entries) { + dst[i] = si_or((qword) spu_slqwbyte(tmp[(i & 1)], shift), + si_il(0)); + } + } +} + + +void +spu_dcache_mark_dirty(unsigned ea, unsigned size) +{ + unsigned i; + + (void) ea; + (void) size; + + /* Invalidate the whole cache for now. + */ + for (i = 0; i < (CACHE_NWAY * CACHE_NSETS); i++) { + CACHELINE_CLEARVALID(i); + } +} diff --git a/src/gallium/drivers/cell/spu/spu_dcache.h b/src/gallium/drivers/cell/spu/spu_dcache.h new file mode 100644 index 0000000000..7a06b8c25a --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_dcache.h @@ -0,0 +1,34 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef SPU_DCACHE_H +#define SPU_DCACHE_H + +extern void +spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size); + +extern void +spu_dcache_mark_dirty(unsigned ea, unsigned size); + +#endif /* SPU_DCACHE_H */ diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c index 0eb5ea1a3f..94ac6a2885 100644 --- a/src/gallium/drivers/cell/spu/spu_exec.c +++ b/src/gallium/drivers/cell/spu/spu_exec.c @@ -72,6 +72,7 @@ #include "spu_exec.h" #include "spu_main.h" #include "spu_vertex_shader.h" +#include "spu_dcache.h" #define TILE_TOP_LEFT 0 #define TILE_TOP_RIGHT 1 @@ -352,19 +353,17 @@ fetch_src_file_channel( case TGSI_EXTSWIZZLE_W: switch( file ) { case TGSI_FILE_CONSTANT: { - unsigned char buffer[32] ALIGN16_ATTRIB; unsigned i; for (i = 0; i < 4; i++) { const float *ptr = mach->Consts[index->i[i]]; - const uint64_t addr = (uint64_t)(uintptr_t) ptr; - const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32; + float tmp[4]; - mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0); - wait_on_mask(1 << TAG_VERTEX_BUFFER); + spu_dcache_fetch_unaligned((qword *) tmp, + (uintptr_t)(ptr + swizzle), + sizeof(float)); - (void) memcpy(& chan->f[i], &buffer[(addr & 0x0f) - + (sizeof(float) * swizzle)], sizeof(float)); + chan->f[i] = tmp[0]; } break; } @@ -1899,32 +1898,30 @@ spu_exec_machine_run( struct spu_exec_machine *mach ) /* execute declarations (interpolants) */ if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) { for (i = 0; i < mach->NumDeclarations; i++) { - uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB; - struct tgsi_full_declaration decl; - unsigned long decl_addr = (unsigned long) (mach->Declarations+i); - unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f); + union { + struct tgsi_full_declaration decl; + qword buffer[2 * ((sizeof(struct tgsi_full_declaration) + 31) + / 32)]; + } d ALIGN16_ATTRIB; + unsigned ea = (unsigned) (mach->Declarations + pc); - mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0); - wait_on_mask(1 << TAG_INSTRUCTION_FETCH); + spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl)); - memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl)); - exec_declaration( mach, &decl ); + exec_declaration( mach, &d.decl ); } } /* execute instructions, until pc is set to -1 */ while (pc != -1) { - uint8_t buffer[sizeof(struct tgsi_full_instruction) + 32] ALIGN16_ATTRIB; - struct tgsi_full_instruction inst; - unsigned long inst_addr = (unsigned long) (mach->Instructions + pc); - unsigned size = ((sizeof(inst) + (inst_addr & 0x0f) + 0x0f) & ~0x0f); - - assert(pc < mach->NumInstructions); - mfc_get(buffer, inst_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0); - wait_on_mask(1 << TAG_INSTRUCTION_FETCH); - - memcpy(& inst, buffer + (inst_addr & 0x0f), sizeof(inst)); - exec_instruction( mach, & inst, &pc ); + union { + struct tgsi_full_instruction inst; + qword buffer[2 * ((sizeof(struct tgsi_full_instruction) + 31) + / 32)]; + } i ALIGN16_ATTRIB; + unsigned ea = (unsigned) (mach->Instructions + pc); + + spu_dcache_fetch_unaligned(i.buffer, ea, sizeof(i.inst)); + exec_instruction( mach, & i.inst, &pc ); } #if 0 diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index dbc3705c24..1136dba62d 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -462,6 +462,14 @@ cmd_batch(uint opcode) pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8); break; } + case CELL_CMD_FLUSH_BUFFER_RANGE: { + struct cell_buffer_range *br = (struct cell_buffer_range *) + &buffer[pos+1]; + + spu_dcache_mark_dirty((unsigned) br->base, br->size); + pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8); + break; + } default: printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]); ASSERT(0); diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c index e5d9910ff3..f7e4e653e3 100644 --- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c +++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c @@ -40,25 +40,7 @@ #include "spu_exec.h" #include "spu_vertex_shader.h" #include "spu_main.h" - -#define CACHE_NAME attribute -#define CACHED_TYPE qword -#define CACHE_TYPE CACHE_TYPE_RO -#define CACHE_SET_TAGID(set) TAG_VERTEX_BUFFER -#define CACHE_LOG2NNWAY 2 -#define CACHE_LOG2NSETS 6 -#include - -/* Yes folks, this is ugly. - */ -#undef CACHE_NWAY -#undef CACHE_NSETS -#define CACHE_NAME attribute -#define CACHE_NWAY 4 -#define CACHE_NSETS (1U << 6) - - -#define DRAW_DBG 0 +#include "spu_dcache.h" typedef void (*spu_fetch_func)(qword *out, const qword *in, const qword *shuffle_data); @@ -102,44 +84,6 @@ static const qword fetch_shuffle_data[5] ALIGN16_ATTRIB = { }; -/** - * Fetch between 1 and 32 bytes from an unaligned address - */ -static INLINE void -fetch_unaligned(qword *dst, unsigned ea, unsigned size) -{ - qword tmp[4] ALIGN16_ATTRIB; - const int shift = ea & 0x0f; - const unsigned aligned_start_ea = ea & ~0x0f; - const unsigned aligned_end_ea = (ea + size) & ~0x0f; - const unsigned num_entries = ((aligned_end_ea - aligned_start_ea) / 16) + 1; - unsigned i; - - - if (shift == 0) { - /* Data is already aligned. Fetch directly into the destination buffer. - */ - for (i = 0; i < num_entries; i++) { - dst[i] = cache_rd(attribute, (ea & ~0x0f) + (i * 16)); - } - } else { - /* Fetch data from the cache to the local buffer. - */ - for (i = 0; i < num_entries; i++) { - tmp[i] = cache_rd(attribute, (ea & ~0x0f) + (i * 16)); - } - - - /* Fix the alignment of the data and write to the destination buffer. - */ - for (i = 0; i < ((size + 15) / 16); i++) { - dst[i] = si_or((qword) spu_slqwbyte(tmp[i], shift), - (qword) spu_rlmaskqwbyte(tmp[i + 1], shift - 16)); - } - } -} - - /** * Fetch vertex attributes for 'count' vertices. */ @@ -182,7 +126,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw, printf("SPU: fetching = 0x%llx\n", addr); #endif - fetch_unaligned(& in[idx], addr, bytes_per_entry); + spu_dcache_fetch_unaligned(& in[idx], addr, bytes_per_entry); idx += quads_per_entry; } @@ -200,15 +144,5 @@ static void generic_vertex_fetch(struct spu_vs_context *draw, void spu_update_vertex_fetch( struct spu_vs_context *draw ) { - unsigned i; - - - /* Invalidate the vertex cache. - */ - for (i = 0; i < (CACHE_NWAY * CACHE_NSETS); i++) { - CACHELINE_CLEARVALID(i); - } - - draw->vertex_fetch.fetch_func = generic_vertex_fetch; } -- cgit v1.2.3 From de5c64e0af0b1a1ce3ee12f361341880dc260868 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Thu, 21 Feb 2008 10:32:02 -0800 Subject: Cell: Remove erroneous ALIGN16_ATTRIB attributes If a structure is marked as being aligned the SPE compiler performs extra optimizations (sadly, only -O2 is used) when reading the structure. Since most of the structures sent in batch buffers are only 8-byte aligned, this resulted in mysterous bugs with -O2. --- src/gallium/drivers/cell/common.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index f32ad5bfbe..9a4004535e 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -137,7 +137,7 @@ struct cell_array_info uint pitch; /**< Byte pitch from one entry to the next. */ uint size; uint function_offset; -} ALIGN16_ATTRIB; +}; struct cell_attribute_fetch_code { @@ -162,7 +162,7 @@ struct cell_shader_info unsigned num_declarations; unsigned num_instructions; unsigned num_immediates; -} ALIGN16_ATTRIB; +}; #define SPU_VERTS_PER_BATCH 64 @@ -175,7 +175,7 @@ struct cell_command_vs float plane[12][4]; unsigned nr_planes; unsigned nr_attrs; -} ALIGN16_ATTRIB; +}; struct cell_command_render -- cgit v1.2.3 From 1936e4bdfd776f78f9fe44f77ce66066fd166360 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Mon, 17 Mar 2008 15:45:52 -0700 Subject: cell: Initial code-gen for alpha / stencil / depth testing Alpha test is currently broken because all per-fragment testing occurs before alpha is calculated. Stencil test is currently broken because the Z-clear code asserts if there is a stencil buffer. --- src/gallium/drivers/cell/common.h | 10 + src/gallium/drivers/cell/ppu/Makefile | 1 + src/gallium/drivers/cell/ppu/cell_context.h | 25 +- src/gallium/drivers/cell/ppu/cell_pipe_state.c | 37 +- src/gallium/drivers/cell/ppu/cell_state_emit.c | 24 +- .../drivers/cell/ppu/cell_state_per_fragment.c | 1020 ++++++++++++++++++++ .../drivers/cell/ppu/cell_state_per_fragment.h | 35 + src/gallium/drivers/cell/spu/Makefile | 1 + src/gallium/drivers/cell/spu/spu_main.c | 25 +- src/gallium/drivers/cell/spu/spu_main.h | 16 +- src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 191 ++++ src/gallium/drivers/cell/spu/spu_per_fragment_op.h | 32 + src/gallium/drivers/cell/spu/spu_render.c | 4 +- src/gallium/drivers/cell/spu/spu_tri.c | 23 +- src/gallium/drivers/cell/spu/spu_ztest.h | 135 --- 15 files changed, 1409 insertions(+), 170 deletions(-) create mode 100644 src/gallium/drivers/cell/ppu/cell_state_per_fragment.c create mode 100644 src/gallium/drivers/cell/ppu/cell_state_per_fragment.h create mode 100644 src/gallium/drivers/cell/spu/spu_per_fragment_op.c create mode 100644 src/gallium/drivers/cell/spu/spu_per_fragment_op.h delete mode 100644 src/gallium/drivers/cell/spu/spu_ztest.h (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 9a4004535e..fe93fd8e1a 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -104,6 +104,16 @@ +/** + */ +struct cell_command_depth_stencil_alpha_test { + uint64_t base; /**< Effective address of code start. */ + unsigned size; /**< Size in bytes of test code. */ + unsigned read_depth; /**< Flag: should depth be read? */ + unsigned read_stencil; /**< Flag: should stencil be read? */ +}; + + /** * Tell SPUs about the framebuffer size, location */ diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile index d38fa6ce07..0389a9554c 100644 --- a/src/gallium/drivers/cell/ppu/Makefile +++ b/src/gallium/drivers/cell/ppu/Makefile @@ -27,6 +27,7 @@ SOURCES = \ cell_flush.c \ cell_state_derived.c \ cell_state_emit.c \ + cell_state_per_fragment.c \ cell_state_shader.c \ cell_pipe_state.c \ cell_screen.c \ diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h index b221424323..9e79db0ace 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.h +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -57,16 +57,37 @@ struct cell_fragment_shader_state }; +struct cell_blend_state { + struct pipe_blend_state base; + + /** + * Generated code to perform alpha blending + */ + struct spe_function code; +}; + + +struct cell_depth_stencil_alpha_state { + struct pipe_depth_stencil_alpha_state base; + + /** + * Generated code to perform alpha, stencil, and depth testing on the SPE + */ + struct spe_function code; + +}; + + struct cell_context { struct pipe_context pipe; struct cell_winsys *winsys; - const struct pipe_blend_state *blend; + const struct cell_blend_state *blend; const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS]; uint num_samplers; - const struct pipe_depth_stencil_alpha_state *depth_stencil; + const struct cell_depth_stencil_alpha_state *depth_stencil; const struct pipe_rasterizer_state *rasterizer; const struct cell_vertex_shader_state *vs; const struct cell_fragment_shader_state *fs; diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c index 025ed3bbbf..66ede99d13 100644 --- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c +++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c @@ -36,6 +36,7 @@ #include "cell_context.h" #include "cell_state.h" #include "cell_texture.h" +#include "cell_state_per_fragment.h" @@ -43,7 +44,12 @@ static void * cell_create_blend_state(struct pipe_context *pipe, const struct pipe_blend_state *blend) { - return mem_dup(blend, sizeof(*blend)); + struct cell_blend_state *cb = MALLOC(sizeof(struct cell_blend_state)); + + (void) memcpy(cb, blend, sizeof(*blend)); + cb->code.store = NULL; + + return cb; } @@ -54,7 +60,7 @@ cell_bind_blend_state(struct pipe_context *pipe, void *blend) draw_flush(cell->draw); - cell->blend = (const struct pipe_blend_state *)blend; + cell->blend = (const struct cell_blend_state *)blend; cell->dirty |= CELL_NEW_BLEND; } @@ -63,7 +69,10 @@ cell_bind_blend_state(struct pipe_context *pipe, void *blend) static void cell_delete_blend_state(struct pipe_context *pipe, void *blend) { - FREE(blend); + struct cell_blend_state *cb = (struct cell_blend_state *) blend; + + spe_release_func(& cb->code); + FREE(cb); } @@ -87,7 +96,13 @@ static void * cell_create_depth_stencil_alpha_state(struct pipe_context *pipe, const struct pipe_depth_stencil_alpha_state *depth_stencil) { - return mem_dup(depth_stencil, sizeof(*depth_stencil)); + struct cell_depth_stencil_alpha_state *cdsa = + MALLOC(sizeof(struct cell_depth_stencil_alpha_state)); + + (void) memcpy(cdsa, depth_stencil, sizeof(*depth_stencil)); + cdsa->code.store = NULL; + + return cdsa; } @@ -96,12 +111,16 @@ cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe, void *depth_stencil) { struct cell_context *cell = cell_context(pipe); + struct cell_depth_stencil_alpha_state *cdsa = + (struct cell_depth_stencil_alpha_state *) depth_stencil; draw_flush(cell->draw); - cell->depth_stencil - = (const struct pipe_depth_stencil_alpha_state *) depth_stencil; + if (cdsa->code.store == NULL) { + cell_generate_depth_stencil_test(cdsa); + } + cell->depth_stencil = cdsa; cell->dirty |= CELL_NEW_DEPTH_STENCIL; } @@ -109,7 +128,11 @@ cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe, static void cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *depth) { - FREE(depth); + struct cell_depth_stencil_alpha_state *cdsa = + (struct cell_depth_stencil_alpha_state *) depth; + + spe_release_func(& cdsa->code); + FREE(cdsa); } diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 670eb26bdd..c8d5fdf709 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -71,9 +71,27 @@ cell_emit_state(struct cell_context *cell) } if (cell->dirty & CELL_NEW_DEPTH_STENCIL) { - emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL, - cell->depth_stencil, - sizeof(struct pipe_depth_stencil_alpha_state)); + struct cell_command_depth_stencil_alpha_test dsat; + + + dsat.base = (intptr_t) cell->depth_stencil->code.store; + dsat.size = (char *) cell->depth_stencil->code.csr + - (char *) cell->depth_stencil->code.store; + dsat.read_depth = TRUE; + dsat.read_stencil = FALSE; + + { + uint32_t *p = cell->depth_stencil->code.store; + + printf("\t.text\n"); + for (/* empty */; p < cell->depth_stencil->code.csr; p++) { + printf("\t.long\t0x%04x\n", *p); + } + fflush(stdout); + } + + emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL, &dsat, + sizeof(dsat)); } if (cell->dirty & CELL_NEW_SAMPLER) { diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c new file mode 100644 index 0000000000..60b64438d8 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c @@ -0,0 +1,1020 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file + * Generate code to perform all per-fragment operations. + * + * Code generated by these functions perform both alpha, depth, and stencil + * testing as well as alpha blending. + * + * \note + * Occlusion query is not supported, but this is the right place to add that + * support. + * + * \author Ian Romanick + */ + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "cell_context.h" + +#include "rtasm/rtasm_ppc_spe.h" + + +/** + * Generate code to perform alpha testing. + * + * The code generated by this function uses the register specificed by + * \c mask as both an input and an output. + * + * \param dsa Current alpha-test state + * \param f Function to which code should be appended + * \param mask Index of register containing active fragment mask + * \param alphas Index of register containing per-fragment alpha values + * + * \note Emits a maximum of 6 instructions. + */ +static void +emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa, + struct spe_function *f, int mask, int alphas) +{ + /* If the alpha function is either NEVER or ALWAYS, there is no need to + * load the reference value into a register. ALWAYS is a fairly common + * case, and this optimization saves 2 instructions. + */ + if (dsa->alpha.enabled + && (dsa->alpha.func != PIPE_FUNC_NEVER) + && (dsa->alpha.func != PIPE_FUNC_ALWAYS)) { + int ref = spe_allocate_available_register(f); + int tmp_a = spe_allocate_available_register(f); + int tmp_b = spe_allocate_available_register(f); + union { + float f; + unsigned u; + } ref_val; + boolean complement = FALSE; + + ref_val.f = dsa->alpha.ref; + + spe_il(f, ref, ref_val.u & 0x0000ffff); + spe_ilh(f, ref, ref_val.u >> 16); + + switch (dsa->alpha.func) { + case PIPE_FUNC_NOTEQUAL: + complement = TRUE; + /* FALLTHROUGH */ + + case PIPE_FUNC_EQUAL: + spe_fceq(f, tmp_a, ref, alphas); + break; + + case PIPE_FUNC_LEQUAL: + complement = TRUE; + /* FALLTHROUGH */ + + case PIPE_FUNC_GREATER: + spe_fcgt(f, tmp_a, ref, alphas); + break; + + case PIPE_FUNC_LESS: + complement = TRUE; + /* FALLTHROUGH */ + + case PIPE_FUNC_GEQUAL: + spe_fcgt(f, tmp_a, ref, alphas); + spe_fceq(f, tmp_b, ref, alphas); + spe_or(f, tmp_a, tmp_b, tmp_a); + break; + + case PIPE_FUNC_ALWAYS: + case PIPE_FUNC_NEVER: + default: + assert(0); + break; + } + + if (complement) { + spe_andc(f, mask, mask, tmp_a); + } else { + spe_and(f, mask, mask, tmp_a); + } + + spe_release_register(f, ref); + spe_release_register(f, tmp_a); + spe_release_register(f, tmp_b); + } else if (dsa->alpha.enabled && (dsa->alpha.func == PIPE_FUNC_NEVER)) { + spe_il(f, mask, 0); + } +} + + +/** + * \param dsa Current depth-test state + * \param f Function to which code should be appended + * \param m Mask of allocated / free SPE registers + * \param mask Index of register to contain depth-pass mask + * \param stored Index of register containing values from depth buffer + * \param calculated Index of register containing per-fragment depth values + * + * \return + * If the calculated depth comparison mask is the actual mask, \c FALSE is + * returned. If the calculated depth comparison mask is the compliment of + * the actual mask, \c TRUE is returned. + * + * \note Emits a maximum of 3 instructions. + */ +static boolean +emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa, + struct spe_function *f, int mask, int stored, int calculated) +{ + unsigned func = (dsa->depth.enabled) + ? dsa->depth.func : PIPE_FUNC_ALWAYS; + int tmp = spe_allocate_available_register(f); + boolean compliment = FALSE; + + switch (func) { + case PIPE_FUNC_NEVER: + spe_il(f, mask, 0); + break; + + case PIPE_FUNC_NOTEQUAL: + compliment = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_EQUAL: + spe_ceq(f, mask, calculated, stored); + break; + + case PIPE_FUNC_LEQUAL: + compliment = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_GREATER: + spe_clgt(f, mask, calculated, stored); + break; + + case PIPE_FUNC_LESS: + compliment = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_GEQUAL: + spe_clgt(f, mask, calculated, stored); + spe_ceq(f, tmp, calculated, stored); + spe_or(f, mask, mask, tmp); + break; + + case PIPE_FUNC_ALWAYS: + spe_il(f, mask, ~0); + break; + + default: + assert(0); + break; + } + + spe_release_register(f, tmp); + return compliment; +} + + +/** + * \note Emits a maximum of 5 instructions. + */ +static void +emit_stencil_op(struct spe_function *f, + int out, int in, int mask, unsigned op, unsigned ref) +{ + const int clamp = spe_allocate_available_register(f); + const int tmp = spe_allocate_available_register(f); + + switch(op) { + case PIPE_STENCIL_OP_KEEP: + assert(0); + case PIPE_STENCIL_OP_ZERO: + spe_il(f, out, 0); + break; + case PIPE_STENCIL_OP_REPLACE: + spe_il(f, out, ref); + break; + case PIPE_STENCIL_OP_INCR: + spe_il(f, clamp, 0x0ff); + spe_ai(f, out, in, 1); + spe_cgti(f, tmp, out, clamp); + spe_selb(f, out, out, clamp, tmp); + break; + case PIPE_STENCIL_OP_DECR: + spe_il(f, clamp, 0); + spe_ai(f, out, in, -1); + spe_cgti(f, tmp, out, clamp); + spe_selb(f, out, clamp, out, tmp); + break; + case PIPE_STENCIL_OP_INCR_WRAP: + spe_ai(f, out, in, 1); + break; + case PIPE_STENCIL_OP_DECR_WRAP: + spe_ai(f, out, in, -1); + break; + case PIPE_STENCIL_OP_INVERT: + spe_nor(f, out, in, in); + break; + default: + assert(0); + } + + spe_release_register(f, tmp); + spe_release_register(f, clamp); + + spe_selb(f, out, in, out, mask); +} + + +/** + * \param dsa Depth / stencil test state + * \param face 0 for front face, 1 for back face + * \param f Function to append instructions to + * \param reg_mask Mask of allocated registers + * \param mask Register containing mask of fragments passing the + * alpha test + * \param depth_mask Register containing mask of fragments passing the + * depth test + * \param depth_compliment Is \c depth_mask the compliment of the actual mask? + * \param stencil Register containing values from stencil buffer + * \param depth_pass Register to store mask of fragments passing stencil test + * and depth test + * + * \note + * Emits a maximum of 10 + (3 * 5) = 25 instructions. + */ +static int +emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa, + unsigned face, + struct spe_function *f, + int mask, + int depth_mask, + boolean depth_complement, + int stencil, + int depth_pass) +{ + int stencil_fail = spe_allocate_available_register(f); + int depth_fail = spe_allocate_available_register(f); + int stencil_mask = spe_allocate_available_register(f); + int stencil_pass = spe_allocate_available_register(f); + int face_stencil = spe_allocate_available_register(f); + int stencil_src = stencil; + const unsigned ref = (dsa->stencil[face].ref_value + & dsa->stencil[face].value_mask); + boolean complement = FALSE; + int stored = spe_allocate_available_register(f); + int tmp = spe_allocate_available_register(f); + + + if ((dsa->stencil[face].func != PIPE_FUNC_NEVER) + && (dsa->stencil[face].func != PIPE_FUNC_ALWAYS) + && (dsa->stencil[face].value_mask != 0x0ff)) { + spe_andi(f, stored, stencil, dsa->stencil[face].value_mask); + } + + + switch (dsa->stencil[face].func) { + case PIPE_FUNC_NEVER: + spe_il(f, stencil_mask, 0); + break; + + case PIPE_FUNC_NOTEQUAL: + complement = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_EQUAL: + spe_ceqi(f, stencil_mask, stored, ref); + break; + + case PIPE_FUNC_LEQUAL: + complement = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_GREATER: + spe_clgti(f, stencil_mask, stored, ref); + break; + + case PIPE_FUNC_LESS: + complement = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_GEQUAL: + spe_clgti(f, stencil_mask, stored, ref); + spe_ceqi(f, tmp, stored, ref); + spe_or(f, stencil_mask, stencil_mask, tmp); + break; + + case PIPE_FUNC_ALWAYS: + /* See comment below. */ + break; + + default: + assert(0); + break; + } + + spe_release_register(f, stored); + spe_release_register(f, tmp); + + + /* ALWAYS is a very common stencil-test, so some effort is applied to + * optimize that case. The stencil-pass mask is the same as the input + * fragment mask. This makes the stencil-test (above) a no-op, and the + * input fragment mask can be "renamed" the stencil-pass mask. + */ + if (dsa->stencil[face].func == PIPE_FUNC_ALWAYS) { + spe_release_register(f, stencil_pass); + stencil_pass = mask; + } else { + if (complement) { + spe_andc(f, stencil_pass, mask, stencil_mask); + } else { + spe_and(f, stencil_pass, mask, stencil_mask); + } + } + + if (depth_complement) { + spe_andc(f, depth_pass, stencil_pass, depth_mask); + } else { + spe_and(f, depth_pass, stencil_pass, depth_mask); + } + + + /* Conditionally emit code to update the stencil value under various + * condititons. Note that there is no need to generate code under the + * following circumstances: + * + * - Stencil write mask is zero. + * - For stencil-fail if the stencil test is ALWAYS + * - For depth-fail if the stencil test is NEVER + * - For depth-pass if the stencil test is NEVER + * - Any of the 3 conditions if the operation is KEEP + */ + if (dsa->stencil[face].write_mask != 0) { + if ((dsa->stencil[face].func != PIPE_FUNC_ALWAYS) + && (dsa->stencil[face].fail_op != PIPE_STENCIL_OP_KEEP)) { + if (complement) { + spe_and(f, stencil_fail, mask, stencil_mask); + } else { + spe_andc(f, stencil_fail, mask, stencil_mask); + } + + emit_stencil_op(f, face_stencil, stencil_src, stencil_fail, + dsa->stencil[face].fail_op, + dsa->stencil[face].ref_value); + + stencil_src = face_stencil; + } + + if ((dsa->stencil[face].func != PIPE_FUNC_NEVER) + && (dsa->stencil[face].zfail_op != PIPE_STENCIL_OP_KEEP)) { + if (depth_complement) { + spe_and(f, depth_fail, stencil_pass, depth_mask); + } else { + spe_andc(f, depth_fail, stencil_pass, depth_mask); + } + + emit_stencil_op(f, face_stencil, stencil_src, depth_fail, + dsa->stencil[face].zfail_op, + dsa->stencil[face].ref_value); + stencil_src = face_stencil; + } + + if ((dsa->stencil[face].func != PIPE_FUNC_NEVER) + && (dsa->stencil[face].zpass_op != PIPE_STENCIL_OP_KEEP)) { + emit_stencil_op(f, face_stencil, stencil_src, depth_pass, + dsa->stencil[face].zpass_op, + dsa->stencil[face].ref_value); + stencil_src = face_stencil; + } + } + + spe_release_register(f, stencil_fail); + spe_release_register(f, depth_fail); + spe_release_register(f, stencil_mask); + if (stencil_pass != mask) { + spe_release_register(f, stencil_pass); + } + + /* If all of the stencil operations were KEEP or the stencil write mask was + * zero, "stencil_src" will still be set to "stencil". In this case + * release the "face_stencil" register. Otherwise apply the stencil write + * mask to select bits from the calculated stencil value and the previous + * stencil value. + */ + if (stencil_src == stencil) { + spe_release_register(f, face_stencil); + } else if (dsa->stencil[face].write_mask != 0x0ff) { + int tmp = spe_allocate_available_register(f); + + spe_il(f, tmp, dsa->stencil[face].write_mask); + spe_selb(f, stencil_src, stencil, stencil_src, tmp); + + spe_release_register(f, tmp); + } + + return stencil_src; +} + + +void +cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa) +{ + struct pipe_depth_stencil_alpha_state *const dsa = &cdsa->base; + struct spe_function *const f = &cdsa->code; + + /* This code generates a maximum of 6 (alpha test) + 3 (depth test) + * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions. Round + * up to 64 to make it a happy power-of-two. + */ + spe_init_func(f, 4 * 64); + + + /* Allocate registers for the function's input parameters. Cleverly (and + * clever code is usually dangerous, but I couldn't resist) the generated + * function returns a structure. Returned structures start with register + * 3, and the structure fields are ordered to match up exactly with the + * input parameters. + */ + int mask = spe_allocate_register(f, 3); + int depth = spe_allocate_register(f, 4); + int stencil = spe_allocate_register(f, 5); + int zvals = spe_allocate_register(f, 6); + int frag_a = spe_allocate_register(f, 7); + int facing = spe_allocate_register(f, 8); + + int depth_mask = spe_allocate_available_register(f); + + boolean depth_complement; + + + emit_alpha_test(dsa, f, mask, frag_a); + + depth_complement = emit_depth_test(dsa, f, depth_mask, depth, zvals); + + if (dsa->stencil[0].enabled) { + const int front_depth_pass = spe_allocate_available_register(f); + int front_stencil = emit_stencil_test(dsa, 0, f, mask, + depth_mask, depth_complement, + stencil, front_depth_pass); + + if (dsa->stencil[1].enabled) { + const int back_depth_pass = spe_allocate_available_register(f); + int back_stencil = emit_stencil_test(dsa, 1, f, mask, + depth_mask, depth_complement, + stencil, back_depth_pass); + + /* If the front facing stencil value and the back facing stencil + * value are stored in the same register, there is no need to select + * a value based on the facing. This can happen if the stencil value + * was not modified due to the write masks being zero, the stencil + * operations being KEEP, etc. + */ + if (front_stencil != back_stencil) { + spe_selb(f, stencil, back_stencil, front_stencil, facing); + } + + if (back_stencil != stencil) { + spe_release_register(f, back_stencil); + } + + if (front_stencil != stencil) { + spe_release_register(f, front_stencil); + } + + spe_selb(f, mask, back_depth_pass, front_depth_pass, facing); + + spe_release_register(f, back_depth_pass); + } else { + if (front_stencil != stencil) { + spe_or(f, stencil, front_stencil, front_stencil); + spe_release_register(f, front_stencil); + } + } + + spe_release_register(f, front_depth_pass); + } else if (dsa->depth.enabled) { + if (depth_complement) { + spe_andc(f, mask, mask, depth_mask); + } else { + spe_and(f, mask, mask, depth_mask); + } + } + + if (dsa->depth.writemask) { + spe_selb(f, depth, depth, zvals, mask); + } + + spe_bi(f, 0, 0, 0); +} + + +/** + * \note Emits a maximum of 3 instructions + */ +static int +emit_alpha_factor_calculation(struct spe_function *f, + unsigned factor, float const_alpha, + int src_alpha, int dst_alpha) +{ + union { + float f; + unsigned u; + } alpha; + int factor_reg; + int tmp; + + + alpha.f = const_alpha; + + switch (factor) { + case PIPE_BLENDFACTOR_ONE: + factor_reg = -1; + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA: + factor_reg = spe_allocate_available_register(f); + + spe_or(f, factor_reg, src_alpha, src_alpha); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: + factor_reg = dst_alpha; + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + factor_reg = -1; + break; + + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + const_alpha = 1.0 - const_alpha; + /* FALLTHROUGH */ + case PIPE_BLENDFACTOR_CONST_ALPHA: + factor_reg = spe_allocate_available_register(f); + + spe_il(f, factor_reg, alpha.u & 0x0ffff); + spe_ilh(f, factor_reg, alpha.u >> 16); + break; + + case PIPE_BLENDFACTOR_ZERO: + factor_reg = -1; + break; + + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + tmp = spe_allocate_available_register(f); + factor_reg = spe_allocate_available_register(f); + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, factor_reg, tmp, src_alpha); + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + tmp = spe_allocate_available_register(f); + factor_reg = spe_allocate_available_register(f); + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, factor_reg, tmp, dst_alpha); + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: + assert(0); + factor_reg = -1; + break; + } + + return factor_reg; +} + + +/** + * \note Emits a maximum of 5 instructions + */ +static void +emit_color_factor_calculation(struct spe_function *f, + unsigned sF, unsigned mask, + const struct pipe_blend_color *blend_color, + const int *src, + const int *dst, + int *factor) +{ + union { + float f[4]; + unsigned u[4]; + } color; + int tmp; + unsigned i; + + + color.f[0] = blend_color->color[0]; + color.f[1] = blend_color->color[1]; + color.f[2] = blend_color->color[2]; + color.f[3] = blend_color->color[3]; + + factor[0] = -1; + factor[1] = -1; + factor[2] = -1; + factor[3] = -1; + + switch (sF) { + case PIPE_BLENDFACTOR_ONE: + break; + + case PIPE_BLENDFACTOR_SRC_COLOR: + for (i = 0; i < 3; ++i) { + if ((mask & (1U << i)) != 0) { + factor[i] = spe_allocate_available_register(f); + spe_or(f, factor[i], src[i], src[i]); + } + } + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA: + factor[0] = spe_allocate_available_register(f); + factor[1] = factor[0]; + factor[2] = factor[0]; + + spe_or(f, factor[0], src[3], src[3]); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: + factor[0] = dst[3]; + factor[1] = dst[3]; + factor[2] = dst[3]; + break; + + case PIPE_BLENDFACTOR_DST_COLOR: + factor[0] = dst[0]; + factor[1] = dst[1]; + factor[2] = dst[2]; + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + tmp = spe_allocate_available_register(f); + factor[0] = spe_allocate_available_register(f); + factor[1] = factor[0]; + factor[2] = factor[0]; + + /* Alpha saturate means min(As, 1-Ad). + */ + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, tmp, tmp, dst[3]); + spe_fcgt(f, factor[0], tmp, src[3]); + spe_selb(f, factor[0], src[3], tmp, factor[0]); + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + color.f[0] = 1.0 - color.f[0]; + color.f[1] = 1.0 - color.f[1]; + color.f[2] = 1.0 - color.f[2]; + /* FALLTHROUGH */ + case PIPE_BLENDFACTOR_CONST_COLOR: + for (i = 0; i < 3; i++) { + factor[i] = spe_allocate_available_register(f); + + spe_il(f, factor[i], color.u[i] & 0x0ffff); + spe_ilh(f, factor[i], color.u[i] >> 16); + } + break; + + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + color.f[3] = 1.0 - color.f[3]; + /* FALLTHROUGH */ + case PIPE_BLENDFACTOR_CONST_ALPHA: + factor[0] = spe_allocate_available_register(f); + factor[1] = factor[0]; + factor[2] = factor[0]; + + spe_il(f, factor[0], color.u[3] & 0x0ffff); + spe_ilh(f, factor[0], color.u[3] >> 16); + break; + + case PIPE_BLENDFACTOR_ZERO: + break; + + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + tmp = spe_allocate_available_register(f); + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + + for (i = 0; i < 3; ++i) { + if ((mask & (1U << i)) != 0) { + factor[i] = spe_allocate_available_register(f); + spe_fs(f, factor[i], tmp, src[i]); + } + } + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + tmp = spe_allocate_available_register(f); + factor[0] = spe_allocate_available_register(f); + factor[1] = factor[0]; + factor[2] = factor[0]; + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, factor[0], tmp, src[3]); + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + tmp = spe_allocate_available_register(f); + factor[0] = spe_allocate_available_register(f); + factor[1] = factor[0]; + factor[2] = factor[0]; + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, factor[0], tmp, dst[3]); + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_INV_DST_COLOR: + tmp = spe_allocate_available_register(f); + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + + for (i = 0; i < 3; ++i) { + if ((mask & (1U << i)) != 0) { + factor[i] = spe_allocate_available_register(f); + spe_fs(f, factor[i], tmp, dst[i]); + } + } + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: + assert(0); + } +} + + +static void +emit_blend_calculation(struct spe_function *f, + unsigned func, unsigned sF, unsigned dF, + int src, int src_factor, int dst, int dst_factor) +{ + int tmp = spe_allocate_available_register(f); + + switch (func) { + case PIPE_BLEND_ADD: + if (sF == PIPE_BLENDFACTOR_ONE) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + /* Do nothing. */ + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_fa(f, src, src, dst); + } + } else if (sF == PIPE_BLENDFACTOR_ZERO) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_il(f, src, 0); + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_or(f, src, dst, dst); + } + } else { + spe_fm(f, tmp, dst, dst_factor); + spe_fma(f, src, src, src_factor, tmp); + } + break; + + case PIPE_BLEND_SUBTRACT: + if (sF == PIPE_BLENDFACTOR_ONE) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + /* Do nothing. */ + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_fs(f, src, src, dst); + } + } else if (sF == PIPE_BLENDFACTOR_ZERO) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_il(f, src, 0); + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_il(f, tmp, 0); + spe_fs(f, src, tmp, dst); + } + } else { + spe_fm(f, tmp, dst, dst_factor); + spe_fms(f, src, src, src_factor, tmp); + } + break; + + case PIPE_BLEND_REVERSE_SUBTRACT: + if (sF == PIPE_BLENDFACTOR_ONE) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_il(f, tmp, 0); + spe_fs(f, src, tmp, src); + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_fs(f, src, dst, src); + } + } else if (sF == PIPE_BLENDFACTOR_ZERO) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_il(f, src, 0); + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_or(f, src, dst, dst); + } + } else { + spe_fm(f, tmp, src, src_factor); + spe_fms(f, src, src, dst_factor, tmp); + } + break; + + case PIPE_BLEND_MIN: + spe_cgt(f, tmp, src, dst); + spe_selb(f, src, dst, src, tmp); + break; + + case PIPE_BLEND_MAX: + spe_cgt(f, tmp, src, dst); + spe_selb(f, src, src, dst, tmp); + break; + + default: + assert(0); + } + + spe_release_register(f, tmp); +} + + +/** + * Generate code to perform alpha blending on the SPE + */ +void +cell_generate_alpha_blend(struct cell_blend_state *cb, + const struct pipe_blend_color *blend_color) +{ + struct pipe_blend_state *const b = &cb->base; + struct spe_function *const f = &cb->code; + + /* This code generates a maximum of 3 (source alpha factor) + * + 3 (destination alpha factor) + (3 * 5) (source color factor) + * + (3 * 5) (destination color factor) + (4 * 2) (blend equation) + * + 4 (fragment mask) + 1 (return) = 49 instlructions. Round up to 64 to + * make it a happy power-of-two. + */ + spe_init_func(f, 4 * 64); + + + const int frag[4] = { + spe_allocate_register(f, 3), + spe_allocate_register(f, 4), + spe_allocate_register(f, 5), + spe_allocate_register(f, 6), + }; + const int pixel[4] = { + spe_allocate_register(f, 7), + spe_allocate_register(f, 8), + spe_allocate_register(f, 9), + spe_allocate_register(f, 10), + }; + const int mask = spe_allocate_register(f, 11); + unsigned func[4]; + unsigned sF[4]; + unsigned dF[4]; + unsigned i; + int src_factor[4]; + int dst_factor[4]; + + + /* Does the selected blend mode make use of the source / destination + * color (RGB) blend factors? + */ + boolean need_color_factor = b->blend_enable + && (b->rgb_func != PIPE_BLEND_MIN) + && (b->rgb_func != PIPE_BLEND_MAX); + + /* Does the selected blend mode make use of the source / destination + * alpha blend factors? + */ + boolean need_alpha_factor = b->blend_enable + && (b->alpha_func != PIPE_BLEND_MIN) + && (b->alpha_func != PIPE_BLEND_MAX); + + + sF[0] = b->rgb_src_factor; + sF[1] = sF[0]; + sF[2] = sF[0]; + sF[3] = (b->alpha_src_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) + ? PIPE_BLENDFACTOR_ONE : b->alpha_src_factor; + + dF[0] = b->rgb_dst_factor; + dF[1] = dF[0]; + dF[2] = dF[0]; + dF[3] = b->rgb_dst_factor; + + + /* If alpha writing is enabled and the alpha blend mode requires use of + * the alpha factor, calculate the alpha factor. + */ + if (((b->colormask & 8) != 0) && need_alpha_factor) { + src_factor[3] = emit_alpha_factor_calculation(f, sF[3], + blend_color->color[3], + frag[3], pixel[3]); + + /* If the alpha destination blend factor is the same as the alpha source + * blend factor, re-use the previously calculated value. + */ + dst_factor[3] = (dF[3] == sF[3]) + ? src_factor[3] + : emit_alpha_factor_calculation(f, dF[3], + blend_color->color[3], + frag[3], pixel[3]); + } + + + if (sF[0] == sF[3]) { + src_factor[0] = src_factor[3]; + src_factor[1] = src_factor[3]; + src_factor[2] = src_factor[3]; + } else if (sF[0] == dF[3]) { + src_factor[0] = dst_factor[3]; + src_factor[1] = dst_factor[3]; + src_factor[2] = dst_factor[3]; + } else if (need_color_factor) { + emit_color_factor_calculation(f, + b->rgb_src_factor, + b->colormask, + blend_color, + frag, pixel, src_factor); + } + + + if (dF[0] == sF[3]) { + dst_factor[0] = src_factor[3]; + dst_factor[1] = src_factor[3]; + dst_factor[2] = src_factor[3]; + } else if (dF[0] == dF[3]) { + dst_factor[0] = dst_factor[3]; + dst_factor[1] = dst_factor[3]; + dst_factor[2] = dst_factor[3]; + } else if (dF[0] == sF[0]) { + dst_factor[0] = src_factor[0]; + dst_factor[1] = src_factor[1]; + dst_factor[2] = src_factor[2]; + } else if (need_color_factor) { + emit_color_factor_calculation(f, + b->rgb_dst_factor, + b->colormask, + blend_color, + frag, pixel, dst_factor); + } + + + + func[0] = b->rgb_func; + func[1] = func[0]; + func[2] = func[0]; + func[3] = b->alpha_func; + + for (i = 0; i < 4; ++i) { + if ((b->colormask & (1U << i)) != 0) { + emit_blend_calculation(f, + func[i], sF[i], dF[i], + frag[i], src_factor[i], + pixel[i], dst_factor[i]); + spe_selb(f, frag[i], pixel[i], frag[i], mask); + } else { + spe_or(f, frag[i], pixel[i], pixel[i]); + } + } + + spe_bi(f, 0, 0, 0); +} diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h new file mode 100644 index 0000000000..541c3b3be0 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h @@ -0,0 +1,35 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef CELL_STATE_PER_FRAGMENT_H +#define CELL_STATE_PER_FRAGMENT_H + +extern void +cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa); + +extern void +cell_generate_alpha_blend(struct cell_blend_state *cb, + const struct pipe_blend_color *blend_color); + +#endif /* CELL_STATE_PER_FRAGMENT_H */ diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile index c071de1900..115ca8cd90 100644 --- a/src/gallium/drivers/cell/spu/Makefile +++ b/src/gallium/drivers/cell/spu/Makefile @@ -19,6 +19,7 @@ SOURCES = \ spu_main.c \ spu_blend.c \ spu_dcache.c \ + spu_per_fragment_op.c \ spu_render.c \ spu_texture.c \ spu_tile.c \ diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 59300028d4..8e46f6e471 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -58,6 +58,9 @@ struct spu_vs_context draw; static unsigned char attribute_fetch_code_buffer[136 * PIPE_ATTRIB_MAX] ALIGN16_ATTRIB; +static unsigned char depth_stencil_code_buffer[4 * 64] + ALIGN16_ATTRIB; + /** * Tell the PPU that this SPU has finished copying a buffer to * local store and that it may be reused by the PPU. @@ -248,14 +251,26 @@ cmd_state_blend(const struct pipe_blend_state *state) static void -cmd_state_depth_stencil(const struct pipe_depth_stencil_alpha_state *state) +cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *state) { if (Debug) printf("SPU %u: DEPTH_STENCIL: ztest %d\n", spu.init.id, - state->depth.enabled); + state->read_depth); + + ASSERT_ALIGN16(state->base); + + mfc_get(depth_stencil_code_buffer, + (unsigned int) state->base, /* src */ + ROUNDUP16(state->size), + TAG_BATCH_BUFFER, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << TAG_BATCH_BUFFER); - memcpy(&spu.depth_stencil, state, sizeof(*state)); + spu.frag_test = (frag_test_func) depth_stencil_code_buffer; + spu.read_depth = state->read_depth; + spu.read_stencil = state->read_stencil; } @@ -415,9 +430,9 @@ cmd_batch(uint opcode) pos += (1 + ROUNDUP8(sizeof(struct pipe_blend_state)) / 8); break; case CELL_CMD_STATE_DEPTH_STENCIL: - cmd_state_depth_stencil((struct pipe_depth_stencil_alpha_state *) + cmd_state_depth_stencil((struct cell_command_depth_stencil_alpha_test *) &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct pipe_depth_stencil_alpha_state)) / 8); + pos += (1 + ROUNDUP8(sizeof(struct cell_command_depth_stencil_alpha_test)) / 8); break; case CELL_CMD_STATE_SAMPLER: cmd_state_sampler((struct pipe_sampler_state *) &buffer[pos+1]); diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index a13edd1702..444e218645 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -56,6 +56,17 @@ typedef union { #define TILE_STATUS_GETTING 5 /**< mfc_get() called but not yet arrived */ +struct spu_frag_test_results { + qword mask; + qword depth; + qword stencil; +}; + +typedef struct spu_frag_test_results (*frag_test_func)(qword frag_mask, + qword pixel_depth, qword pixel_stencil, qword frag_depth, + qword frag_alpha, qword facing); + + struct spu_framebuffer { void *color_start; /**< addr of color surface in main memory */ void *depth_start; /**< addr of depth surface in main memory */ @@ -79,8 +90,9 @@ struct spu_global struct cell_init_info init; struct spu_framebuffer fb; - struct pipe_blend_state blend_stencil; - struct pipe_depth_stencil_alpha_state depth_stencil; + boolean read_depth; + boolean read_stencil; + frag_test_func frag_test; struct pipe_blend_state blend; struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS]; struct cell_command_texture texture; diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c new file mode 100644 index 0000000000..d42b522b41 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -0,0 +1,191 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file spu_per_fragment_op.c + * SPU implementation various per-fragment operations. + * + * \author Ian Romanick + */ + +#include "pipe/p_format.h" +#include "spu_main.h" +#include "spu_per_fragment_op.h" + +#define ZERO 0x80 + +static void +read_ds_quad(tile_t *buffer, unsigned x, unsigned y, + enum pipe_format depth_format, qword *depth, + qword *stencil) +{ + const int ix = x / 2; + const int iy = y / 2; + + switch (depth_format) { + case PIPE_FORMAT_Z16_UNORM: { + qword *ptr = (qword *) &buffer->us8[iy][ix / 2]; + + const qword shuf_vec = (qword) { + ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3, + ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7 + }; + + + /* At even X values we want the first 4 shorts, and at odd X values we + * want the second 4 shorts. + */ + qword bias = (qword) spu_splats((unsigned char) ((ix & 0x01) << 3)); + qword bias_mask = si_fsmbi(0x3333); + qword sv = si_a(shuf_vec, si_and(bias_mask, bias)); + + *depth = si_shufb(*ptr, *ptr, sv); + *stencil = si_il(0); + break; + } + + + case PIPE_FORMAT_Z32_UNORM: { + qword *ptr = (qword *) &buffer->ui4[iy][ix]; + + *depth = *ptr; + *stencil = si_il(0); + break; + } + + + case PIPE_FORMAT_Z24S8_UNORM: { + qword *ptr = (qword *) &buffer->ui4[iy][ix]; + qword mask = si_fsmbi(0x7777); + + *depth = si_and(*ptr, mask); + *stencil = si_rotmai(si_andc(*ptr, mask), -24); + break; + } + + + default: + assert(0); + break; + } +} + + +static void +write_ds_quad(tile_t *buffer, unsigned x, unsigned y, + enum pipe_format depth_format, + qword depth, qword stencil) +{ + const int ix = x / 2; + const int iy = y / 2; + + (void) stencil; + + switch (depth_format) { + case PIPE_FORMAT_Z16_UNORM: { + qword *ptr = (qword *) &buffer->us8[iy][ix / 2]; + + qword sv = ((ix & 0x01) == 0) + ? (qword) { 2, 3, 6, 7, 10, 11, 14, 15, + 24, 25, 26, 27, 28, 29, 30, 31 } + : (qword) { 16, 17, 18, 19, 20 , 21, 22, 23, + 2, 3, 6, 7, 10, 11, 14, 15 }; + *ptr = si_shufb(depth, *ptr, sv); + break; + } + + + case PIPE_FORMAT_Z32_UNORM: { + qword *ptr = (qword *) &buffer->ui4[iy][ix]; + *ptr = depth; + break; + } + + + case PIPE_FORMAT_Z24S8_UNORM: { + qword *ptr = (qword *) &buffer->ui4[iy][ix]; + qword mask = si_fsmbi(0x7777); + + stencil = si_rotmai(stencil, 24); + *ptr = si_selb(stencil, depth, mask); + break; + } + + + default: + assert(0); + break; + } +} + + +qword +spu_do_depth_stencil(int x, int y, + qword frag_mask, qword frag_depth, qword frag_alpha, + qword facing) +{ + struct spu_frag_test_results result; + qword pixel_depth; + qword pixel_stencil; + + /* All of this preable code (everthing before the call to frag_test) should + * be generated on the PPU and upload to the SPU. + */ + if (spu.read_depth || spu.read_stencil) { + read_ds_quad(&spu.ztile, x, y, spu.fb.depth_format, + &pixel_depth, &pixel_stencil); + } + + switch (spu.fb.depth_format) { + case PIPE_FORMAT_Z16_UNORM: + frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x0000ffffu))); + frag_depth = si_cfltu(frag_depth, 0); + break; + case PIPE_FORMAT_Z32_UNORM: + frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0xffffffffu))); + frag_depth = si_cfltu(frag_depth, 0); + break; + case PIPE_FORMAT_Z24S8_UNORM: + frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x00ffffffu))); + frag_depth = si_cfltu(frag_depth, 0); + break; + default: + assert(0); + break; + } + + result = (*spu.frag_test)(frag_mask, pixel_depth, pixel_stencil, + frag_depth, frag_alpha, facing); + + + /* This code (everthing after the call to frag_test) should + * be generated on the PPU and upload to the SPU. + */ + if (spu.read_depth || spu.read_stencil) { + write_ds_quad(&spu.ztile, x, y, spu.fb.depth_format, + result.depth, result.stencil); + } + + return result.mask; +} diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h new file mode 100644 index 0000000000..6571258699 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h @@ -0,0 +1,32 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef SPU_PER_FRAGMENT_OP +#define SPU_PER_FRAGMENT_OP + +extern qword +spu_do_depth_stencil(int x, int y, qword frag_mask, qword frag_depth, + qword frag_alpha, qword facing); + +#endif /* SPU_PER_FRAGMENT_OP */ diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c index 20e77aa2e6..6df59abd36 100644 --- a/src/gallium/drivers/cell/spu/spu_render.c +++ b/src/gallium/drivers/cell/spu/spu_render.c @@ -98,7 +98,7 @@ my_tile(uint tx, uint ty) static INLINE void get_cz_tiles(uint tx, uint ty) { - if (spu.depth_stencil.depth.enabled) { + if (spu.read_depth) { if (spu.cur_ztile_status != TILE_STATUS_CLEAR) { //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty); get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1); @@ -153,7 +153,7 @@ static INLINE void wait_put_cz_tiles(void) { wait_on_mask(1 << TAG_WRITE_TILE_COLOR); - if (spu.depth_stencil.depth.enabled) { + if (spu.read_depth) { wait_on_mask(1 << TAG_WRITE_TILE_Z); } } diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index be9624cf7d..81823f2463 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -38,8 +38,7 @@ #include "spu_texture.h" #include "spu_tile.h" #include "spu_tri.h" - -#include "spu_ztest.h" +#include "spu_per_fragment_op.h" /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */ @@ -264,16 +263,12 @@ do_depth_test(int x, int y, mask_t quadmask) zvals.v = eval_z((float) x, (float) y); - if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) { - int ix = (x - setup.cliprect_minx) / 4; - int iy = (y - setup.cliprect_miny) / 2; - mask = spu_z16_test_less(zvals.v, &spu.ztile.us8[iy][ix], x>>1, quadmask); - } - else { - int ix = (x - setup.cliprect_minx) / 2; - int iy = (y - setup.cliprect_miny) / 2; - mask = spu_z32_test_less(zvals.v, &spu.ztile.ui4[iy][ix], quadmask); - } + mask = (mask_t) spu_do_depth_stencil(x - setup.cliprect_minx, + y - setup.cliprect_miny, + (qword) quadmask, + (qword) zvals.v, + (qword) spu_splats((unsigned char) 0x0ffu), + (qword) spu_splats((unsigned int) 0x01u)); if (spu_extract(spu_orx(mask), 0)) spu.cur_ztile_status = TILE_STATUS_DIRTY; @@ -299,7 +294,7 @@ emit_quad( int x, int y, mask_t mask ) sp->quad.first->run(sp->quad.first, &setup.quad); #else - if (spu.depth_stencil.depth.enabled) { + if (spu.read_depth) { mask = do_depth_test(x, y, mask); } @@ -434,7 +429,7 @@ static void flush_spans( void ) } ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED); - if (spu.depth_stencil.depth.enabled) { + if (spu.read_depth) { if (spu.cur_ztile_status == TILE_STATUS_GETTING) { /* wait for mfc_get() to complete */ //printf("SPU: %u: waiting for ztile\n", spu.init.id); diff --git a/src/gallium/drivers/cell/spu/spu_ztest.h b/src/gallium/drivers/cell/spu/spu_ztest.h deleted file mode 100644 index ce8ad00339..0000000000 --- a/src/gallium/drivers/cell/spu/spu_ztest.h +++ /dev/null @@ -1,135 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -/** - * Zbuffer/depth test code. - */ - - -#ifndef SPU_ZTEST_H -#define SPU_ZTEST_H - - -#ifdef __SPU__ -#include -#endif - - - -/** - * Perform Z testing for a 16-bit/value Z buffer. - * - * \param zvals vector of four fragment zvalues as floats - * \param zbuf ptr to vector of ushort[8] zbuffer values. Note that this - * contains the Z values for 2 quads, 8 pixels. - * \param x x coordinate of quad (only lsbit is significant) - * \param inMask indicates which fragments in the quad are alive - * \return new mask indicating which fragments are alive after ztest - */ -static INLINE vector unsigned int -spu_z16_test_less(vector float zvals, vector unsigned short *zbuf, - uint x, vector unsigned int inMask) -{ -#define ZERO 0x80 - vector unsigned int zvals_ui4, zbuf_ui4, mask; - - /* convert floats to uints in [0, 65535] */ - zvals_ui4 = spu_convtu(zvals, 32); /* convert to [0, 2^32] */ - zvals_ui4 = spu_rlmask(zvals_ui4, -16); /* right shift 16 */ - - /* XXX this conditional could be removed with a bit of work */ - if (x & 1) { - /* convert zbuffer values from ushorts to uints */ - /* gather lower four ushorts */ - zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf, - (vector unsigned int) *zbuf, - ((vector unsigned char) { - ZERO, ZERO, 8, 9, ZERO, ZERO, 10, 11, - ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15})); - /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */ - mask = spu_cmpgt(zbuf_ui4, zvals_ui4); - /* mask &= inMask */ - mask = spu_and(mask, inMask); - /* zbuf = mask ? zval : zbuf */ - zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask); - /* convert zbuffer values from uints back to ushorts, preserve lower 4 */ - *zbuf = (vector unsigned short) - spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf, - ((vector unsigned char) { - 16, 17, 18, 19, 20, 21, 22, 23, - 2, 3, 6, 7, 10, 11, 14, 15})); - } - else { - /* convert zbuffer values from ushorts to uints */ - /* gather upper four ushorts */ - zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf, - (vector unsigned int) *zbuf, - ((vector unsigned char) { - ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3, - ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7})); - /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */ - mask = spu_cmpgt(zbuf_ui4, zvals_ui4); - /* mask &= inMask */ - mask = spu_and(mask, inMask); - /* zbuf = mask ? zval : zbuf */ - zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask); - /* convert zbuffer values from uints back to ushorts, preserve upper 4 */ - *zbuf = (vector unsigned short) - spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf, - ((vector unsigned char) { - 2, 3, 6, 7, 10, 11, 14, 15, - 24, 25, 26, 27, 28, 29, 30, 31})); - } - return mask; -#undef ZERO -} - - -/** - * As above, but Zbuffer values as 32-bit uints - */ -static INLINE vector unsigned int -spu_z32_test_less(vector float zvals, vector unsigned int *zbuf_ptr, - vector unsigned int inMask) -{ - vector unsigned int zvals_ui4, mask, zbuf = *zbuf_ptr; - - /* convert floats to uints in [0, 0xffffffff] */ - zvals_ui4 = spu_convtu(zvals, 32); - /* mask = (zbuf < zvals_ui4) ? ~0 : 0 */ - mask = spu_cmpgt(zbuf, zvals_ui4); - /* mask &= inMask */ - mask = spu_and(mask, inMask); - /* zbuf = mask ? zval : zbuf */ - *zbuf_ptr = spu_sel(zbuf, zvals_ui4, mask); - - return mask; -} - - -#endif /* SPU_ZTEST_H */ -- cgit v1.2.3 From 2b21bde3b1fa6fe357a3a5adc6249e89d6915524 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Wed, 19 Mar 2008 17:29:39 -0700 Subject: cell: Use code-gen for alpha blend So far this is only tested when GL_BLEND is disabled. --- src/gallium/drivers/cell/common.h | 10 +++++ src/gallium/drivers/cell/ppu/cell_pipe_state.c | 11 +++-- src/gallium/drivers/cell/ppu/cell_state_emit.c | 17 +++++-- src/gallium/drivers/cell/spu/Makefile | 1 - src/gallium/drivers/cell/spu/spu_blend.c | 62 -------------------------- src/gallium/drivers/cell/spu/spu_blend.h | 37 --------------- src/gallium/drivers/cell/spu/spu_main.c | 50 ++++++++++++++++++--- src/gallium/drivers/cell/spu/spu_main.h | 17 ++++++- src/gallium/drivers/cell/spu/spu_tri.c | 56 +++++++++++++++-------- 9 files changed, 129 insertions(+), 132 deletions(-) delete mode 100644 src/gallium/drivers/cell/spu/spu_blend.c delete mode 100644 src/gallium/drivers/cell/spu/spu_blend.h (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index fe93fd8e1a..d59e4f7036 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -114,6 +114,16 @@ struct cell_command_depth_stencil_alpha_test { }; +/** + * Upload code to perform framebuffer blend operation + */ +struct cell_command_blend { + uint64_t base; /**< Effective address of code start. */ + unsigned size; /**< Size in bytes of test code. */ + unsigned read_fb; /**< Flag: should framebuffer be read? */ +}; + + /** * Tell SPUs about the framebuffer size, location */ diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c index c880760e4b..4ca8c153b0 100644 --- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c +++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c @@ -54,14 +54,19 @@ cell_create_blend_state(struct pipe_context *pipe, static void -cell_bind_blend_state(struct pipe_context *pipe, void *blend) +cell_bind_blend_state(struct pipe_context *pipe, void *state) { struct cell_context *cell = cell_context(pipe); + struct cell_blend_state *blend = (struct cell_blend_state *) state; + draw_flush(cell->draw); - cell->blend = (const struct cell_blend_state *)blend; + if ((blend != NULL) && (blend->code.store == NULL)) { + cell_generate_depth_stencil_test(blend); + } + cell->blend = blend; cell->dirty |= CELL_NEW_BLEND; } @@ -70,7 +75,7 @@ static void cell_delete_blend_state(struct pipe_context *pipe, void *blend) { struct cell_blend_state *cb = (struct cell_blend_state *) blend; - + spe_release_func(& cb->code); FREE(cb); } diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 31cc938f07..5709b48f12 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -65,9 +65,20 @@ cell_emit_state(struct cell_context *cell) } if (cell->dirty & CELL_NEW_BLEND) { - emit_state_cmd(cell, CELL_CMD_STATE_BLEND, - cell->blend, - sizeof(struct pipe_blend_state)); + struct cell_command_blend blend; + + if (cell->blend != NULL) { + blend.base = (intptr_t) cell->blend->code.store; + blend.size = (char *) cell->blend->code.csr + - (char *) cell->blend->code.store; + blend.read_fb = TRUE; + } else { + blend.base = 0; + blend.size = 0; + blend.read_fb = FALSE; + } + + emit_state_cmd(cell, CELL_CMD_STATE_BLEND, &blend, sizeof(blend)); } if (cell->dirty & CELL_NEW_DEPTH_STENCIL) { diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile index 115ca8cd90..8e83610790 100644 --- a/src/gallium/drivers/cell/spu/Makefile +++ b/src/gallium/drivers/cell/spu/Makefile @@ -17,7 +17,6 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o SOURCES = \ spu_main.c \ - spu_blend.c \ spu_dcache.c \ spu_per_fragment_op.c \ spu_render.c \ diff --git a/src/gallium/drivers/cell/spu/spu_blend.c b/src/gallium/drivers/cell/spu/spu_blend.c deleted file mode 100644 index 23ec0eeb45..0000000000 --- a/src/gallium/drivers/cell/spu/spu_blend.c +++ /dev/null @@ -1,62 +0,0 @@ -/************************************************************************** - * - * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -#include "spu_main.h" -#include "spu_blend.h" -#include "spu_colorpack.h" - - -void -blend_quad(uint itx, uint ity, vector float colors[4]) -{ - /* simple SRC_ALPHA, ONE_MINUS_SRC_ALPHA blending */ - vector float fbc00 = spu_unpack_color(spu.ctile.ui[ity][itx]); - vector float fbc01 = spu_unpack_color(spu.ctile.ui[ity][itx+1]); - vector float fbc10 = spu_unpack_color(spu.ctile.ui[ity+1][itx]); - vector float fbc11 = spu_unpack_color(spu.ctile.ui[ity+1][itx+1]); - - vector float alpha00 = spu_splats(spu_extract(colors[0], 3)); - vector float alpha01 = spu_splats(spu_extract(colors[1], 3)); - vector float alpha10 = spu_splats(spu_extract(colors[2], 3)); - vector float alpha11 = spu_splats(spu_extract(colors[3], 3)); - - vector float one_minus_alpha00 = spu_sub(spu_splats(1.0f), alpha00); - vector float one_minus_alpha01 = spu_sub(spu_splats(1.0f), alpha01); - vector float one_minus_alpha10 = spu_sub(spu_splats(1.0f), alpha10); - vector float one_minus_alpha11 = spu_sub(spu_splats(1.0f), alpha11); - - colors[0] = spu_add(spu_mul(colors[0], alpha00), - spu_mul(fbc00, one_minus_alpha00)); - colors[1] = spu_add(spu_mul(colors[1], alpha01), - spu_mul(fbc01, one_minus_alpha01)); - colors[2] = spu_add(spu_mul(colors[2], alpha10), - spu_mul(fbc10, one_minus_alpha10)); - colors[3] = spu_add(spu_mul(colors[3], alpha11), - spu_mul(fbc11, one_minus_alpha11)); -} - diff --git a/src/gallium/drivers/cell/spu/spu_blend.h b/src/gallium/drivers/cell/spu/spu_blend.h deleted file mode 100644 index 2b594b578b..0000000000 --- a/src/gallium/drivers/cell/spu/spu_blend.h +++ /dev/null @@ -1,37 +0,0 @@ -/************************************************************************** - * - * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -#ifndef SPU_BLEND_H -#define SPU_BLEND_H - - -extern void -blend_quad(uint itx, uint ity, vector float colors[4]); - - -#endif /* SPU_BLEND_H */ diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 937962285d..41bebf5362 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -61,6 +61,25 @@ static unsigned char attribute_fetch_code_buffer[136 * PIPE_ATTRIB_MAX] static unsigned char depth_stencil_code_buffer[4 * 64] ALIGN16_ATTRIB; +static unsigned char fb_blend_code_buffer[4 * 64] + ALIGN16_ATTRIB; + +static struct spu_blend_results +default_blend(qword frag_r, qword frag_g, qword frag_b, qword frag_a, + qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a, + qword frag_mask) +{ + struct spu_blend_results result; + + result.r = si_selb(pixel_r, frag_r, frag_mask); + result.g = si_selb(pixel_g, frag_g, frag_mask); + result.b = si_selb(pixel_b, frag_b, frag_mask); + result.a = si_selb(pixel_a, frag_a, frag_mask); + + return result; +} + + /** * Tell the PPU that this SPU has finished copying a buffer to * local store and that it may be reused by the PPU. @@ -246,14 +265,31 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) static void -cmd_state_blend(const struct pipe_blend_state *state) +cmd_state_blend(const struct cell_command_blend *state) { if (Debug) printf("SPU %u: BLEND: enabled %d\n", spu.init.id, - state->blend_enable); + (state->size != 0)); + + ASSERT_ALIGN16(state->base); - memcpy(&spu.blend, state, sizeof(*state)); + if (state->size != 0) { + mfc_get(fb_blend_code_buffer, + (unsigned int) state->base, /* src */ + ROUNDUP16(state->size), + TAG_BATCH_BUFFER, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << TAG_BATCH_BUFFER); + spu.blend = (blend_func) fb_blend_code_buffer; + spu.read_fb = state->read_fb; + } else { + /* If there is no code, use the default; + */ + spu.blend = default_blend; + spu.read_fb = FALSE; + } } @@ -441,9 +477,8 @@ cmd_batch(uint opcode) pos += 1; break; case CELL_CMD_STATE_BLEND: - cmd_state_blend((struct pipe_blend_state *) - &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct pipe_blend_state)) / 8); + cmd_state_blend((struct cell_command_blend *) &buffer[pos+1]); + pos += (1 + ROUNDUP8(sizeof(struct cell_command_blend)) / 8); break; case CELL_CMD_STATE_DEPTH_STENCIL: cmd_state_depth_stencil((struct cell_command_depth_stencil_alpha_test *) @@ -587,6 +622,9 @@ one_time_init(void) memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status)); memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status)); invalidate_tex_cache(); + + spu.blend = default_blend; + spu.read_fb = FALSE; } diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 444e218645..56d0968676 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -67,6 +67,18 @@ typedef struct spu_frag_test_results (*frag_test_func)(qword frag_mask, qword frag_alpha, qword facing); +struct spu_blend_results { + qword r; + qword g; + qword b; + qword a; +}; + +typedef struct spu_blend_results (*blend_func)( + qword frag_r, qword frag_g, qword frag_b, qword frag_a, + qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a, + qword frag_mask); + struct spu_framebuffer { void *color_start; /**< addr of color surface in main memory */ void *depth_start; /**< addr of depth surface in main memory */ @@ -93,7 +105,10 @@ struct spu_global boolean read_depth; boolean read_stencil; frag_test_func frag_test; - struct pipe_blend_state blend; + + boolean read_fb; + blend_func blend; + struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS]; struct cell_command_texture texture; diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index 81823f2463..c9f8cadcda 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -29,10 +29,10 @@ * Triangle rendering within a tile. */ +#include #include "pipe/p_compiler.h" #include "pipe/p_format.h" #include "pipe/p_util.h" -#include "spu_blend.h" #include "spu_colorpack.h" #include "spu_main.h" #include "spu_texture.h" @@ -326,27 +326,45 @@ emit_quad( int x, int y, mask_t mask ) eval_coeff(1, (float) x, (float) y, colors); } -#if 1 - if (spu.blend.blend_enable) - blend_quad(ix % TILE_SIZE, iy % TILE_SIZE, colors); -#endif - if (spu_extract(mask, 0)) - spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle); - if (spu_extract(mask, 1)) - spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1], shuffle); - if (spu_extract(mask, 2)) - spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2], shuffle); - if (spu_extract(mask, 3)) - spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3], shuffle); + /* Read the current framebuffer values. + * + * Ignore read_fb for now. In the future we can use this to avoid + * reading the framebuffer if read_fb is false and the fragment mask is + * all 0xffffffff. This is the common case, so it is probably worth + * the effort. We'll have to profile to determine whether or not the + * extra conditional branches hurt overall performance. + */ + vec_float4 aos_pix[4] = { + spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]), + spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]), + spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]), + spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]), + }; -#if 0 - /* SIMD_Z with swizzled color buffer (someday) */ - vector unsigned int uicolors = *((vector unsigned int *) &colors); - spu.ctile.ui4[iy/2][ix/2] = spu_sel(spu.ctile.ui4[iy/2][ix/2], uicolors, mask); -#endif - } + qword soa_pix[4]; + qword soa_frag[4]; + /* Convert pixel and fragment data from AoS to SoA format. + */ + _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix); + _transpose_matrix4x4((vec_float4 *) soa_frag, colors); + + const struct spu_blend_results result = + (*spu.blend)(soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3], + soa_pix[0], soa_pix[1], soa_pix[2], soa_pix[3], + (qword) mask); + + + /* Convert final pixel data from SoA to AoS format. + */ + _transpose_matrix4x4(aos_pix, (const vec_float4 *) &result); + + spu.ctile.ui[iy+0][ix+0] = spu_pack_color_shuffle(aos_pix[0], shuffle); + spu.ctile.ui[iy+0][ix+1] = spu_pack_color_shuffle(aos_pix[1], shuffle); + spu.ctile.ui[iy+1][ix+0] = spu_pack_color_shuffle(aos_pix[2], shuffle); + spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(aos_pix[3], shuffle); + } #endif } -- cgit v1.2.3 From 92126cea846959bb2152905a7712753d1114bd6b Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Wed, 26 Mar 2008 10:45:32 -0700 Subject: cell: Implement code-gen for logic op This also implements code-gen for the float-to-packed color conversion. It's currently hardcoded for A8R8G8B8, but that can easily be fixed as soon as other color depths are supported by the Cell driver. --- src/gallium/drivers/cell/common.h | 11 +- src/gallium/drivers/cell/ppu/cell_context.h | 2 + src/gallium/drivers/cell/ppu/cell_state_emit.c | 17 ++ .../drivers/cell/ppu/cell_state_per_fragment.c | 261 ++++++++++++++++++++- .../drivers/cell/ppu/cell_state_per_fragment.h | 4 + src/gallium/drivers/cell/spu/spu_main.c | 19 ++ src/gallium/drivers/cell/spu/spu_main.h | 9 +- src/gallium/drivers/cell/spu/spu_tri.c | 59 +++-- 8 files changed, 349 insertions(+), 33 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index d59e4f7036..b0928fefd2 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -92,8 +92,9 @@ #define CELL_CMD_STATE_BIND_VS 18 #define CELL_CMD_STATE_BLEND 19 #define CELL_CMD_STATE_ATTRIB_FETCH 20 -#define CELL_CMD_VS_EXECUTE 21 -#define CELL_CMD_FLUSH_BUFFER_RANGE 22 +#define CELL_CMD_STATE_LOGICOP 21 +#define CELL_CMD_VS_EXECUTE 22 +#define CELL_CMD_FLUSH_BUFFER_RANGE 23 #define CELL_NUM_BUFFERS 4 @@ -124,6 +125,12 @@ struct cell_command_blend { }; +struct cell_command_logicop { + uint64_t base; /**< Effective address of code start. */ + unsigned size; /**< Size in bytes of test code. */ +}; + + /** * Tell SPUs about the framebuffer size, location */ diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h index 9e79db0ace..0442abddc1 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.h +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -92,6 +92,8 @@ struct cell_context const struct cell_vertex_shader_state *vs; const struct cell_fragment_shader_state *fs; + struct spe_function logic_op; + struct pipe_blend_color blend_color; struct pipe_clip_state clip; struct pipe_constant_buffer constants[2]; diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 5709b48f12..5c1310d0b0 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -50,6 +50,23 @@ emit_state_cmd(struct cell_context *cell, uint cmd, void cell_emit_state(struct cell_context *cell) { + if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_BLEND)) { + struct cell_command_logicop logicop; + + if (cell->logic_op.store != NULL) { + spe_release_func(& cell->logic_op); + } + + cell_generate_logic_op(& cell->logic_op, + & cell->blend->base, + cell->framebuffer.cbufs[0]); + + logicop.base = (intptr_t) cell->logic_op.store; + logicop.size = 64 * 4; + emit_state_cmd(cell, CELL_CMD_STATE_LOGICOP, &logicop, + sizeof(logicop)); + } + if (cell->dirty & CELL_NEW_FRAMEBUFFER) { struct pipe_surface *cbuf = cell->framebuffer.cbufs[0]; struct pipe_surface *zbuf = cell->framebuffer.zsbuf; diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c index c750b1d89d..f10025bd7c 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c @@ -977,7 +977,6 @@ cell_generate_alpha_blend(struct cell_blend_state *cb) spe_allocate_register(f, 13), spe_allocate_register(f, 14), }; - const int mask = spe_allocate_register(f, 15); unsigned func[4]; unsigned sF[4]; unsigned dF[4]; @@ -1114,9 +1113,6 @@ cell_generate_alpha_blend(struct cell_blend_state *cb) func[i], sF[i], dF[i], frag[i], src_factor[i], pixel[i], dst_factor[i]); - spe_selb(f, frag[i], pixel[i], frag[i], mask); - } else { - spe_or(f, frag[i], pixel[i], pixel[i]); } } @@ -1146,3 +1142,260 @@ cell_generate_alpha_blend(struct cell_blend_state *cb) } #endif } + + +int PC_OFFSET(const struct spe_function *f, const void *d) +{ + const intptr_t pc = (intptr_t) f->csr; + const intptr_t ea = ~0x0f & (intptr_t) d; + + return (ea - pc) >> 2; +} + + +/** + * Generate code to perform color conversion and logic op + * + * \bug + * The code generated by this function should also perform dithering. + * + * \bug + * The code generated by this function should also perform color-write + * masking. + * + * \bug + * This routine is hard-coded to only work with ARGB8 data. + */ +void +cell_generate_logic_op(struct spe_function *f, struct pipe_blend_state *blend, + struct pipe_surface *surf) +{ + const unsigned logic_op = (blend->logicop_enable) + ? blend->logicop_func : PIPE_LOGICOP_COPY; + + /* This code generates a maximum of 37 instructions. An additional 32 + * bytes (equiv. to 8 instructions) are needed for data storage. Round up + * to 64 to make it a happy power-of-two. + */ + spe_init_func(f, 4 * 64); + + + /* Pixel colors in framebuffer format in AoS layout. + */ + const int pixel[4] = { + spe_allocate_register(f, 3), + spe_allocate_register(f, 4), + spe_allocate_register(f, 5), + spe_allocate_register(f, 6), + }; + + /* Fragment colors stored as floats in SoA layout. + */ + const int frag[4] = { + spe_allocate_register(f, 7), + spe_allocate_register(f, 8), + spe_allocate_register(f, 9), + spe_allocate_register(f, 10), + }; + + const int mask = spe_allocate_register(f, 11); + + + /* Short-circuit the noop and invert cases. + */ + if ((logic_op == PIPE_LOGICOP_NOOP) || (blend->colormask == 0)) { + spe_bi(f, 0, 0, 0); + return; + } else if (logic_op == PIPE_LOGICOP_INVERT) { + spe_nor(f, pixel[0], pixel[0], pixel[0]); + spe_nor(f, pixel[1], pixel[1], pixel[1]); + spe_nor(f, pixel[2], pixel[2], pixel[2]); + spe_nor(f, pixel[3], pixel[3], pixel[3]); + spe_bi(f, 0, 0, 0); + return; + } + + + const int tmp[4] = { + spe_allocate_available_register(f), + spe_allocate_available_register(f), + spe_allocate_available_register(f), + spe_allocate_available_register(f), + }; + + const int shuf_xpose_hi = spe_allocate_available_register(f); + const int shuf_xpose_lo = spe_allocate_available_register(f); + const int shuf_color = spe_allocate_available_register(f); + + + /* Pointer to the begining of the function's private data area. + */ + uint32_t *const data = ((uint32_t *) f->store) + (64 - 8); + + + /* Convert fragment colors to framebuffer format in AoS layout. + */ + data[0] = 0x00010203; + data[1] = 0x10111213; + data[2] = 0x04050607; + data[3] = 0x14151617; + + data[4] = 0x0c000408; + data[5] = 0x80808080; + data[6] = 0x80808080; + data[7] = 0x80808080; + + spe_ilh(f, tmp[0], 0x0808); + spe_lqr(f, shuf_xpose_hi, PC_OFFSET(f, data+0)); + spe_lqr(f, shuf_color, PC_OFFSET(f, data+4)); + spe_a(f, shuf_xpose_lo, shuf_xpose_hi, tmp[0]); + + spe_shufb(f, tmp[0], frag[0], frag[2], shuf_xpose_hi); + spe_shufb(f, tmp[1], frag[0], frag[2], shuf_xpose_lo); + spe_shufb(f, tmp[2], frag[1], frag[3], shuf_xpose_hi); + spe_shufb(f, tmp[3], frag[1], frag[3], shuf_xpose_lo); + + spe_shufb(f, frag[0], tmp[0], tmp[2], shuf_xpose_hi); + spe_shufb(f, frag[1], tmp[0], tmp[2], shuf_xpose_lo); + spe_shufb(f, frag[2], tmp[1], tmp[3], shuf_xpose_hi); + spe_shufb(f, frag[3], tmp[1], tmp[3], shuf_xpose_lo); + + spe_cfltu(f, frag[0], frag[0], 32); + spe_cfltu(f, frag[1], frag[1], 32); + spe_cfltu(f, frag[2], frag[2], 32); + spe_cfltu(f, frag[3], frag[3], 32); + + spe_shufb(f, frag[0], frag[0], pixel[0], shuf_color); + spe_shufb(f, frag[1], frag[1], pixel[1], shuf_color); + spe_shufb(f, frag[2], frag[2], pixel[2], shuf_color); + spe_shufb(f, frag[3], frag[3], pixel[3], shuf_color); + + + /* If logic op is enabled, perform the requested logical operation on the + * converted fragment colors and the pixel colors. + */ + switch (logic_op) { + case PIPE_LOGICOP_CLEAR: + spe_il(f, frag[0], 0); + spe_il(f, frag[1], 0); + spe_il(f, frag[2], 0); + spe_il(f, frag[3], 0); + break; + case PIPE_LOGICOP_NOR: + spe_nor(f, frag[0], frag[0], pixel[0]); + spe_nor(f, frag[1], frag[1], pixel[1]); + spe_nor(f, frag[2], frag[2], pixel[2]); + spe_nor(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_AND_INVERTED: + spe_andc(f, frag[0], pixel[0], frag[0]); + spe_andc(f, frag[1], pixel[1], frag[1]); + spe_andc(f, frag[2], pixel[2], frag[2]); + spe_andc(f, frag[3], pixel[3], frag[3]); + break; + case PIPE_LOGICOP_COPY_INVERTED: + spe_nor(f, frag[0], frag[0], frag[0]); + spe_nor(f, frag[1], frag[1], frag[1]); + spe_nor(f, frag[2], frag[2], frag[2]); + spe_nor(f, frag[3], frag[3], frag[3]); + break; + case PIPE_LOGICOP_AND_REVERSE: + spe_andc(f, frag[0], frag[0], pixel[0]); + spe_andc(f, frag[1], frag[1], pixel[1]); + spe_andc(f, frag[2], frag[2], pixel[2]); + spe_andc(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_XOR: + spe_xor(f, frag[0], frag[0], pixel[0]); + spe_xor(f, frag[1], frag[1], pixel[1]); + spe_xor(f, frag[2], frag[2], pixel[2]); + spe_xor(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_NAND: + spe_nand(f, frag[0], frag[0], pixel[0]); + spe_nand(f, frag[1], frag[1], pixel[1]); + spe_nand(f, frag[2], frag[2], pixel[2]); + spe_nand(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_AND: + spe_and(f, frag[0], frag[0], pixel[0]); + spe_and(f, frag[1], frag[1], pixel[1]); + spe_and(f, frag[2], frag[2], pixel[2]); + spe_and(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_EQUIV: + spe_eqv(f, frag[0], frag[0], pixel[0]); + spe_eqv(f, frag[1], frag[1], pixel[1]); + spe_eqv(f, frag[2], frag[2], pixel[2]); + spe_eqv(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_OR_INVERTED: + spe_orc(f, frag[0], pixel[0], frag[0]); + spe_orc(f, frag[1], pixel[1], frag[1]); + spe_orc(f, frag[2], pixel[2], frag[2]); + spe_orc(f, frag[3], pixel[3], frag[3]); + break; + case PIPE_LOGICOP_COPY: + break; + case PIPE_LOGICOP_OR_REVERSE: + spe_orc(f, frag[0], frag[0], pixel[0]); + spe_orc(f, frag[1], frag[1], pixel[1]); + spe_orc(f, frag[2], frag[2], pixel[2]); + spe_orc(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_OR: + spe_or(f, frag[0], frag[0], pixel[0]); + spe_or(f, frag[1], frag[1], pixel[1]); + spe_or(f, frag[2], frag[2], pixel[2]); + spe_or(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_SET: + spe_il(f, frag[0], ~0); + spe_il(f, frag[1], ~0); + spe_il(f, frag[2], ~0); + spe_il(f, frag[3], ~0); + break; + + /* These two cases are short-circuited above. + */ + case PIPE_LOGICOP_INVERT: + case PIPE_LOGICOP_NOOP: + default: + assert(0); + } + + + /* Apply fragment mask. + */ + spe_ilh(f, tmp[0], 0x0000); + spe_ilh(f, tmp[1], 0x0404); + spe_ilh(f, tmp[2], 0x0808); + spe_ilh(f, tmp[3], 0x0c0c); + + spe_shufb(f, tmp[0], mask, mask, tmp[0]); + spe_shufb(f, tmp[1], mask, mask, tmp[1]); + spe_shufb(f, tmp[2], mask, mask, tmp[2]); + spe_shufb(f, tmp[3], mask, mask, tmp[3]); + + spe_selb(f, pixel[0], pixel[0], frag[0], tmp[0]); + spe_selb(f, pixel[1], pixel[1], frag[1], tmp[1]); + spe_selb(f, pixel[2], pixel[2], frag[2], tmp[2]); + spe_selb(f, pixel[3], pixel[3], frag[3], tmp[3]); + + spe_bi(f, 0, 0, 0); + +#if 0 + { + const uint32_t *p = f->store; + unsigned i; + + printf("# %u instructions\n", f->csr - f->store); + + printf("\t.text\n"); + for (i = 0; i < 64; i++) { + printf("\t.long\t0x%04x\n", p[i]); + } + fflush(stdout); + } +#endif +} diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h index f699247f9e..ab4de96c69 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h +++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h @@ -31,4 +31,8 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa); extern void cell_generate_alpha_blend(struct cell_blend_state *cb); +extern void +cell_generate_logic_op(struct spe_function *f, struct pipe_blend_state *blend, + struct pipe_surface *surf); + #endif /* CELL_STATE_PER_FRAGMENT_H */ diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 0a490ab277..fccff01e10 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -64,6 +64,9 @@ static unsigned char depth_stencil_code_buffer[4 * 64] static unsigned char fb_blend_code_buffer[4 * 64] ALIGN16_ATTRIB; +static unsigned char logicop_code_buffer[4 * 64] + ALIGN16_ATTRIB; + /** * Tell the PPU that this SPU has finished copying a buffer to @@ -513,6 +516,22 @@ cmd_batch(uint opcode) pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8); break; } + case CELL_CMD_STATE_LOGICOP: { + struct cell_command_logicop *code = + (struct cell_command_logicop *) &buffer[pos+1]; + + mfc_get(logicop_code_buffer, + (unsigned int) code->base, /* src */ + code->size, + TAG_BATCH_BUFFER, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << TAG_BATCH_BUFFER); + + spu.logicop = (logicop_func) logicop_code_buffer; + pos += (1 + ROUNDUP8(sizeof(struct cell_command_logicop)) / 8); + break; + } case CELL_CMD_FLUSH_BUFFER_RANGE: { struct cell_buffer_range *br = (struct cell_buffer_range *) &buffer[pos+1]; diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 49f5d99674..c20452931a 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -77,9 +77,14 @@ struct spu_blend_results { typedef struct spu_blend_results (*blend_func)( qword frag_r, qword frag_g, qword frag_b, qword frag_a, qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a, - qword const_r, qword const_g, qword const_b, qword const_a, + qword const_r, qword const_g, qword const_b, qword const_a); + +typedef struct spu_blend_results (*logicop_func)( + qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a, + qword frag_r, qword frag_g, qword frag_b, qword frag_a, qword frag_mask); + struct spu_framebuffer { void *color_start; /**< addr of color surface in main memory */ void *depth_start; /**< addr of depth surface in main memory */ @@ -111,6 +116,8 @@ struct spu_global blend_func blend; qword const_blend_color[4] ALIGN16_ATTRIB; + logicop_func logicop; + struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS]; struct cell_command_texture texture; diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index e6a1ce01df..95c629a8aa 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -305,7 +305,6 @@ emit_quad( int x, int y, mask_t mask ) if (spu_extract(spu_orx(mask), 0)) { const int ix = x - setup.cliprect_minx; const int iy = y - setup.cliprect_miny; - const vector unsigned char shuffle = spu.color_shuffle; vector float colors[4]; spu.cur_ctile_status = TILE_STATUS_DIRTY; @@ -330,45 +329,53 @@ emit_quad( int x, int y, mask_t mask ) } + /* Convert fragment data from AoS to SoA format. + */ + qword soa_frag[4]; + _transpose_matrix4x4((vec_float4 *) soa_frag, colors); + /* Read the current framebuffer values. - * - * Ignore read_fb for now. In the future we can use this to avoid - * reading the framebuffer if read_fb is false and the fragment mask is - * all 0xffffffff. This is the common case, so it is probably worth - * the effort. We'll have to profile to determine whether or not the - * extra conditional branches hurt overall performance. */ - vec_float4 aos_pix[4] = { - spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]), - spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]), - spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]), - spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]), + const qword pix[4] = { + (qword) spu_splats(spu.ctile.ui[iy+0][ix+0]), + (qword) spu_splats(spu.ctile.ui[iy+0][ix+1]), + (qword) spu_splats(spu.ctile.ui[iy+1][ix+0]), + (qword) spu_splats(spu.ctile.ui[iy+1][ix+1]), }; qword soa_pix[4]; - qword soa_frag[4]; - /* Convert pixel and fragment data from AoS to SoA format. - */ - _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix); - _transpose_matrix4x4((vec_float4 *) soa_frag, colors); + if (spu.read_fb) { + /* Convert pixel data from AoS to SoA format. + */ + vec_float4 aos_pix[4] = { + spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]), + spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]), + spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]), + spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]), + }; + + _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix); + } - const struct spu_blend_results result = + + struct spu_blend_results result = (*spu.blend)(soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3], soa_pix[0], soa_pix[1], soa_pix[2], soa_pix[3], spu.const_blend_color[0], spu.const_blend_color[1], - spu.const_blend_color[2], spu.const_blend_color[3], - (qword) mask); + spu.const_blend_color[2], spu.const_blend_color[3]); /* Convert final pixel data from SoA to AoS format. */ - _transpose_matrix4x4(aos_pix, (const vec_float4 *) &result); - - spu.ctile.ui[iy+0][ix+0] = spu_pack_color_shuffle(aos_pix[0], shuffle); - spu.ctile.ui[iy+0][ix+1] = spu_pack_color_shuffle(aos_pix[1], shuffle); - spu.ctile.ui[iy+1][ix+0] = spu_pack_color_shuffle(aos_pix[2], shuffle); - spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(aos_pix[3], shuffle); + result = (*spu.logicop)(pix[0], pix[1], pix[2], pix[3], + result.r, result.g, result.b, result.a, + (qword) mask); + + spu.ctile.ui[iy+0][ix+0] = spu_extract((vec_uint4) result.r, 0); + spu.ctile.ui[iy+0][ix+1] = spu_extract((vec_uint4) result.g, 0); + spu.ctile.ui[iy+1][ix+0] = spu_extract((vec_uint4) result.b, 0); + spu.ctile.ui[iy+1][ix+1] = spu_extract((vec_uint4) result.a, 0); } #endif } -- cgit v1.2.3 From 58b6690cf84147f88ea2ba95d2a929089e93b57f Mon Sep 17 00:00:00 2001 From: Brian Date: Mon, 31 Mar 2008 16:44:56 -0600 Subject: cell: updated comments: s/test/SPE/ --- src/gallium/drivers/cell/common.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index b0928fefd2..c9e873b35c 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -109,7 +109,7 @@ */ struct cell_command_depth_stencil_alpha_test { uint64_t base; /**< Effective address of code start. */ - unsigned size; /**< Size in bytes of test code. */ + unsigned size; /**< Size in bytes of SPE code. */ unsigned read_depth; /**< Flag: should depth be read? */ unsigned read_stencil; /**< Flag: should stencil be read? */ }; @@ -120,14 +120,14 @@ struct cell_command_depth_stencil_alpha_test { */ struct cell_command_blend { uint64_t base; /**< Effective address of code start. */ - unsigned size; /**< Size in bytes of test code. */ + unsigned size; /**< Size in bytes of SPE code. */ unsigned read_fb; /**< Flag: should framebuffer be read? */ }; struct cell_command_logicop { uint64_t base; /**< Effective address of code start. */ - unsigned size; /**< Size in bytes of test code. */ + unsigned size; /**< Size in bytes of SPE code. */ }; -- cgit v1.2.3 From 14452aee73e16f2ede075cf894e69d62cc539f5e Mon Sep 17 00:00:00 2001 From: Brian Date: Mon, 31 Mar 2008 17:38:21 -0600 Subject: cell: initial work to support multi-texture --- src/gallium/drivers/cell/common.h | 8 +++++-- src/gallium/drivers/cell/ppu/cell_state_emit.c | 18 +++++++--------- src/gallium/drivers/cell/spu/spu_main.c | 27 ++++++++++++++++-------- src/gallium/drivers/cell/spu/spu_main.h | 8 +++---- src/gallium/drivers/cell/spu/spu_texture.c | 29 ++++++++++++++++---------- src/gallium/drivers/cell/spu/spu_tri.c | 2 +- 6 files changed, 55 insertions(+), 37 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index c9e873b35c..298812fc20 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -66,6 +66,8 @@ #define CELL_MAX_SPUS 6 +#define CELL_MAX_SAMPLERS 4 + #define TILE_SIZE 32 @@ -228,8 +230,10 @@ struct cell_command_release_verts struct cell_command_texture { - void *start; /**< Address in main memory */ - uint width, height; + struct { + void *start; /**< Address in main memory */ + ushort width, height; + } texture[CELL_MAX_SAMPLERS]; }; diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 4c75caa025..4fbe1a21b8 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -129,17 +129,15 @@ cell_emit_state(struct cell_context *cell) if (cell->dirty & CELL_NEW_TEXTURE) { struct cell_command_texture texture; - if (cell->texture[0]) { - texture.start = cell->texture[0]->tiled_data; - texture.width = cell->texture[0]->base.width[0]; - texture.height = cell->texture[0]->base.height[0]; + uint i; + memset(&texture, 0, sizeof(texture)); + for (i = 0;i < CELL_MAX_SAMPLERS; i++) { + if (cell->texture[i]) { + texture.texture[i].start = cell->texture[i]->tiled_data; + texture.texture[i].width = cell->texture[i]->base.width[0]; + texture.texture[i].height = cell->texture[i]->base.height[0]; + } } - else { - texture.start = NULL; - texture.width = 0; - texture.height = 0; - } - emit_state_cmd(cell, CELL_CMD_STATE_TEXTURE, &texture, sizeof(struct cell_command_texture)); } diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index d7f46f8024..80fa5f7859 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -329,17 +329,26 @@ cmd_state_sampler(const struct pipe_sampler_state *state) static void cmd_state_texture(const struct cell_command_texture *texture) { - if (Debug) - printf("SPU %u: TEXTURE at %p size %u x %u\n", - spu.init.id, texture->start, texture->width, texture->height); + uint i; + + if (1||Debug) { + printf("SPU %u: TEXTURE\n", spu.init.id); + for (i = 0; i < CELL_MAX_SAMPLERS; i++) { + printf(" %d: at %p size %u x %u\n", i, texture->texture[i].start, + texture->texture[i].width, texture->texture[i].height); + } + } memcpy(&spu.texture, texture, sizeof(*texture)); - spu.tex_size = (vector float) - { spu.texture.width, spu.texture.height, 0.0, 0.0}; - spu.tex_size_mask = (vector unsigned int) - { spu.texture.width - 1, spu.texture.height - 1, 0, 0 }; - spu.tex_size_x_mask = spu_splats(spu.texture.width - 1); - spu.tex_size_y_mask = spu_splats(spu.texture.height - 1); + for (i = 0; i < CELL_MAX_SAMPLERS; i++) { + const uint width = texture->texture[i].width; + const uint height = texture->texture[i].height; + spu.tex_size[i] = (vector float) { width, height, 0.0, 0.0}; + spu.tex_size_mask[i] = (vector unsigned int) + { width - 1, height - 1, 0, 0 }; + spu.tex_size_x_mask[i] = spu_splats(width - 1); + spu.tex_size_y_mask[i] = spu_splats(height - 1); + } } diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index c20452931a..8a87787537 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -141,10 +141,10 @@ struct spu_global /** for converting RGBA to PIPE_FORMAT_x colors */ vector unsigned char color_shuffle; - vector float tex_size; - vector unsigned int tex_size_mask; /**< == int(size - 1) */ - vector unsigned int tex_size_x_mask; /**< == int(size - 1) */ - vector unsigned int tex_size_y_mask; /**< == int(size - 1) */ + vector float tex_size[CELL_MAX_SAMPLERS]; + vector unsigned int tex_size_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */ + vector unsigned int tex_size_x_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */ + vector unsigned int tex_size_y_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */ vector float (*sample_texture)(vector float texcoord); diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c index 67eb08196a..91a6aec5ec 100644 --- a/src/gallium/drivers/cell/spu/spu_texture.c +++ b/src/gallium/drivers/cell/spu/spu_texture.c @@ -40,25 +40,29 @@ void invalidate_tex_cache(void) { - spu_dcache_mark_dirty((unsigned) spu.texture.start, - 4 * spu.texture.width * spu.texture.height); + uint unit = 0; + uint bytes = 4 * spu.texture.texture[unit].width + * spu.texture.texture[unit].height; + + spu_dcache_mark_dirty((unsigned) spu.texture.texture[unit].start, bytes); } static uint get_texel(vec_uint4 coordinate) { + const uint unit = 0; vec_uint4 tmp; unsigned x = spu_extract(coordinate, 0); unsigned y = spu_extract(coordinate, 1); - const unsigned tiles_per_row = spu.texture.width / TILE_SIZE; + const unsigned tiles_per_row = spu.texture.texture[unit].width / TILE_SIZE; unsigned tile_offset = sizeof(tile_t) * ((y / TILE_SIZE * tiles_per_row) + (x / TILE_SIZE)); unsigned texel_offset = 4 * (((y % TILE_SIZE) * TILE_SIZE) + (x % TILE_SIZE)); spu_dcache_fetch_unaligned((qword *) & tmp, - spu.texture.start + tile_offset + texel_offset, + spu.texture.texture[unit].start + tile_offset + texel_offset, 4); return spu_extract(tmp, 0); } @@ -67,13 +71,14 @@ get_texel(vec_uint4 coordinate) static void get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels) { - const unsigned texture_ea = (uintptr_t) spu.texture.start; + const uint unit = 0; + const unsigned texture_ea = (uintptr_t) spu.texture.texture[unit].start; vec_uint4 tile_x = spu_rlmask(x, -5); vec_uint4 tile_y = spu_rlmask(y, -5); const qword offset_x = si_andi((qword) x, 0x1f); const qword offset_y = si_andi((qword) y, 0x1f); - const qword tiles_per_row = (qword) spu_splats(spu.texture.width / TILE_SIZE); + const qword tiles_per_row = (qword) spu_splats(spu.texture.texture[unit].width / TILE_SIZE); const qword tile_size = (qword) spu_splats(sizeof(tile_t)); qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x); @@ -101,9 +106,10 @@ get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels) vector float sample_texture_nearest(vector float texcoord) { - vector float tc = spu_mul(texcoord, spu.tex_size); + const uint unit = 0; + vector float tc = spu_mul(texcoord, spu.tex_size[unit]); vector unsigned int itc = spu_convtu(tc, 0); /* convert to int */ - itc = spu_and(itc, spu.tex_size_mask); /* mask (GL_REPEAT) */ + itc = spu_and(itc, spu.tex_size_mask[unit]); /* mask (GL_REPEAT) */ uint texel = get_texel(itc); return spu_unpack_A8R8G8B8(texel); } @@ -112,10 +118,11 @@ sample_texture_nearest(vector float texcoord) vector float sample_texture_bilinear(vector float texcoord) { + const uint unit = 0; static const vec_uint4 offset_x = {0, 0, 1, 1}; static const vec_uint4 offset_y = {0, 1, 0, 1}; - vector float tc = spu_mul(texcoord, spu.tex_size); + vector float tc = spu_mul(texcoord, spu.tex_size[unit]); tc = spu_add(tc, spu_splats(-0.5f)); /* half texel bias */ /* integer texcoords S,T: */ @@ -129,8 +136,8 @@ sample_texture_bilinear(vector float texcoord) x = spu_add(x, offset_x); y = spu_add(y, offset_y); - x = spu_and(x, spu.tex_size_x_mask); - y = spu_and(y, spu.tex_size_y_mask); + x = spu_and(x, spu.tex_size_x_mask[unit]); + y = spu_and(y, spu.tex_size_y_mask[unit]); get_four_texels(x, y, texels); diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index 95c629a8aa..9f63317b1f 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -309,7 +309,7 @@ emit_quad( int x, int y, mask_t mask ) spu.cur_ctile_status = TILE_STATUS_DIRTY; - if (spu.texture.start) { + if (spu.texture.texture[0].start) { /* texture mapping */ vector float texcoords[4]; eval_coeff(2, (float) x, (float) y, texcoords); -- cgit v1.2.3 From e6c981f22c0b6469ef44e9d7a34113db34647fef Mon Sep 17 00:00:00 2001 From: Brian Date: Mon, 31 Mar 2008 21:09:02 -0600 Subject: cell: more work for multi-texture support --- src/gallium/drivers/cell/common.h | 17 ++++++-- src/gallium/drivers/cell/ppu/cell_state_emit.c | 31 ++++++++++----- src/gallium/drivers/cell/spu/spu_main.c | 55 +++++++++++++++----------- src/gallium/drivers/cell/spu/spu_main.h | 18 ++++++--- src/gallium/drivers/cell/spu/spu_texture.c | 24 +++++------ src/gallium/drivers/cell/spu/spu_tri.c | 2 +- 6 files changed, 90 insertions(+), 57 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 298812fc20..f430e88b9c 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -36,6 +36,7 @@ #include "pipe/p_compiler.h" #include "pipe/p_util.h" #include "pipe/p_format.h" +#include "pipe/p_state.h" /** The standard assert macro doesn't seem to work reliably */ @@ -228,12 +229,20 @@ struct cell_command_release_verts }; +struct cell_command_sampler +{ + uint64_t opcode; /**< CELL_CMD_STATE_SAMPLER */ + uint unit; + struct pipe_sampler_state state; +}; + + struct cell_command_texture { - struct { - void *start; /**< Address in main memory */ - ushort width, height; - } texture[CELL_MAX_SAMPLERS]; + uint64_t opcode; /**< CELL_CMD_STATE_TEXTURE */ + uint unit; + void *start; /**< Address in main memory */ + ushort width, height; }; diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 4fbe1a21b8..9cae67f091 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -121,25 +121,36 @@ cell_emit_state(struct cell_context *cell) } if (cell->dirty & CELL_NEW_SAMPLER) { - if (cell->sampler[0]) { - emit_state_cmd(cell, CELL_CMD_STATE_SAMPLER, - cell->sampler[0], sizeof(struct pipe_sampler_state)); + uint i; + for (i = 0; i < CELL_MAX_SAMPLERS; i++) { + if (cell->sampler[i]) { + struct cell_command_sampler *sampler + = cell_batch_alloc(cell, sizeof(*sampler)); + sampler->opcode = CELL_CMD_STATE_SAMPLER; + sampler->unit = i; + sampler->state = *cell->sampler[i]; + } } } if (cell->dirty & CELL_NEW_TEXTURE) { - struct cell_command_texture texture; uint i; - memset(&texture, 0, sizeof(texture)); for (i = 0;i < CELL_MAX_SAMPLERS; i++) { + struct cell_command_texture *texture + = cell_batch_alloc(cell, sizeof(*texture)); + texture->opcode = CELL_CMD_STATE_TEXTURE; + texture->unit = i; if (cell->texture[i]) { - texture.texture[i].start = cell->texture[i]->tiled_data; - texture.texture[i].width = cell->texture[i]->base.width[0]; - texture.texture[i].height = cell->texture[i]->base.height[0]; + texture->start = cell->texture[i]->tiled_data; + texture->width = cell->texture[i]->base.width[0]; + texture->height = cell->texture[i]->base.height[0]; + } + else { + texture->start = NULL; + texture->width = 1; + texture->height = 1; } } - emit_state_cmd(cell, CELL_CMD_STATE_TEXTURE, - &texture, sizeof(struct cell_command_texture)); } if (cell->dirty & CELL_NEW_VERTEX_INFO) { diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 80fa5f7859..7f0473d198 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -312,13 +312,13 @@ cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *stat static void -cmd_state_sampler(const struct pipe_sampler_state *state) +cmd_state_sampler(const struct cell_command_sampler *sampler) { if (Debug) - printf("SPU %u: SAMPLER\n", - spu.init.id); + printf("SPU %u: SAMPLER [%u]\n", + spu.init.id, sampler->unit); - memcpy(&spu.sampler[0], state, sizeof(*state)); + spu.sampler[sampler->unit] = sampler->state; if (spu.sampler[0].min_img_filter == PIPE_TEX_FILTER_LINEAR) spu.sample_texture = sample_texture_bilinear; else @@ -329,26 +329,25 @@ cmd_state_sampler(const struct pipe_sampler_state *state) static void cmd_state_texture(const struct cell_command_texture *texture) { - uint i; + const uint unit = texture->unit; + const uint width = texture->width; + const uint height = texture->height; - if (1||Debug) { - printf("SPU %u: TEXTURE\n", spu.init.id); - for (i = 0; i < CELL_MAX_SAMPLERS; i++) { - printf(" %d: at %p size %u x %u\n", i, texture->texture[i].start, - texture->texture[i].width, texture->texture[i].height); - } + if (Debug) { + printf("SPU %u: TEXTURE [%u] at %p size %u x %u\n", spu.init.id, + texture->unit, texture->start, + texture->width, texture->height); } - memcpy(&spu.texture, texture, sizeof(*texture)); - for (i = 0; i < CELL_MAX_SAMPLERS; i++) { - const uint width = texture->texture[i].width; - const uint height = texture->texture[i].height; - spu.tex_size[i] = (vector float) { width, height, 0.0, 0.0}; - spu.tex_size_mask[i] = (vector unsigned int) + spu.texture[unit].start = texture->start; + spu.texture[unit].width = width; + spu.texture[unit].height = height; + + spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0}; + spu.texture[unit].tex_size_mask = (vector unsigned int) { width - 1, height - 1, 0, 0 }; - spu.tex_size_x_mask[i] = spu_splats(width - 1); - spu.tex_size_y_mask[i] = spu_splats(height - 1); - } + spu.texture[unit].tex_size_x_mask = spu_splats(width - 1); + spu.texture[unit].tex_size_y_mask = spu_splats(height - 1); } @@ -480,12 +479,20 @@ cmd_batch(uint opcode) pos += (1 + ROUNDUP8(sizeof(struct cell_command_depth_stencil_alpha_test)) / 8); break; case CELL_CMD_STATE_SAMPLER: - cmd_state_sampler((struct pipe_sampler_state *) &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct pipe_sampler_state)) / 8); + { + struct cell_command_sampler *sampler + = (struct cell_command_sampler *) &buffer[pos]; + cmd_state_sampler(sampler); + pos += sizeof(*sampler) / 8; + } break; case CELL_CMD_STATE_TEXTURE: - cmd_state_texture((struct cell_command_texture *) &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct cell_command_texture)) / 8); + { + struct cell_command_texture *texture + = (struct cell_command_texture *) &buffer[pos]; + cmd_state_texture(texture); + pos += sizeof(*texture) / 8; + } break; case CELL_CMD_STATE_VERTEX_INFO: cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]); diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 8a87787537..2bfad3535a 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -100,6 +100,17 @@ struct spu_framebuffer { } ALIGN16_ATTRIB; +struct spu_texture +{ + void *start; + uint width, height; + vector float tex_size; + vector unsigned int tex_size_mask; /**< == int(size - 1) */ + vector unsigned int tex_size_x_mask; /**< == int(size - 1) */ + vector unsigned int tex_size_y_mask; /**< == int(size - 1) */ +} ALIGN16_ATTRIB; + + /** * All SPU global/context state will be in singleton object of this type: */ @@ -119,7 +130,7 @@ struct spu_global logicop_func logicop; struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS]; - struct cell_command_texture texture; + struct spu_texture texture[PIPE_MAX_SAMPLERS]; struct vertex_info vertex_info; @@ -141,11 +152,6 @@ struct spu_global /** for converting RGBA to PIPE_FORMAT_x colors */ vector unsigned char color_shuffle; - vector float tex_size[CELL_MAX_SAMPLERS]; - vector unsigned int tex_size_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */ - vector unsigned int tex_size_x_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */ - vector unsigned int tex_size_y_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */ - vector float (*sample_texture)(vector float texcoord); } ALIGN16_ATTRIB; diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c index 91a6aec5ec..4612501eb3 100644 --- a/src/gallium/drivers/cell/spu/spu_texture.c +++ b/src/gallium/drivers/cell/spu/spu_texture.c @@ -41,10 +41,10 @@ void invalidate_tex_cache(void) { uint unit = 0; - uint bytes = 4 * spu.texture.texture[unit].width - * spu.texture.texture[unit].height; + uint bytes = 4 * spu.texture[unit].width + * spu.texture[unit].height; - spu_dcache_mark_dirty((unsigned) spu.texture.texture[unit].start, bytes); + spu_dcache_mark_dirty((unsigned) spu.texture[unit].start, bytes); } @@ -55,14 +55,14 @@ get_texel(vec_uint4 coordinate) vec_uint4 tmp; unsigned x = spu_extract(coordinate, 0); unsigned y = spu_extract(coordinate, 1); - const unsigned tiles_per_row = spu.texture.texture[unit].width / TILE_SIZE; + const unsigned tiles_per_row = spu.texture[unit].width / TILE_SIZE; unsigned tile_offset = sizeof(tile_t) * ((y / TILE_SIZE * tiles_per_row) + (x / TILE_SIZE)); unsigned texel_offset = 4 * (((y % TILE_SIZE) * TILE_SIZE) + (x % TILE_SIZE)); spu_dcache_fetch_unaligned((qword *) & tmp, - spu.texture.texture[unit].start + tile_offset + texel_offset, + spu.texture[unit].start + tile_offset + texel_offset, 4); return spu_extract(tmp, 0); } @@ -72,13 +72,13 @@ static void get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels) { const uint unit = 0; - const unsigned texture_ea = (uintptr_t) spu.texture.texture[unit].start; + const unsigned texture_ea = (uintptr_t) spu.texture[unit].start; vec_uint4 tile_x = spu_rlmask(x, -5); vec_uint4 tile_y = spu_rlmask(y, -5); const qword offset_x = si_andi((qword) x, 0x1f); const qword offset_y = si_andi((qword) y, 0x1f); - const qword tiles_per_row = (qword) spu_splats(spu.texture.texture[unit].width / TILE_SIZE); + const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].width / TILE_SIZE); const qword tile_size = (qword) spu_splats(sizeof(tile_t)); qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x); @@ -107,9 +107,9 @@ vector float sample_texture_nearest(vector float texcoord) { const uint unit = 0; - vector float tc = spu_mul(texcoord, spu.tex_size[unit]); + vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size); vector unsigned int itc = spu_convtu(tc, 0); /* convert to int */ - itc = spu_and(itc, spu.tex_size_mask[unit]); /* mask (GL_REPEAT) */ + itc = spu_and(itc, spu.texture[unit].tex_size_mask); /* mask (GL_REPEAT) */ uint texel = get_texel(itc); return spu_unpack_A8R8G8B8(texel); } @@ -122,7 +122,7 @@ sample_texture_bilinear(vector float texcoord) static const vec_uint4 offset_x = {0, 0, 1, 1}; static const vec_uint4 offset_y = {0, 1, 0, 1}; - vector float tc = spu_mul(texcoord, spu.tex_size[unit]); + vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size); tc = spu_add(tc, spu_splats(-0.5f)); /* half texel bias */ /* integer texcoords S,T: */ @@ -136,8 +136,8 @@ sample_texture_bilinear(vector float texcoord) x = spu_add(x, offset_x); y = spu_add(y, offset_y); - x = spu_and(x, spu.tex_size_x_mask[unit]); - y = spu_and(y, spu.tex_size_y_mask[unit]); + x = spu_and(x, spu.texture[unit].tex_size_x_mask); + y = spu_and(y, spu.texture[unit].tex_size_y_mask); get_four_texels(x, y, texels); diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index 9f63317b1f..17e337bbdf 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -309,7 +309,7 @@ emit_quad( int x, int y, mask_t mask ) spu.cur_ctile_status = TILE_STATUS_DIRTY; - if (spu.texture.texture[0].start) { + if (spu.texture[0].start) { /* texture mapping */ vector float texcoords[4]; eval_coeff(2, (float) x, (float) y, texcoords); -- cgit v1.2.3 From 4f25420bdd834e81a3e22733304efc5261c2998a Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Sun, 24 Aug 2008 17:48:55 -0600 Subject: gallium: refactor/replace p_util.h with util/u_memory.h and util/u_math.h Also, rename p_tile.[ch] to u_tile.[ch] --- src/gallium/README.portability | 4 +- src/gallium/auxiliary/cso_cache/cso_cache.c | 3 +- src/gallium/auxiliary/cso_cache/cso_context.c | 2 +- src/gallium/auxiliary/cso_cache/cso_hash.c | 2 +- src/gallium/auxiliary/draw/draw_context.c | 3 +- src/gallium/auxiliary/draw/draw_pipe.c | 1 - src/gallium/auxiliary/draw/draw_pipe_aaline.c | 3 +- src/gallium/auxiliary/draw/draw_pipe_aapoint.c | 4 +- src/gallium/auxiliary/draw/draw_pipe_clip.c | 4 +- src/gallium/auxiliary/draw/draw_pipe_cull.c | 2 +- src/gallium/auxiliary/draw/draw_pipe_flatshade.c | 4 +- src/gallium/auxiliary/draw/draw_pipe_offset.c | 3 +- src/gallium/auxiliary/draw/draw_pipe_pstipple.c | 4 +- src/gallium/auxiliary/draw/draw_pipe_stipple.c | 6 +- src/gallium/auxiliary/draw/draw_pipe_twoside.c | 3 +- src/gallium/auxiliary/draw/draw_pipe_unfilled.c | 2 +- src/gallium/auxiliary/draw/draw_pipe_util.c | 2 +- src/gallium/auxiliary/draw/draw_pipe_validate.c | 2 +- src/gallium/auxiliary/draw/draw_pipe_vbuf.c | 3 +- src/gallium/auxiliary/draw/draw_pipe_wide_line.c | 3 +- src/gallium/auxiliary/draw/draw_pipe_wide_point.c | 3 +- src/gallium/auxiliary/draw/draw_pt.c | 1 - src/gallium/auxiliary/draw/draw_pt_emit.c | 2 +- src/gallium/auxiliary/draw/draw_pt_fetch.c | 2 +- src/gallium/auxiliary/draw/draw_pt_fetch_emit.c | 2 +- .../auxiliary/draw/draw_pt_fetch_shade_emit.c | 3 +- .../auxiliary/draw/draw_pt_fetch_shade_pipeline.c | 3 +- src/gallium/auxiliary/draw/draw_pt_post_vs.c | 2 +- src/gallium/auxiliary/draw/draw_pt_util.c | 1 - src/gallium/auxiliary/draw/draw_pt_varray.c | 4 +- src/gallium/auxiliary/draw/draw_pt_vcache.c | 2 +- src/gallium/auxiliary/draw/draw_vbuf.h | 2 - src/gallium/auxiliary/draw/draw_vs.c | 6 +- src/gallium/auxiliary/draw/draw_vs_aos.c | 4 +- src/gallium/auxiliary/draw/draw_vs_aos_io.c | 2 +- src/gallium/auxiliary/draw/draw_vs_aos_machine.c | 3 +- src/gallium/auxiliary/draw/draw_vs_exec.c | 3 +- src/gallium/auxiliary/draw/draw_vs_llvm.c | 1 - src/gallium/auxiliary/draw/draw_vs_sse.c | 3 +- src/gallium/auxiliary/draw/draw_vs_varient.c | 3 +- src/gallium/auxiliary/gallivm/gallivm_cpu.cpp | 3 +- src/gallium/auxiliary/gallivm/instructions.cpp | 2 +- src/gallium/auxiliary/gallivm/instructionssoa.cpp | 2 +- .../auxiliary/pipebuffer/pb_buffer_fenced.c | 2 +- .../auxiliary/pipebuffer/pb_buffer_malloc.c | 2 +- src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c | 2 +- src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c | 2 +- src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c | 2 +- .../auxiliary/pipebuffer/pb_bufmgr_fenced.c | 2 +- src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c | 2 +- src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c | 2 +- src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c | 2 +- src/gallium/auxiliary/pipebuffer/pb_validate.c | 2 +- src/gallium/auxiliary/pipebuffer/pb_winsys.c | 2 +- src/gallium/auxiliary/rtasm/rtasm_execmem.c | 2 +- src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 2 +- src/gallium/auxiliary/sct/sct.c | 2 +- src/gallium/auxiliary/tgsi/tgsi_build.c | 1 - src/gallium/auxiliary/tgsi/tgsi_build.h | 4 + src/gallium/auxiliary/tgsi/tgsi_dump_c.c | 1 - src/gallium/auxiliary/tgsi/tgsi_exec.c | 2 +- src/gallium/auxiliary/tgsi/tgsi_parse.c | 2 +- src/gallium/auxiliary/tgsi/tgsi_scan.c | 6 +- src/gallium/auxiliary/tgsi/tgsi_sse2.c | 2 +- src/gallium/auxiliary/tgsi/tgsi_transform.c | 1 + src/gallium/auxiliary/tgsi/tgsi_transform.h | 1 - src/gallium/auxiliary/tgsi/tgsi_util.c | 1 - src/gallium/auxiliary/translate/translate.c | 1 - src/gallium/auxiliary/translate/translate_cache.c | 2 +- .../auxiliary/translate/translate_generic.c | 2 +- src/gallium/auxiliary/translate/translate_sse.c | 2 +- src/gallium/auxiliary/util/Makefile | 2 +- src/gallium/auxiliary/util/SConscript | 2 +- src/gallium/auxiliary/util/p_debug.c | 1 - src/gallium/auxiliary/util/u_blit.c | 5 +- src/gallium/auxiliary/util/u_gen_mipmap.c | 2 +- src/gallium/auxiliary/util/u_handle_table.c | 4 +- src/gallium/auxiliary/util/u_hash_table.c | 5 +- src/gallium/auxiliary/util/u_math.h | 240 +++- src/gallium/auxiliary/util/u_memory.h | 222 ++++ src/gallium/auxiliary/util/u_mm.c | 2 +- src/gallium/auxiliary/util/u_pack_color.h | 36 +- src/gallium/auxiliary/util/u_pointer.h | 107 ++ src/gallium/auxiliary/util/u_rect.c | 1 - src/gallium/auxiliary/util/u_simple_shaders.c | 2 +- src/gallium/auxiliary/util/u_tile.c | 1169 ++++++++++++++++++++ src/gallium/auxiliary/util/u_tile.h | 101 ++ src/gallium/drivers/cell/common.h | 1 - src/gallium/drivers/cell/ppu/cell_clear.c | 2 +- src/gallium/drivers/cell/ppu/cell_context.c | 2 +- src/gallium/drivers/cell/ppu/cell_pipe_state.c | 2 +- src/gallium/drivers/cell/ppu/cell_render.c | 2 +- src/gallium/drivers/cell/ppu/cell_screen.c | 2 +- src/gallium/drivers/cell/ppu/cell_state_derived.c | 2 +- src/gallium/drivers/cell/ppu/cell_state_emit.c | 2 +- src/gallium/drivers/cell/ppu/cell_state_shader.c | 2 +- src/gallium/drivers/cell/ppu/cell_surface.c | 2 +- src/gallium/drivers/cell/ppu/cell_texture.c | 2 +- src/gallium/drivers/cell/ppu/cell_winsys.c | 2 +- src/gallium/drivers/cell/spu/spu_exec.c | 1 - src/gallium/drivers/cell/spu/spu_tri.c | 1 - src/gallium/drivers/cell/spu/spu_util.c | 1 - src/gallium/drivers/cell/spu/spu_vertex_fetch.c | 1 - src/gallium/drivers/cell/spu/spu_vertex_shader.c | 1 - src/gallium/drivers/failover/fo_context.c | 2 +- src/gallium/drivers/i915simple/i915_context.c | 2 +- src/gallium/drivers/i915simple/i915_debug_fp.c | 2 +- src/gallium/drivers/i915simple/i915_fpc.h | 1 - .../drivers/i915simple/i915_fpc_translate.c | 2 + src/gallium/drivers/i915simple/i915_prim_emit.c | 4 +- src/gallium/drivers/i915simple/i915_prim_vbuf.c | 3 +- src/gallium/drivers/i915simple/i915_screen.c | 2 +- src/gallium/drivers/i915simple/i915_state.c | 3 +- .../drivers/i915simple/i915_state_derived.c | 2 +- .../drivers/i915simple/i915_state_dynamic.c | 4 +- .../drivers/i915simple/i915_state_immediate.c | 2 +- .../drivers/i915simple/i915_state_sampler.c | 2 +- src/gallium/drivers/i915simple/i915_surface.c | 3 +- src/gallium/drivers/i915simple/i915_texture.c | 3 +- src/gallium/drivers/i965simple/brw_cc.c | 6 +- src/gallium/drivers/i965simple/brw_clip_state.c | 3 +- src/gallium/drivers/i965simple/brw_context.c | 2 +- src/gallium/drivers/i965simple/brw_curbe.c | 3 +- src/gallium/drivers/i965simple/brw_draw_upload.c | 1 + src/gallium/drivers/i965simple/brw_gs_state.c | 3 +- src/gallium/drivers/i965simple/brw_screen.c | 2 +- src/gallium/drivers/i965simple/brw_sf_state.c | 5 +- src/gallium/drivers/i965simple/brw_shader_info.c | 2 +- src/gallium/drivers/i965simple/brw_state.c | 2 +- src/gallium/drivers/i965simple/brw_state_batch.c | 2 +- src/gallium/drivers/i965simple/brw_state_cache.c | 2 +- src/gallium/drivers/i965simple/brw_state_pool.c | 3 +- src/gallium/drivers/i965simple/brw_state_upload.c | 2 +- src/gallium/drivers/i965simple/brw_surface.c | 3 +- src/gallium/drivers/i965simple/brw_tex_layout.c | 8 +- src/gallium/drivers/i965simple/brw_vs_state.c | 3 +- src/gallium/drivers/i965simple/brw_wm.c | 2 +- src/gallium/drivers/i965simple/brw_wm_decl.c | 3 +- src/gallium/drivers/i965simple/brw_wm_glsl.c | 3 +- .../drivers/i965simple/brw_wm_sampler_state.c | 3 +- src/gallium/drivers/i965simple/brw_wm_state.c | 3 +- src/gallium/drivers/softpipe/sp_context.c | 2 +- src/gallium/drivers/softpipe/sp_fs_exec.c | 2 +- src/gallium/drivers/softpipe/sp_fs_llvm.c | 2 +- src/gallium/drivers/softpipe/sp_fs_sse.c | 2 +- src/gallium/drivers/softpipe/sp_prim_setup.c | 2 +- src/gallium/drivers/softpipe/sp_prim_vbuf.c | 1 + src/gallium/drivers/softpipe/sp_quad_alpha_test.c | 2 +- src/gallium/drivers/softpipe/sp_quad_blend.c | 29 +- src/gallium/drivers/softpipe/sp_quad_bufloop.c | 2 +- src/gallium/drivers/softpipe/sp_quad_colormask.c | 3 +- src/gallium/drivers/softpipe/sp_quad_coverage.c | 2 +- src/gallium/drivers/softpipe/sp_quad_depth_test.c | 2 +- src/gallium/drivers/softpipe/sp_quad_earlyz.c | 2 +- src/gallium/drivers/softpipe/sp_quad_fs.c | 3 +- src/gallium/drivers/softpipe/sp_quad_occlusion.c | 2 +- src/gallium/drivers/softpipe/sp_quad_output.c | 2 +- src/gallium/drivers/softpipe/sp_quad_stencil.c | 2 +- src/gallium/drivers/softpipe/sp_quad_stipple.c | 2 +- src/gallium/drivers/softpipe/sp_query.c | 2 +- src/gallium/drivers/softpipe/sp_screen.c | 2 +- src/gallium/drivers/softpipe/sp_setup.c | 2 +- src/gallium/drivers/softpipe/sp_state_blend.c | 2 +- src/gallium/drivers/softpipe/sp_state_derived.c | 3 +- src/gallium/drivers/softpipe/sp_state_fs.c | 2 +- src/gallium/drivers/softpipe/sp_state_rasterizer.c | 2 +- src/gallium/drivers/softpipe/sp_state_sampler.c | 2 +- src/gallium/drivers/softpipe/sp_surface.c | 3 +- src/gallium/drivers/softpipe/sp_tex_sample.c | 2 +- src/gallium/drivers/softpipe/sp_texture.c | 3 +- src/gallium/drivers/softpipe/sp_tile_cache.c | 4 +- src/gallium/drivers/trace/tr_context.c | 2 +- src/gallium/drivers/trace/tr_dump.c | 2 + src/gallium/drivers/trace/tr_dump.h | 1 - src/gallium/drivers/trace/tr_screen.c | 2 +- src/gallium/drivers/trace/tr_state.c | 1 + src/gallium/drivers/trace/tr_stream_stdc.c | 2 +- src/gallium/drivers/trace/tr_stream_wd.c | 2 +- src/gallium/drivers/trace/tr_texture.c | 2 +- src/gallium/drivers/trace/tr_winsys.c | 3 +- src/gallium/include/pipe/p_util.h | 460 -------- src/gallium/state_trackers/python/gallium.i | 2 +- src/gallium/state_trackers/python/st_device.c | 3 +- src/gallium/state_trackers/python/st_sample.c | 5 +- .../state_trackers/python/st_softpipe_winsys.c | 3 +- .../winsys/drm/intel/common/intel_be_device.c | 2 +- .../winsys/drm/intel/dri/intel_winsys_softpipe.c | 2 +- src/gallium/winsys/egl_xlib/egl_xlib.c | 2 +- src/gallium/winsys/egl_xlib/sw_winsys.c | 3 +- src/gallium/winsys/gdi/wmesa.c | 2 +- src/gallium/winsys/xlib/brw_aub.c | 1 - src/gallium/winsys/xlib/xm_winsys.c | 3 +- src/gallium/winsys/xlib/xm_winsys_aub.c | 2 +- src/mesa/state_tracker/acc2.c | 319 ++++++ src/mesa/state_tracker/st_cb_accum.c | 2 +- src/mesa/state_tracker/st_cb_bitmap.c | 2 +- src/mesa/state_tracker/st_cb_drawpixels.c | 2 +- src/mesa/state_tracker/st_cb_readpixels.c | 2 +- src/mesa/state_tracker/st_cb_texture.c | 2 +- src/mesa/state_tracker/st_program.c | 2 +- src/mesa/state_tracker/st_texture.c | 1 - 201 files changed, 2453 insertions(+), 686 deletions(-) create mode 100644 src/gallium/auxiliary/util/u_memory.h create mode 100644 src/gallium/auxiliary/util/u_pointer.h create mode 100644 src/gallium/auxiliary/util/u_tile.c create mode 100644 src/gallium/auxiliary/util/u_tile.h delete mode 100644 src/gallium/include/pipe/p_util.h create mode 100644 src/mesa/state_tracker/acc2.c (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/README.portability b/src/gallium/README.portability index d5d5987a7f..adecf4bb79 100644 --- a/src/gallium/README.portability +++ b/src/gallium/README.portability @@ -35,8 +35,8 @@ not available in Windows Kernel Mode. Use the appropriate p_*.h include. * Use MALLOC, CALLOC, FREE instead of the malloc, calloc, free functions. -* Use align_pointer() function defined in p_util.h for aligning pointers in a -portable way. +* Use align_pointer() function defined in u_memory.h for aligning pointers + in a portable way. == Debugging == diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.c b/src/gallium/auxiliary/cso_cache/cso_cache.c index 36dc46ff80..6b1754ea00 100644 --- a/src/gallium/auxiliary/cso_cache/cso_cache.c +++ b/src/gallium/auxiliary/cso_cache/cso_cache.c @@ -28,9 +28,10 @@ /* Authors: Zack Rusin */ -#include "pipe/p_util.h" #include "pipe/p_debug.h" +#include "util/u_memory.h" + #include "cso_cache.h" #include "cso_hash.h" diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c index 86e4d46a20..f22ba40824 100644 --- a/src/gallium/auxiliary/cso_cache/cso_context.c +++ b/src/gallium/auxiliary/cso_cache/cso_context.c @@ -36,7 +36,7 @@ */ #include "pipe/p_state.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_inlines.h" #include "tgsi/tgsi_parse.h" diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.c b/src/gallium/auxiliary/cso_cache/cso_hash.c index 0646efd952..7f0044c5a7 100644 --- a/src/gallium/auxiliary/cso_cache/cso_hash.c +++ b/src/gallium/auxiliary/cso_cache/cso_hash.c @@ -31,7 +31,7 @@ */ #include "pipe/p_debug.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "cso_hash.h" diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c index 2f263cf06a..1c26cb31a3 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -31,7 +31,8 @@ */ -#include "pipe/p_util.h" +#include "util/u_memory.h" +#include "util/u_math.h" #include "draw_context.h" #include "draw_vbuf.h" #include "draw_vs.h" diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c index 1db43876ef..3cde9d36d3 100644 --- a/src/gallium/auxiliary/draw/draw_pipe.c +++ b/src/gallium/auxiliary/draw/draw_pipe.c @@ -30,7 +30,6 @@ * Keith Whitwell */ -#include "pipe/p_util.h" #include "draw/draw_private.h" #include "draw/draw_pipe.h" diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c index 991304b2c8..20841bb5d6 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c +++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c @@ -32,11 +32,12 @@ */ -#include "pipe/p_util.h" #include "pipe/p_inlines.h" #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "tgsi/tgsi_transform.h" #include "tgsi/tgsi_dump.h" diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c index c7f4349cb3..2c1cacbdb4 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c +++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c @@ -38,7 +38,6 @@ */ -#include "pipe/p_util.h" #include "pipe/p_inlines.h" #include "pipe/p_context.h" #include "pipe/p_defines.h" @@ -47,6 +46,9 @@ #include "tgsi/tgsi_transform.h" #include "tgsi/tgsi_dump.h" +#include "util/u_math.h" +#include "util/u_memory.h" + #include "draw_context.h" #include "draw_vs.h" #include "draw_pipe.h" diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c index fa10f8efca..3265dcd154 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_clip.c +++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c @@ -32,7 +32,9 @@ */ -#include "pipe/p_util.h" +#include "util/u_memory.h" +#include "util/u_math.h" + #include "pipe/p_shader_tokens.h" #include "draw_vs.h" diff --git a/src/gallium/auxiliary/draw/draw_pipe_cull.c b/src/gallium/auxiliary/draw/draw_pipe_cull.c index d0d22a38e0..053be5f050 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_cull.c +++ b/src/gallium/auxiliary/draw/draw_pipe_cull.c @@ -33,7 +33,7 @@ */ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_defines.h" #include "draw_pipe.h" diff --git a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c index 4741b22d02..43d1fecc4d 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c +++ b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c @@ -28,7 +28,9 @@ /* Authors: Keith Whitwell */ -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" + #include "pipe/p_shader_tokens.h" #include "draw_vs.h" #include "draw_pipe.h" diff --git a/src/gallium/auxiliary/draw/draw_pipe_offset.c b/src/gallium/auxiliary/draw/draw_pipe_offset.c index 2f5865741c..1fea5e6dcb 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_offset.c +++ b/src/gallium/auxiliary/draw/draw_pipe_offset.c @@ -32,7 +32,8 @@ * \author Brian Paul */ -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "draw_pipe.h" diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c index e97136fa1f..b764d9c518 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c +++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c @@ -34,12 +34,14 @@ */ -#include "pipe/p_util.h" #include "pipe/p_inlines.h" #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#include "util/u_memory.h" + #include "tgsi/tgsi_transform.h" #include "tgsi/tgsi_dump.h" diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c index bf0db18a68..b65e2aa102 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c +++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c @@ -36,10 +36,12 @@ */ -#include "pipe/p_util.h" #include "pipe/p_defines.h" #include "pipe/p_shader_tokens.h" -#include "draw_pipe.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "draw/draw_pipe.h" /** Subclass of draw_stage */ diff --git a/src/gallium/auxiliary/draw/draw_pipe_twoside.c b/src/gallium/auxiliary/draw/draw_pipe_twoside.c index 3ac825f565..c329d92339 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_twoside.c +++ b/src/gallium/auxiliary/draw/draw_pipe_twoside.c @@ -28,7 +28,8 @@ /* Authors: Keith Whitwell */ -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "pipe/p_defines.h" #include "pipe/p_shader_tokens.h" #include "draw_vs.h" diff --git a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c index 8f97fdedaa..68835fd1a5 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c +++ b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c @@ -33,7 +33,7 @@ /* Authors: Keith Whitwell */ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_defines.h" #include "draw_private.h" #include "draw_pipe.h" diff --git a/src/gallium/auxiliary/draw/draw_pipe_util.c b/src/gallium/auxiliary/draw/draw_pipe_util.c index 04438f4dd0..e22e5fed0c 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_util.c +++ b/src/gallium/auxiliary/draw/draw_pipe_util.c @@ -30,7 +30,7 @@ * Keith Whitwell */ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "draw/draw_private.h" #include "draw/draw_pipe.h" diff --git a/src/gallium/auxiliary/draw/draw_pipe_validate.c b/src/gallium/auxiliary/draw/draw_pipe_validate.c index 6be1d369c3..f34c68728e 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_validate.c +++ b/src/gallium/auxiliary/draw/draw_pipe_validate.c @@ -28,7 +28,7 @@ /* Authors: Keith Whitwell */ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_defines.h" #include "draw_private.h" #include "draw_pipe.h" diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c index a6fde77a0e..c0cf4269db 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c +++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c @@ -35,7 +35,8 @@ #include "pipe/p_debug.h" -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "draw_vbuf.h" #include "draw_private.h" diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c index 48ec2f1239..184e363594 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c +++ b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c @@ -28,9 +28,10 @@ /* Authors: Keith Whitwell */ -#include "pipe/p_util.h" #include "pipe/p_defines.h" #include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "draw_private.h" #include "draw_pipe.h" diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c index 54590984c6..4f1326053d 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c +++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c @@ -28,7 +28,8 @@ /* Authors: Keith Whitwell */ -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "pipe/p_defines.h" #include "pipe/p_shader_tokens.h" #include "draw_vs.h" diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c index 85a75525c8..669c11c993 100644 --- a/src/gallium/auxiliary/draw/draw_pt.c +++ b/src/gallium/auxiliary/draw/draw_pt.c @@ -30,7 +30,6 @@ * Keith Whitwell */ -#include "pipe/p_util.h" #include "draw/draw_context.h" #include "draw/draw_private.h" #include "draw/draw_pt.h" diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c index 40f05cb9e0..d4eca80588 100644 --- a/src/gallium/auxiliary/draw/draw_pt_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_emit.c @@ -25,7 +25,7 @@ * **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "draw/draw_context.h" #include "draw/draw_private.h" #include "draw/draw_vbuf.h" diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch.c b/src/gallium/auxiliary/draw/draw_pt_fetch.c index 07f4c99164..6377f896fb 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c @@ -25,7 +25,7 @@ * **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "draw/draw_context.h" #include "draw/draw_private.h" #include "draw/draw_vbuf.h" diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c index 4a1f3b0953..0684c93d10 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c @@ -30,7 +30,7 @@ * Keith Whitwell */ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "draw/draw_context.h" #include "draw/draw_private.h" #include "draw/draw_vbuf.h" diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c index fdf9b6fe6a..87094f3092 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c @@ -31,7 +31,8 @@ */ -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "draw/draw_context.h" #include "draw/draw_private.h" #include "draw/draw_vbuf.h" diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c index be3535ed9e..f617aac9f7 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c @@ -25,7 +25,8 @@ * **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "draw/draw_context.h" #include "draw/draw_vbuf.h" #include "draw/draw_vertex.h" diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c index af6306b1c6..96dc706b99 100644 --- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c +++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c @@ -25,7 +25,7 @@ * **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_context.h" #include "draw/draw_context.h" #include "draw/draw_private.h" diff --git a/src/gallium/auxiliary/draw/draw_pt_util.c b/src/gallium/auxiliary/draw/draw_pt_util.c index 32c8a9632c..3bc7939c55 100644 --- a/src/gallium/auxiliary/draw/draw_pt_util.c +++ b/src/gallium/auxiliary/draw/draw_pt_util.c @@ -30,7 +30,6 @@ * Keith Whitwell */ -#include "pipe/p_util.h" #include "draw/draw_context.h" #include "draw/draw_private.h" #include "draw/draw_pt.h" diff --git a/src/gallium/auxiliary/draw/draw_pt_varray.c b/src/gallium/auxiliary/draw/draw_pt_varray.c index 46e722a154..c15afe65f1 100644 --- a/src/gallium/auxiliary/draw/draw_pt_varray.c +++ b/src/gallium/auxiliary/draw/draw_pt_varray.c @@ -25,7 +25,9 @@ * **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" + #include "draw/draw_context.h" #include "draw/draw_private.h" #include "draw/draw_pt.h" diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c index cda2987c9e..b8b5de729d 100644 --- a/src/gallium/auxiliary/draw/draw_pt_vcache.c +++ b/src/gallium/auxiliary/draw/draw_pt_vcache.c @@ -30,7 +30,7 @@ * Keith Whitwell */ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "draw/draw_context.h" #include "draw/draw_private.h" #include "draw/draw_pt.h" diff --git a/src/gallium/auxiliary/draw/draw_vbuf.h b/src/gallium/auxiliary/draw/draw_vbuf.h index e90f37872a..62247ccd9f 100644 --- a/src/gallium/auxiliary/draw/draw_vbuf.h +++ b/src/gallium/auxiliary/draw/draw_vbuf.h @@ -37,8 +37,6 @@ #define DRAW_VBUF_H_ -#include "pipe/p_util.h" - struct draw_context; struct vertex_info; diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c index f798b20492..34adbd49b0 100644 --- a/src/gallium/auxiliary/draw/draw_vs.c +++ b/src/gallium/auxiliary/draw/draw_vs.c @@ -31,11 +31,15 @@ * Brian Paul */ -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" + #include "pipe/p_shader_tokens.h" + #include "draw_private.h" #include "draw_context.h" #include "draw_vs.h" + #include "translate/translate.h" #include "translate/translate_cache.h" diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 41bdd012d5..760fcb389f 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -29,9 +29,9 @@ */ -#include "pipe/p_util.h" -#include "pipe/p_shader_tokens.h" +#include "util/u_memory.h" #include "util/u_math.h" +#include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "tgsi/tgsi_exec.h" diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c index eda677cc62..ab3c5b94a5 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c @@ -26,7 +26,7 @@ **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c index e029b7b4bb..b358bd2df4 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c @@ -29,8 +29,9 @@ #include "pipe/p_config.h" -#include "pipe/p_util.h" #include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "tgsi/tgsi_exec.h" diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c index e26903d8cc..44563803f9 100644 --- a/src/gallium/auxiliary/draw/draw_vs_exec.c +++ b/src/gallium/auxiliary/draw/draw_vs_exec.c @@ -31,7 +31,8 @@ * Brian Paul */ -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "pipe/p_shader_tokens.h" #include "draw_private.h" diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c index fc03473b91..2ce30b9a02 100644 --- a/src/gallium/auxiliary/draw/draw_vs_llvm.c +++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c @@ -32,7 +32,6 @@ * Brian Paul */ -#include "pipe/p_util.h" #include "pipe/p_shader_tokens.h" #include "draw_private.h" #include "draw_context.h" diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c index 61f0c084c3..0efabd9de8 100644 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ b/src/gallium/auxiliary/draw/draw_vs_sse.c @@ -31,13 +31,14 @@ * Brian Paul */ +#include "util/u_math.h" +#include "util/u_memory.h" #include "pipe/p_config.h" #include "draw_vs.h" #if defined(PIPE_ARCH_X86) -#include "pipe/p_util.h" #include "pipe/p_shader_tokens.h" #include "draw_private.h" diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c index 994ce3e889..4daf05dae7 100644 --- a/src/gallium/auxiliary/draw/draw_vs_varient.c +++ b/src/gallium/auxiliary/draw/draw_vs_varient.c @@ -30,7 +30,8 @@ * Keith Whitwell */ -#include "pipe/p_util.h" +#include "util/u_memory.h" +#include "util/u_math.h" #include "draw/draw_context.h" #include "draw/draw_private.h" #include "draw/draw_vbuf.h" diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp index cf5b978837..e64bfb1c6c 100644 --- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp +++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp @@ -41,11 +41,12 @@ #include "pipe/p_context.h" #include "pipe/p_shader_tokens.h" -#include "pipe/p_util.h" #include "tgsi/tgsi_exec.h" #include "tgsi/tgsi_dump.h" +#include "util/u_memory.h" + #include #include #include diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp index 035224e8f3..a82dc30306 100644 --- a/src/gallium/auxiliary/gallivm/instructions.cpp +++ b/src/gallium/auxiliary/gallivm/instructions.cpp @@ -35,7 +35,7 @@ #include "storage.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include #include diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp index 76049ade7c..efddc04e81 100644 --- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp +++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp @@ -29,7 +29,7 @@ #include "storagesoa.h" #include "pipe/p_shader_tokens.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include #include diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c index ce41418a0f..8ae052e875 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c +++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c @@ -45,7 +45,7 @@ #include "pipe/p_debug.h" #include "pipe/p_winsys.h" #include "pipe/p_thread.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "util/u_double_list.h" #include "pb_buffer.h" diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c index e90d2e5623..20fc87b39d 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c +++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c @@ -35,7 +35,7 @@ #include "pipe/p_debug.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pb_buffer.h" #include "pb_bufmgr.h" diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c index 0d2d6c0c1b..2afaeafa1a 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c @@ -35,7 +35,7 @@ #include "pipe/p_compiler.h" #include "pipe/p_debug.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pb_buffer.h" #include "pb_bufmgr.h" diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c index bed4bec4fe..b914c2d0fe 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c @@ -38,7 +38,7 @@ #include "pipe/p_debug.h" #include "pipe/p_winsys.h" #include "pipe/p_thread.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "util/u_double_list.h" #include "util/u_time.h" diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c index d02e3500ff..5e518370d0 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c @@ -37,7 +37,7 @@ #include "pipe/p_debug.h" #include "pipe/p_winsys.h" #include "pipe/p_thread.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "util/u_double_list.h" #include "util/u_time.h" diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c index 05efd8ce41..8fc63ce648 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c @@ -35,7 +35,7 @@ #include "pipe/p_debug.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pb_buffer.h" #include "pb_buffer_fenced.h" diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c index c51e582611..b40eb6cc90 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c @@ -36,7 +36,7 @@ #include "pipe/p_defines.h" #include "pipe/p_debug.h" #include "pipe/p_thread.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "util/u_double_list.h" #include "util/u_mm.h" #include "pb_buffer.h" diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c index 95af08929a..93d2cc9635 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c @@ -39,7 +39,7 @@ #include "pipe/p_debug.h" #include "pipe/p_thread.h" #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "util/u_double_list.h" #include "pb_buffer.h" diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c index 598d9ce310..af307e265a 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c @@ -39,7 +39,7 @@ #include "pipe/p_debug.h" #include "pipe/p_thread.h" #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "util/u_double_list.h" #include "util/u_time.h" diff --git a/src/gallium/auxiliary/pipebuffer/pb_validate.c b/src/gallium/auxiliary/pipebuffer/pb_validate.c index 362fd896f3..1e54fc39d4 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_validate.c +++ b/src/gallium/auxiliary/pipebuffer/pb_validate.c @@ -35,7 +35,7 @@ #include "pipe/p_compiler.h" #include "pipe/p_error.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_debug.h" #include "pb_buffer.h" diff --git a/src/gallium/auxiliary/pipebuffer/pb_winsys.c b/src/gallium/auxiliary/pipebuffer/pb_winsys.c index 978944091f..28d137dbc4 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_winsys.c +++ b/src/gallium/auxiliary/pipebuffer/pb_winsys.c @@ -35,7 +35,7 @@ #include "pipe/p_winsys.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pb_buffer.h" diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c index 300c1c2d9d..dfa5c35ab6 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c +++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c @@ -33,7 +33,7 @@ #include "pipe/p_compiler.h" #include "pipe/p_debug.h" #include "pipe/p_thread.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "rtasm_execmem.h" diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c index 7f6bf577b2..285ddc0e3f 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c @@ -30,7 +30,7 @@ */ #include "pipe/p_compiler.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "rtasm_ppc_spe.h" #ifdef GALLIUM_CELL diff --git a/src/gallium/auxiliary/sct/sct.c b/src/gallium/auxiliary/sct/sct.c index 5e4126e014..49bb7ea92e 100644 --- a/src/gallium/auxiliary/sct/sct.c +++ b/src/gallium/auxiliary/sct/sct.c @@ -26,7 +26,7 @@ **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_state.h" #include "pipe/p_inlines.h" #include "sct.h" diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c index 050b448fe7..74614d3688 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_build.c +++ b/src/gallium/auxiliary/tgsi/tgsi_build.c @@ -26,7 +26,6 @@ **************************************************************************/ #include "pipe/p_debug.h" -#include "pipe/p_util.h" #include "pipe/p_shader_tokens.h" #include "tgsi_build.h" #include "tgsi_parse.h" diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.h b/src/gallium/auxiliary/tgsi/tgsi_build.h index 6ae7f324f8..7d6234746a 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_build.h +++ b/src/gallium/auxiliary/tgsi/tgsi_build.h @@ -28,6 +28,10 @@ #ifndef TGSI_BUILD_H #define TGSI_BUILD_H + +struct tgsi_token; + + #if defined __cplusplus extern "C" { #endif diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump_c.c b/src/gallium/auxiliary/tgsi/tgsi_dump_c.c index 1025866a25..be25cb45a0 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_dump_c.c +++ b/src/gallium/auxiliary/tgsi/tgsi_dump_c.c @@ -26,7 +26,6 @@ **************************************************************************/ #include "pipe/p_debug.h" -#include "pipe/p_util.h" #include "util/u_string.h" #include "tgsi_dump_c.h" #include "tgsi_build.h" diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c index e28b56c842..fb573fe1f0 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c @@ -52,11 +52,11 @@ #include "pipe/p_compiler.h" #include "pipe/p_state.h" -#include "pipe/p_util.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "tgsi_exec.h" +#include "util/u_memory.h" #include "util/u_math.h" #define FAST_MATH 1 diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c index d16f0cdcad..3757486ba9 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_parse.c +++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c @@ -26,10 +26,10 @@ **************************************************************************/ #include "pipe/p_debug.h" -#include "pipe/p_util.h" #include "pipe/p_shader_tokens.h" #include "tgsi_parse.h" #include "tgsi_build.h" +#include "util/u_memory.h" void tgsi_full_token_init( diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c index 59bcf10b53..be4870a498 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.c +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c @@ -33,11 +33,11 @@ */ -#include "tgsi_scan.h" -#include "tgsi/tgsi_parse.h" +#include "util/u_math.h" #include "tgsi/tgsi_build.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_scan.h" -#include "pipe/p_util.h" diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c index 00ed4da450..626724ad4e 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c @@ -25,7 +25,7 @@ * **************************************************************************/ -#include "pipe/p_util.h" +#include "pipe/p_debug.h" #include "pipe/p_shader_tokens.h" #include "util/u_math.h" #include "tgsi/tgsi_parse.h" diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.c b/src/gallium/auxiliary/tgsi/tgsi_transform.c index 357f77b05a..ea87da31e5 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_transform.c +++ b/src/gallium/auxiliary/tgsi/tgsi_transform.c @@ -31,6 +31,7 @@ * Authors: Brian Paul */ +#include "pipe/p_debug.h" #include "tgsi_transform.h" diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.h b/src/gallium/auxiliary/tgsi/tgsi_transform.h index 3da0b38271..a121adbaef 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_transform.h +++ b/src/gallium/auxiliary/tgsi/tgsi_transform.h @@ -29,7 +29,6 @@ #define TGSI_TRANSFORM_H -#include "pipe/p_util.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_build.h" diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c index 09486e649e..50101a9bb0 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_util.c +++ b/src/gallium/auxiliary/tgsi/tgsi_util.c @@ -26,7 +26,6 @@ **************************************************************************/ #include "pipe/p_debug.h" -#include "pipe/p_util.h" #include "pipe/p_shader_tokens.h" #include "tgsi_parse.h" #include "tgsi_build.h" diff --git a/src/gallium/auxiliary/translate/translate.c b/src/gallium/auxiliary/translate/translate.c index b93fbf9033..7678903f75 100644 --- a/src/gallium/auxiliary/translate/translate.c +++ b/src/gallium/auxiliary/translate/translate.c @@ -31,7 +31,6 @@ */ #include "pipe/p_config.h" -#include "pipe/p_util.h" #include "pipe/p_state.h" #include "translate.h" diff --git a/src/gallium/auxiliary/translate/translate_cache.c b/src/gallium/auxiliary/translate/translate_cache.c index 115dc9287e..d8069a149c 100644 --- a/src/gallium/auxiliary/translate/translate_cache.c +++ b/src/gallium/auxiliary/translate/translate_cache.c @@ -25,7 +25,7 @@ * **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_state.h" #include "translate.h" #include "translate_cache.h" diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c index 4c8179ffa8..4d336f47ea 100644 --- a/src/gallium/auxiliary/translate/translate_generic.c +++ b/src/gallium/auxiliary/translate/translate_generic.c @@ -30,7 +30,7 @@ * Keith Whitwell */ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_state.h" #include "translate.h" diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c index 18a212ac1c..7955186e16 100644 --- a/src/gallium/auxiliary/translate/translate_sse.c +++ b/src/gallium/auxiliary/translate/translate_sse.c @@ -28,7 +28,7 @@ #include "pipe/p_config.h" #include "pipe/p_compiler.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "util/u_simple_list.h" #include "translate.h" diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile index 6eebf6d29b..6e5fd26c05 100644 --- a/src/gallium/auxiliary/util/Makefile +++ b/src/gallium/auxiliary/util/Makefile @@ -5,7 +5,6 @@ LIBNAME = util C_SOURCES = \ p_debug.c \ - p_tile.c \ u_blit.c \ u_draw_quad.c \ u_gen_mipmap.c \ @@ -16,6 +15,7 @@ C_SOURCES = \ u_rect.c \ u_simple_shaders.c \ u_snprintf.c \ + u_tile.c \ u_time.c include ../../Makefile.template diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript index 94382fe1f9..ce3fad7068 100644 --- a/src/gallium/auxiliary/util/SConscript +++ b/src/gallium/auxiliary/util/SConscript @@ -6,7 +6,6 @@ util = env.ConvenienceLibrary( 'p_debug.c', 'p_debug_mem.c', 'p_debug_prof.c', - 'p_tile.c', 'u_blit.c', 'u_draw_quad.c', 'u_gen_mipmap.c', @@ -17,6 +16,7 @@ util = env.ConvenienceLibrary( 'u_rect.c', 'u_simple_shaders.c', 'u_snprintf.c', + 'u_tile.c', 'u_time.c', ]) diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c index 2c2f2f8931..7d1dba5a24 100644 --- a/src/gallium/auxiliary/util/p_debug.c +++ b/src/gallium/auxiliary/util/p_debug.c @@ -51,7 +51,6 @@ #endif #include "pipe/p_compiler.h" -#include "pipe/p_util.h" #include "pipe/p_debug.h" #include "pipe/p_format.h" #include "pipe/p_state.h" diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c index ae087df4cf..05399f9885 100644 --- a/src/gallium/auxiliary/util/u_blit.c +++ b/src/gallium/auxiliary/util/u_blit.c @@ -37,12 +37,13 @@ #include "pipe/p_debug.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "pipe/p_util.h" #include "pipe/p_winsys.h" #include "pipe/p_shader_tokens.h" -#include "util/u_draw_quad.h" #include "util/u_blit.h" +#include "util/u_draw_quad.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "util/u_simple_shaders.h" #include "cso_cache/cso_context.h" diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c index 8713ff5d58..c1e2c19f87 100644 --- a/src/gallium/auxiliary/util/u_gen_mipmap.c +++ b/src/gallium/auxiliary/util/u_gen_mipmap.c @@ -37,10 +37,10 @@ #include "pipe/p_debug.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "pipe/p_util.h" #include "pipe/p_winsys.h" #include "pipe/p_shader_tokens.h" +#include "util/u_memory.h" #include "util/u_draw_quad.h" #include "util/u_gen_mipmap.h" #include "util/u_simple_shaders.h" diff --git a/src/gallium/auxiliary/util/u_handle_table.c b/src/gallium/auxiliary/util/u_handle_table.c index 2176a00959..2c40011923 100644 --- a/src/gallium/auxiliary/util/u_handle_table.c +++ b/src/gallium/auxiliary/util/u_handle_table.c @@ -35,9 +35,9 @@ #include "pipe/p_compiler.h" #include "pipe/p_debug.h" -#include "pipe/p_util.h" -#include "u_handle_table.h" +#include "util/u_memory.h" +#include "util/u_handle_table.h" #define HANDLE_TABLE_INITIAL_SIZE 16 diff --git a/src/gallium/auxiliary/util/u_hash_table.c b/src/gallium/auxiliary/util/u_hash_table.c index dd5eca7fca..0bc8de9632 100644 --- a/src/gallium/auxiliary/util/u_hash_table.c +++ b/src/gallium/auxiliary/util/u_hash_table.c @@ -40,10 +40,11 @@ #include "pipe/p_compiler.h" #include "pipe/p_debug.h" -#include "pipe/p_util.h" #include "cso_cache/cso_hash.h" -#include "u_hash_table.h" + +#include "util/u_memory.h" +#include "util/u_hash_table.h" struct hash_table diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h index a541d30a5d..9b4ca39371 100644 --- a/src/gallium/auxiliary/util/u_math.h +++ b/src/gallium/auxiliary/util/u_math.h @@ -40,8 +40,6 @@ #include "pipe/p_compiler.h" -#include "pipe/p_util.h" -#include "util/u_math.h" #ifdef __cplusplus @@ -49,6 +47,132 @@ extern "C" { #endif +#if defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) +__inline double ceil(double val) +{ + double ceil_val; + + if((val - (long) val) == 0) { + ceil_val = val; + } + else { + if(val > 0) { + ceil_val = (long) val + 1; + } + else { + ceil_val = (long) val; + } + } + + return ceil_val; +} + +#ifndef PIPE_SUBSYSTEM_WINDOWS_CE +__inline double floor(double val) +{ + double floor_val; + + if((val - (long) val) == 0) { + floor_val = val; + } + else { + if(val > 0) { + floor_val = (long) val; + } + else { + floor_val = (long) val - 1; + } + } + + return floor_val; +} +#endif + +#pragma function(pow) +__inline double __cdecl pow(double val, double exponent) +{ + /* XXX */ + assert(0); + return 0; +} + +#pragma function(log) +__inline double __cdecl log(double val) +{ + /* XXX */ + assert(0); + return 0; +} + +#pragma function(atan2) +__inline double __cdecl atan2(double val) +{ + /* XXX */ + assert(0); + return 0; +} +#else +#include +#include +#endif + + +#if defined(_MSC_VER) +#if _MSC_VER < 1400 && !defined(__cplusplus) || defined(PIPE_SUBSYSTEM_WINDOWS_CE) + +static INLINE float cosf( float f ) +{ + return (float) cos( (double) f ); +} + +static INLINE float sinf( float f ) +{ + return (float) sin( (double) f ); +} + +static INLINE float ceilf( float f ) +{ + return (float) ceil( (double) f ); +} + +static INLINE float floorf( float f ) +{ + return (float) floor( (double) f ); +} + +static INLINE float powf( float f, float g ) +{ + return (float) pow( (double) f, (double) g ); +} + +static INLINE float sqrtf( float f ) +{ + return (float) sqrt( (double) f ); +} + +static INLINE float fabsf( float f ) +{ + return (float) fabs( (double) f ); +} + +static INLINE float logf( float f ) +{ + return (float) log( (double) f ); +} + +#else +/* Work-around an extra semi-colon in VS 2005 logf definition */ +#ifdef logf +#undef logf +#define logf(x) ((float)log((double)(x))) +#endif /* logf */ +#endif +#endif /* _MSC_VER */ + + + + + #define POW2_TABLE_SIZE 256 #define POW2_TABLE_SCALE ((float) (POW2_TABLE_SIZE-1)) extern float pow2_table[POW2_TABLE_SIZE]; @@ -59,6 +183,11 @@ extern void util_init_math(void); +union fi { + float f; + int i; + unsigned ui; +}; /** @@ -195,6 +324,113 @@ util_iround(float f) +#if defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86) +/** + * Find first bit set in word. Least significant bit is 1. + * Return 0 if no bits set. + */ +static INLINE +unsigned ffs( unsigned u ) +{ + unsigned i; + + if( u == 0 ) { + return 0; + } + + __asm bsf eax, [u] + __asm inc eax + __asm mov [i], eax + + return i; +} +#endif + + +/** + * Return float bits. + */ +static INLINE unsigned +fui( float f ) +{ + union fi fi; + fi.f = f; + return fi.ui; +} + + + +static INLINE float +ubyte_to_float(ubyte ub) +{ + return (float) ub * (1.0f / 255.0f); +} + + +/** + * Convert float in [0,1] to ubyte in [0,255] with clamping. + */ +static INLINE ubyte +float_to_ubyte(float f) +{ + const int ieee_0996 = 0x3f7f0000; /* 0.996 or so */ + union fi tmp; + + tmp.f = f; + if (tmp.i < 0) { + return (ubyte) 0; + } + else if (tmp.i >= ieee_0996) { + return (ubyte) 255; + } + else { + tmp.f = tmp.f * (255.0f/256.0f) + 32768.0f; + return (ubyte) tmp.i; + } +} + + + +#define CLAMP( X, MIN, MAX ) ( (X)<(MIN) ? (MIN) : ((X)>(MAX) ? (MAX) : (X)) ) + +#define MIN2( A, B ) ( (A)<(B) ? (A) : (B) ) +#define MAX2( A, B ) ( (A)>(B) ? (A) : (B) ) + + +static INLINE int +align(int value, int alignment) +{ + return (value + alignment - 1) & ~(alignment - 1); +} + + +#ifndef COPY_4V +#define COPY_4V( DST, SRC ) \ +do { \ + (DST)[0] = (SRC)[0]; \ + (DST)[1] = (SRC)[1]; \ + (DST)[2] = (SRC)[2]; \ + (DST)[3] = (SRC)[3]; \ +} while (0) +#endif + + +#ifndef COPY_4FV +#define COPY_4FV( DST, SRC ) COPY_4V(DST, SRC) +#endif + + +#ifndef ASSIGN_4V +#define ASSIGN_4V( DST, V0, V1, V2, V3 ) \ +do { \ + (DST)[0] = (V0); \ + (DST)[1] = (V1); \ + (DST)[2] = (V2); \ + (DST)[3] = (V3); \ +} while (0) +#endif + + #ifdef __cplusplus } #endif diff --git a/src/gallium/auxiliary/util/u_memory.h b/src/gallium/auxiliary/util/u_memory.h new file mode 100644 index 0000000000..148a5cb997 --- /dev/null +++ b/src/gallium/auxiliary/util/u_memory.h @@ -0,0 +1,222 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * Memory functions + */ + + +#ifndef U_MEMORY_H +#define U_MEMORY_H + + +#include "util/u_pointer.h" + + + /* Define ENOMEM for WINCE */ +#if (_WIN32_WCE < 600) +#ifndef ENOMEM +#define ENOMEM 12 +#endif +#endif + + + +#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) && defined(DEBUG) + +/* memory debugging */ + +#include "p_debug.h" + +#define MALLOC( _size ) \ + debug_malloc( __FILE__, __LINE__, __FUNCTION__, _size ) +#define CALLOC( _count, _size ) \ + debug_calloc(__FILE__, __LINE__, __FUNCTION__, _count, _size ) +#define FREE( _ptr ) \ + debug_free( __FILE__, __LINE__, __FUNCTION__, _ptr ) +#define REALLOC( _ptr, _old_size, _size ) \ + debug_realloc( __FILE__, __LINE__, __FUNCTION__, _ptr, _old_size, _size ) + +#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) + +void * __stdcall +EngAllocMem( + unsigned long Flags, + unsigned long MemSize, + unsigned long Tag ); + +void __stdcall +EngFreeMem( + void *Mem ); + +#define MALLOC( _size ) EngAllocMem( 0, _size, 'D3AG' ) +#define _FREE( _ptr ) EngFreeMem( _ptr ) + +#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) + +void * +ExAllocatePool( + unsigned long PoolType, + size_t NumberOfBytes); + +void +ExFreePool(void *P); + +#define MALLOC(_size) ExAllocatePool(0, _size) +#define _FREE(_ptr) ExFreePool(_ptr) + +#else + +#define MALLOC( SIZE ) malloc( SIZE ) +#define CALLOC( COUNT, SIZE ) calloc( COUNT, SIZE ) +#define FREE( PTR ) free( PTR ) +#define REALLOC( OLDPTR, OLDSIZE, NEWSIZE ) realloc( OLDPTR, NEWSIZE ) + +#endif + + +#ifndef CALLOC +static INLINE void * +CALLOC( unsigned count, unsigned size ) +{ + void *ptr = MALLOC( count * size ); + if( ptr ) { + memset( ptr, 0, count * size ); + } + return ptr; +} +#endif /* !CALLOC */ + +#ifndef FREE +static INLINE void +FREE( void *ptr ) +{ + if( ptr ) { + _FREE( ptr ); + } +} +#endif /* !FREE */ + +#ifndef REALLOC +static INLINE void * +REALLOC( void *old_ptr, unsigned old_size, unsigned new_size ) +{ + void *new_ptr = NULL; + + if (new_size != 0) { + unsigned copy_size = old_size < new_size ? old_size : new_size; + new_ptr = MALLOC( new_size ); + if (new_ptr && old_ptr && copy_size) { + memcpy( new_ptr, old_ptr, copy_size ); + } + } + + FREE( old_ptr ); + return new_ptr; +} +#endif /* !REALLOC */ + + +#define MALLOC_STRUCT(T) (struct T *) MALLOC(sizeof(struct T)) + +#define CALLOC_STRUCT(T) (struct T *) CALLOC(1, sizeof(struct T)) + + +/** + * Return memory on given byte alignment + */ +static INLINE void * +align_malloc(size_t bytes, uint alignment) +{ +#if defined(HAVE_POSIX_MEMALIGN) + void *mem; + alignment = (alignment + (uint)sizeof(void*) - 1) & ~((uint)sizeof(void*) - 1); + if(posix_memalign(& mem, alignment, bytes) != 0) + return NULL; + return mem; +#else + char *ptr, *buf; + + assert( alignment > 0 ); + + ptr = (char *) MALLOC(bytes + alignment + sizeof(void *)); + if (!ptr) + return NULL; + + buf = (char *) align_pointer( ptr + sizeof(void *), alignment ); + *(char **)(buf - sizeof(void *)) = ptr; + + return buf; +#endif /* defined(HAVE_POSIX_MEMALIGN) */ +} + +/** + * Free memory returned by align_malloc(). + */ +static INLINE void +align_free(void *ptr) +{ +#if defined(HAVE_POSIX_MEMALIGN) + FREE(ptr); +#else + void **cubbyHole = (void **) ((char *) ptr - sizeof(void *)); + void *realAddr = *cubbyHole; + FREE(realAddr); +#endif /* defined(HAVE_POSIX_MEMALIGN) */ +} + + +/** + * Duplicate a block of memory. + */ +static INLINE void * +mem_dup(const void *src, uint size) +{ + void *dup = MALLOC(size); + if (dup) + memcpy(dup, src, size); + return dup; +} + + +/** + * Number of elements in an array. + */ +#ifndef Elements +#define Elements(x) (sizeof(x)/sizeof((x)[0])) +#endif + + +/** + * Offset of a field in a struct, in bytes. + */ +#define Offset(TYPE, MEMBER) ((unsigned)&(((TYPE *)NULL)->MEMBER)) + + + +#endif /* U_MEMORY_H */ diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c index b49ae074e0..0f51dd5977 100644 --- a/src/gallium/auxiliary/util/u_mm.c +++ b/src/gallium/auxiliary/util/u_mm.c @@ -24,9 +24,9 @@ #include "pipe/p_compiler.h" -#include "pipe/p_util.h" #include "pipe/p_debug.h" +#include "util/u_memory.h" #include "util/u_mm.h" diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h index 06abb34d5a..39e4ae9d07 100644 --- a/src/gallium/auxiliary/util/u_pack_color.h +++ b/src/gallium/auxiliary/util/u_pack_color.h @@ -37,6 +37,7 @@ #include "pipe/p_compiler.h" #include "pipe/p_format.h" +#include "util/u_math.h" /** @@ -150,10 +151,10 @@ util_pack_color(const float rgba[4], enum pipe_format format, void *dest) if (pf_size_x(format) <= 8) { /* format uses 8-bit components or less */ - UNCLAMPED_FLOAT_TO_UBYTE(r, rgba[0]); - UNCLAMPED_FLOAT_TO_UBYTE(g, rgba[1]); - UNCLAMPED_FLOAT_TO_UBYTE(b, rgba[2]); - UNCLAMPED_FLOAT_TO_UBYTE(a, rgba[3]); + r = float_to_ubyte(rgba[0]); + g = float_to_ubyte(rgba[1]); + b = float_to_ubyte(rgba[2]); + a = float_to_ubyte(rgba[3]); } switch (format) { @@ -286,4 +287,31 @@ util_pack_z(enum pipe_format format, double z) } +/** + * Pack 4 ubytes into a 4-byte word + */ +static INLINE unsigned +pack_ub4(ubyte b0, ubyte b1, ubyte b2, ubyte b3) +{ + return ((((unsigned int)b0) << 0) | + (((unsigned int)b1) << 8) | + (((unsigned int)b2) << 16) | + (((unsigned int)b3) << 24)); +} + + +/** + * Pack/convert 4 floats into one 4-byte word. + */ +static INLINE unsigned +pack_ui32_float4(float a, float b, float c, float d) +{ + return pack_ub4( float_to_ubyte(a), + float_to_ubyte(b), + float_to_ubyte(c), + float_to_ubyte(d) ); +} + + + #endif /* U_PACK_COLOR_H */ diff --git a/src/gallium/auxiliary/util/u_pointer.h b/src/gallium/auxiliary/util/u_pointer.h new file mode 100644 index 0000000000..e1af9f11cb --- /dev/null +++ b/src/gallium/auxiliary/util/u_pointer.h @@ -0,0 +1,107 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef U_POINTER_H +#define U_POINTER_H + +#include "pipe/p_compiler.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE intptr_t +pointer_to_intptr( const void *p ) +{ + union { + const void *p; + intptr_t i; + } pi; + pi.p = p; + return pi.i; +} + +static INLINE void * +intptr_to_pointer( intptr_t i ) +{ + union { + void *p; + intptr_t i; + } pi; + pi.i = i; + return pi.p; +} + +static INLINE uintptr_t +pointer_to_uintptr( const void *ptr ) +{ + union { + const void *p; + uintptr_t u; + } pu; + pu.p = ptr; + return pu.u; +} + +static INLINE void * +uintptr_to_pointer( uintptr_t u ) +{ + union { + void *p; + uintptr_t u; + } pu; + pu.u = u; + return pu.p; +} + +/** + * Return a pointer aligned to next multiple of N bytes. + */ +static INLINE void * +align_pointer( const void *unaligned, uintptr_t alignment ) +{ + uintptr_t aligned = (pointer_to_uintptr( unaligned ) + alignment - 1) & ~(alignment - 1); + return uintptr_to_pointer( aligned ); +} + + +/** + * Return a pointer aligned to next multiple of 16 bytes. + */ +static INLINE void * +align16( void *unaligned ) +{ + return align_pointer( unaligned, 16 ); +} + + + +#ifdef __cplusplus +} +#endif + +#endif /* U_POINTER_H */ diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c index 94e447b9d5..b31ab5415f 100644 --- a/src/gallium/auxiliary/util/u_rect.c +++ b/src/gallium/auxiliary/util/u_rect.c @@ -31,7 +31,6 @@ #include "pipe/p_defines.h" -#include "pipe/p_util.h" #include "pipe/p_format.h" #include "util/u_rect.h" diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c index c34fb6ee33..f06d13c2c4 100644 --- a/src/gallium/auxiliary/util/u_simple_shaders.c +++ b/src/gallium/auxiliary/util/u_simple_shaders.c @@ -37,10 +37,10 @@ #include "pipe/p_debug.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "pipe/p_util.h" #include "pipe/p_winsys.h" #include "pipe/p_shader_tokens.h" +#include "util/u_memory.h" #include "util/u_simple_shaders.h" #include "tgsi/tgsi_build.h" diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c new file mode 100644 index 0000000000..853c503f4f --- /dev/null +++ b/src/gallium/auxiliary/util/u_tile.c @@ -0,0 +1,1169 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * RGBA/float tile get/put functions. + * Usable both by drivers and state trackers. + * Surfaces should already be in a mapped state. + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_rect.h" +#include "util/u_tile.h" + + +/** + * Move raw block of pixels from surface to user memory. + * This should be usable by any hw driver that has mappable surfaces. + */ +void +pipe_get_tile_raw(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + void *dst, int dst_stride) +{ + const void *src; + + if (dst_stride == 0) + dst_stride = pf_get_nblocksx(&ps->block, w) * ps->block.size; + + if (pipe_clip_tile(x, y, &w, &h, ps)) + return; + + src = pipe_surface_map(ps, PIPE_BUFFER_USAGE_CPU_READ); + assert(src); + if(!src) + return; + + pipe_copy_rect(dst, &ps->block, dst_stride, 0, 0, w, h, src, ps->stride, x, y); + + pipe_surface_unmap(ps); +} + + +/** + * Move raw block of pixels from user memory to surface. + * This should be usable by any hw driver that has mappable surfaces. + */ +void +pipe_put_tile_raw(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + const void *src, int src_stride) +{ + void *dst; + + if (src_stride == 0) + src_stride = pf_get_nblocksx(&ps->block, w) * ps->block.size; + + if (pipe_clip_tile(x, y, &w, &h, ps)) + return; + + dst = pipe_surface_map(ps, PIPE_BUFFER_USAGE_CPU_WRITE); + assert(dst); + if(!dst) + return; + + pipe_copy_rect(dst, &ps->block, ps->stride, x, y, w, h, src, src_stride, 0, 0); + + pipe_surface_unmap(ps); +} + + + + +/** Convert short in [-32768,32767] to GLfloat in [-1.0,1.0] */ +#define SHORT_TO_FLOAT(S) ((2.0F * (S) + 1.0F) * (1.0F/65535.0F)) + +#define UNCLAMPED_FLOAT_TO_SHORT(us, f) \ + us = ( (short) ( CLAMP((f), -1.0, 1.0) * 32767.0F) ) + + + +/*** PIPE_FORMAT_A8R8G8B8_UNORM ***/ + +static void +a8r8g8b8_get_tile_rgba(const unsigned *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + const unsigned pixel = *src++; + pRow[0] = ubyte_to_float((pixel >> 16) & 0xff); + pRow[1] = ubyte_to_float((pixel >> 8) & 0xff); + pRow[2] = ubyte_to_float((pixel >> 0) & 0xff); + pRow[3] = ubyte_to_float((pixel >> 24) & 0xff); + } + p += dst_stride; + } +} + + +static void +a8r8g8b8_put_tile_rgba(unsigned *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned r, g, b, a; + r = float_to_ubyte(pRow[0]); + g = float_to_ubyte(pRow[1]); + b = float_to_ubyte(pRow[2]); + a = float_to_ubyte(pRow[3]); + *dst++ = (a << 24) | (r << 16) | (g << 8) | b; + } + p += src_stride; + } +} + + +/*** PIPE_FORMAT_A8R8G8B8_UNORM ***/ + +static void +x8r8g8b8_get_tile_rgba(const unsigned *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + const unsigned pixel = *src++; + pRow[0] = ubyte_to_float((pixel >> 16) & 0xff); + pRow[1] = ubyte_to_float((pixel >> 8) & 0xff); + pRow[2] = ubyte_to_float((pixel >> 0) & 0xff); + pRow[3] = ubyte_to_float(0xff); + } + p += dst_stride; + } +} + + +static void +x8r8g8b8_put_tile_rgba(unsigned *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned r, g, b; + r = float_to_ubyte(pRow[0]); + g = float_to_ubyte(pRow[1]); + b = float_to_ubyte(pRow[2]); + *dst++ = (0xff << 24) | (r << 16) | (g << 8) | b; + } + p += src_stride; + } +} + + +/*** PIPE_FORMAT_B8G8R8A8_UNORM ***/ + +static void +b8g8r8a8_get_tile_rgba(const unsigned *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + const unsigned pixel = *src++; + pRow[0] = ubyte_to_float((pixel >> 8) & 0xff); + pRow[1] = ubyte_to_float((pixel >> 16) & 0xff); + pRow[2] = ubyte_to_float((pixel >> 24) & 0xff); + pRow[3] = ubyte_to_float((pixel >> 0) & 0xff); + } + p += dst_stride; + } +} + + +static void +b8g8r8a8_put_tile_rgba(unsigned *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned r, g, b, a; + r = float_to_ubyte(pRow[0]); + g = float_to_ubyte(pRow[1]); + b = float_to_ubyte(pRow[2]); + a = float_to_ubyte(pRow[3]); + *dst++ = (b << 24) | (g << 16) | (r << 8) | a; + } + p += src_stride; + } +} + + +/*** PIPE_FORMAT_A1R5G5B5_UNORM ***/ + +static void +a1r5g5b5_get_tile_rgba(const ushort *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + const ushort pixel = *src++; + pRow[0] = ((pixel >> 10) & 0x1f) * (1.0f / 31.0f); + pRow[1] = ((pixel >> 5) & 0x1f) * (1.0f / 31.0f); + pRow[2] = ((pixel ) & 0x1f) * (1.0f / 31.0f); + pRow[3] = ((pixel >> 15) ) * 1.0f; + } + p += dst_stride; + } +} + + +static void +a1r5g5b5_put_tile_rgba(ushort *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned r, g, b, a; + r = float_to_ubyte(pRow[0]); + g = float_to_ubyte(pRow[1]); + b = float_to_ubyte(pRow[2]); + a = float_to_ubyte(pRow[3]); + r = r >> 3; /* 5 bits */ + g = g >> 3; /* 5 bits */ + b = b >> 3; /* 5 bits */ + a = a >> 7; /* 1 bit */ + *dst++ = (a << 15) | (r << 10) | (g << 5) | b; + } + p += src_stride; + } +} + + +/*** PIPE_FORMAT_A4R4G4B4_UNORM ***/ + +static void +a4r4g4b4_get_tile_rgba(const ushort *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + const ushort pixel = *src++; + pRow[0] = ((pixel >> 8) & 0xf) * (1.0f / 15.0f); + pRow[1] = ((pixel >> 4) & 0xf) * (1.0f / 15.0f); + pRow[2] = ((pixel ) & 0xf) * (1.0f / 15.0f); + pRow[3] = ((pixel >> 12) ) * (1.0f / 15.0f); + } + p += dst_stride; + } +} + + +static void +a4r4g4b4_put_tile_rgba(ushort *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned r, g, b, a; + r = float_to_ubyte(pRow[0]); + g = float_to_ubyte(pRow[1]); + b = float_to_ubyte(pRow[2]); + a = float_to_ubyte(pRow[3]); + r >>= 4; + g >>= 4; + b >>= 4; + a >>= 4; + *dst++ = (a << 12) | (r << 16) | (g << 4) | b; + } + p += src_stride; + } +} + + +/*** PIPE_FORMAT_R5G6B5_UNORM ***/ + +static void +r5g6b5_get_tile_rgba(const ushort *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + const ushort pixel = *src++; + pRow[0] = ((pixel >> 11) & 0x1f) * (1.0f / 31.0f); + pRow[1] = ((pixel >> 5) & 0x3f) * (1.0f / 63.0f); + pRow[2] = ((pixel ) & 0x1f) * (1.0f / 31.0f); + pRow[3] = 1.0f; + } + p += dst_stride; + } +} + + +static void +r5g6b5_put_tile_rgba(ushort *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + uint r = (uint) (CLAMP(pRow[0], 0.0, 1.0) * 31.0); + uint g = (uint) (CLAMP(pRow[1], 0.0, 1.0) * 63.0); + uint b = (uint) (CLAMP(pRow[2], 0.0, 1.0) * 31.0); + *dst++ = (r << 11) | (g << 5) | (b); + } + p += src_stride; + } +} + + + +/*** PIPE_FORMAT_Z16_UNORM ***/ + +/** + * Return each Z value as four floats in [0,1]. + */ +static void +z16_get_tile_rgba(const ushort *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + const float scale = 1.0f / 65535.0f; + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = *src++ * scale; + } + p += dst_stride; + } +} + + + + +/*** PIPE_FORMAT_L8_UNORM ***/ + +static void +l8_get_tile_rgba(const ubyte *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, src++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = ubyte_to_float(*src); + pRow[3] = 1.0; + } + p += dst_stride; + } +} + + +static void +l8_put_tile_rgba(ubyte *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned r; + r = float_to_ubyte(pRow[0]); + *dst++ = r; + } + p += src_stride; + } +} + + + +/*** PIPE_FORMAT_A8_UNORM ***/ + +static void +a8_get_tile_rgba(const ubyte *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, src++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = 0.0; + pRow[3] = ubyte_to_float(*src); + } + p += dst_stride; + } +} + + +static void +a8_put_tile_rgba(ubyte *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned a; + a = float_to_ubyte(pRow[3]); + *dst++ = a; + } + p += src_stride; + } +} + + + +/*** PIPE_FORMAT_R16_SNORM ***/ + +static void +r16_get_tile_rgba(const short *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, src++, pRow += 4) { + pRow[0] = SHORT_TO_FLOAT(src[0]); + pRow[1] = + pRow[2] = 0.0; + pRow[3] = 1.0; + } + p += dst_stride; + } +} + + +static void +r16_put_tile_rgba(short *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, dst++, pRow += 4) { + UNCLAMPED_FLOAT_TO_SHORT(dst[0], pRow[0]); + } + p += src_stride; + } +} + + +/*** PIPE_FORMAT_R16G16B16A16_SNORM ***/ + +static void +r16g16b16a16_get_tile_rgba(const short *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, src += 4, pRow += 4) { + pRow[0] = SHORT_TO_FLOAT(src[0]); + pRow[1] = SHORT_TO_FLOAT(src[1]); + pRow[2] = SHORT_TO_FLOAT(src[2]); + pRow[3] = SHORT_TO_FLOAT(src[3]); + } + p += dst_stride; + } +} + + +static void +r16g16b16a16_put_tile_rgba(short *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, dst += 4, pRow += 4) { + UNCLAMPED_FLOAT_TO_SHORT(dst[0], pRow[0]); + UNCLAMPED_FLOAT_TO_SHORT(dst[1], pRow[1]); + UNCLAMPED_FLOAT_TO_SHORT(dst[2], pRow[2]); + UNCLAMPED_FLOAT_TO_SHORT(dst[3], pRow[3]); + } + p += src_stride; + } +} + + + +/*** PIPE_FORMAT_I8_UNORM ***/ + +static void +i8_get_tile_rgba(const ubyte *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, src++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = ubyte_to_float(*src); + } + p += dst_stride; + } +} + + +static void +i8_put_tile_rgba(ubyte *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned r; + r = float_to_ubyte(pRow[0]); + *dst++ = r; + } + p += src_stride; + } +} + + +/*** PIPE_FORMAT_A8L8_UNORM ***/ + +static void +a8l8_get_tile_rgba(const ushort *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + ushort p = *src++; + pRow[0] = + pRow[1] = + pRow[2] = ubyte_to_float(p & 0xff); + pRow[3] = ubyte_to_float(p >> 8); + } + p += dst_stride; + } +} + + +static void +a8l8_put_tile_rgba(ushort *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned r, a; + r = float_to_ubyte(pRow[0]); + a = float_to_ubyte(pRow[3]); + *dst++ = (a << 8) | r; + } + p += src_stride; + } +} + + + + +/*** PIPE_FORMAT_Z32_UNORM ***/ + +/** + * Return each Z value as four floats in [0,1]. + */ +static void +z32_get_tile_rgba(const unsigned *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + const double scale = 1.0 / (double) 0xffffffff; + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = (float) (*src++ * scale); + } + p += dst_stride; + } +} + + +/*** PIPE_FORMAT_S8Z24_UNORM ***/ + +/** + * Return Z component as four float in [0,1]. Stencil part ignored. + */ +static void +s8z24_get_tile_rgba(const unsigned *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + const double scale = 1.0 / ((1 << 24) - 1); + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = (float) (scale * (*src++ & 0xffffff)); + } + p += dst_stride; + } +} + + +/*** PIPE_FORMAT_Z24S8_UNORM ***/ + +/** + * Return Z component as four float in [0,1]. Stencil part ignored. + */ +static void +z24s8_get_tile_rgba(const unsigned *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + const double scale = 1.0 / ((1 << 24) - 1); + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = (float) (scale * (*src++ >> 8)); + } + p += dst_stride; + } +} + + +/*** PIPE_FORMAT_YCBCR / PIPE_FORMAT_YCBCR_REV ***/ + +/** + * Convert YCbCr (or YCrCb) to RGBA. + */ +static void +ycbcr_get_tile_rgba(const ushort *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride, + boolean rev) +{ + const float scale = 1.0f / 255.0f; + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + /* do two texels at a time */ + for (j = 0; j < (w & ~1); j += 2, src += 2) { + const ushort t0 = src[0]; + const ushort t1 = src[1]; + const ubyte y0 = (t0 >> 8) & 0xff; /* luminance */ + const ubyte y1 = (t1 >> 8) & 0xff; /* luminance */ + ubyte cb, cr; + float r, g, b; + + if (rev) { + cb = t1 & 0xff; /* chroma U */ + cr = t0 & 0xff; /* chroma V */ + } + else { + cb = t0 & 0xff; /* chroma U */ + cr = t1 & 0xff; /* chroma V */ + } + + /* even pixel: y0,cr,cb */ + r = 1.164f * (y0-16) + 1.596f * (cr-128); + g = 1.164f * (y0-16) - 0.813f * (cr-128) - 0.391f * (cb-128); + b = 1.164f * (y0-16) + 2.018f * (cb-128); + pRow[0] = r * scale; + pRow[1] = g * scale; + pRow[2] = b * scale; + pRow[3] = 1.0f; + pRow += 4; + + /* odd pixel: use y1,cr,cb */ + r = 1.164f * (y1-16) + 1.596f * (cr-128); + g = 1.164f * (y1-16) - 0.813f * (cr-128) - 0.391f * (cb-128); + b = 1.164f * (y1-16) + 2.018f * (cb-128); + pRow[0] = r * scale; + pRow[1] = g * scale; + pRow[2] = b * scale; + pRow[3] = 1.0f; + pRow += 4; + + } + /* do the last texel */ + if (w & 1) { + const ushort t0 = src[0]; + const ushort t1 = src[1]; + const ubyte y0 = (t0 >> 8) & 0xff; /* luminance */ + ubyte cb, cr; + float r, g, b; + + if (rev) { + cb = t1 & 0xff; /* chroma U */ + cr = t0 & 0xff; /* chroma V */ + } + else { + cb = t0 & 0xff; /* chroma U */ + cr = t1 & 0xff; /* chroma V */ + } + + /* even pixel: y0,cr,cb */ + r = 1.164f * (y0-16) + 1.596f * (cr-128); + g = 1.164f * (y0-16) - 0.813f * (cr-128) - 0.391f * (cb-128); + b = 1.164f * (y0-16) + 2.018f * (cb-128); + pRow[0] = r * scale; + pRow[1] = g * scale; + pRow[2] = b * scale; + pRow[3] = 1.0f; + pRow += 4; + } + p += dst_stride; + } +} + + +void +pipe_tile_raw_to_rgba(enum pipe_format format, + void *src, + uint w, uint h, + float *dst, unsigned dst_stride) +{ + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + a8r8g8b8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_X8R8G8B8_UNORM: + x8r8g8b8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + b8g8r8a8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_A1R5G5B5_UNORM: + a1r5g5b5_get_tile_rgba((ushort *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_A4R4G4B4_UNORM: + a4r4g4b4_get_tile_rgba((ushort *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_R5G6B5_UNORM: + r5g6b5_get_tile_rgba((ushort *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_L8_UNORM: + l8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_A8_UNORM: + a8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_I8_UNORM: + i8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_A8L8_UNORM: + a8l8_get_tile_rgba((ushort *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_R16_SNORM: + r16_get_tile_rgba((short *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_R16G16B16A16_SNORM: + r16g16b16a16_get_tile_rgba((short *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_Z16_UNORM: + z16_get_tile_rgba((ushort *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_Z32_UNORM: + z32_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + s8z24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_Z24S8_UNORM: + z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_YCBCR: + ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, FALSE); + break; + case PIPE_FORMAT_YCBCR_REV: + ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, TRUE); + break; + default: + assert(0); + } +} + + +void +pipe_get_tile_rgba(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + float *p) +{ + unsigned dst_stride = w * 4; + void *packed; + + if (pipe_clip_tile(x, y, &w, &h, ps)) + return; + + packed = MALLOC(pf_get_nblocks(&ps->block, w, h) * ps->block.size); + + if (!packed) + return; + + if(ps->format == PIPE_FORMAT_YCBCR || ps->format == PIPE_FORMAT_YCBCR_REV) + assert((x & 1) == 0); + + pipe_get_tile_raw(ps, x, y, w, h, packed, 0); + + pipe_tile_raw_to_rgba(ps->format, packed, w, h, p, dst_stride); + + FREE(packed); +} + + +void +pipe_put_tile_rgba(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + const float *p) +{ + unsigned src_stride = w * 4; + void *packed; + + if (pipe_clip_tile(x, y, &w, &h, ps)) + return; + + packed = MALLOC(pf_get_nblocks(&ps->block, w, h) * ps->block.size); + + if (!packed) + return; + + switch (ps->format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + a8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_X8R8G8B8_UNORM: + x8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + b8g8r8a8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_A1R5G5B5_UNORM: + a1r5g5b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_R5G6B5_UNORM: + r5g6b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_R8G8B8A8_UNORM: + assert(0); + break; + case PIPE_FORMAT_A4R4G4B4_UNORM: + a4r4g4b4_put_tile_rgba((ushort *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_L8_UNORM: + l8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_A8_UNORM: + a8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_I8_UNORM: + i8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_A8L8_UNORM: + a8l8_put_tile_rgba((ushort *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_R16_SNORM: + r16_put_tile_rgba((short *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_R16G16B16A16_SNORM: + r16g16b16a16_put_tile_rgba((short *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_Z16_UNORM: + /*z16_put_tile_rgba((ushort *) packed, w, h, p, src_stride);*/ + break; + case PIPE_FORMAT_Z32_UNORM: + /*z32_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/ + break; + case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + /*s8z24_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/ + break; + case PIPE_FORMAT_Z24S8_UNORM: + /*z24s8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/ + break; + default: + assert(0); + } + + pipe_put_tile_raw(ps, x, y, w, h, packed, 0); + + FREE(packed); +} + + +/** + * Get a block of Z values, converted to 32-bit range. + */ +void +pipe_get_tile_z(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + uint *z) +{ + const uint dstStride = w; + ubyte *map; + uint *pDest = z; + uint i, j; + + if (pipe_clip_tile(x, y, &w, &h, ps)) + return; + + map = (ubyte *)pipe_surface_map(ps, PIPE_BUFFER_USAGE_CPU_READ); + if (!map) { + assert(0); + return; + } + + switch (ps->format) { + case PIPE_FORMAT_Z32_UNORM: + { + const uint *pSrc + = (const uint *)(map + y * ps->stride + x*4); + for (i = 0; i < h; i++) { + memcpy(pDest, pSrc, 4 * w); + pDest += dstStride; + pSrc += ps->stride/4; + } + } + break; + case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + { + const uint *pSrc + = (const uint *)(map + y * ps->stride + x*4); + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + /* convert 24-bit Z to 32-bit Z */ + pDest[j] = (pSrc[j] << 8) | (pSrc[j] & 0xff); + } + pDest += dstStride; + pSrc += ps->stride/4; + } + } + break; + case PIPE_FORMAT_Z16_UNORM: + { + const ushort *pSrc + = (const ushort *)(map + y * ps->stride + x*2); + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + /* convert 16-bit Z to 32-bit Z */ + pDest[j] = (pSrc[j] << 16) | pSrc[j]; + } + pDest += dstStride; + pSrc += ps->stride/2; + } + } + break; + default: + assert(0); + } + + pipe_surface_unmap(ps); +} + + +void +pipe_put_tile_z(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + const uint *zSrc) +{ + const uint srcStride = w; + const uint *pSrc = zSrc; + ubyte *map; + uint i, j; + + if (pipe_clip_tile(x, y, &w, &h, ps)) + return; + + map = (ubyte *)pipe_surface_map(ps, PIPE_BUFFER_USAGE_CPU_WRITE); + if (!map) { + assert(0); + return; + } + + switch (ps->format) { + case PIPE_FORMAT_Z32_UNORM: + { + uint *pDest = (uint *) (map + y * ps->stride + x*4); + for (i = 0; i < h; i++) { + memcpy(pDest, pSrc, 4 * w); + pDest += ps->stride/4; + pSrc += srcStride; + } + } + break; + case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + { + uint *pDest = (uint *) (map + y * ps->stride + x*4); + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + /* convert 32-bit Z to 24-bit Z (0 stencil) */ + pDest[j] = pSrc[j] >> 8; + } + pDest += ps->stride/4; + pSrc += srcStride; + } + } + break; + case PIPE_FORMAT_Z16_UNORM: + { + ushort *pDest = (ushort *) (map + y * ps->stride + x*2); + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + /* convert 32-bit Z to 16-bit Z */ + pDest[j] = pSrc[j] >> 16; + } + pDest += ps->stride/2; + pSrc += srcStride; + } + } + break; + default: + assert(0); + } + + pipe_surface_unmap(ps); +} + + diff --git a/src/gallium/auxiliary/util/u_tile.h b/src/gallium/auxiliary/util/u_tile.h new file mode 100644 index 0000000000..a8ac805308 --- /dev/null +++ b/src/gallium/auxiliary/util/u_tile.h @@ -0,0 +1,101 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef P_TILE_H +#define P_TILE_H + +#include "pipe/p_compiler.h" + +struct pipe_surface; + + +/** + * Clip tile against surface dims. + * \return TRUE if tile is totally clipped, FALSE otherwise + */ +static INLINE boolean +pipe_clip_tile(uint x, uint y, uint *w, uint *h, const struct pipe_surface *ps) +{ + if (x >= ps->width) + return TRUE; + if (y >= ps->height) + return TRUE; + if (x + *w > ps->width) + *w = ps->width - x; + if (y + *h > ps->height) + *h = ps->height - y; + return FALSE; +} + +#ifdef __cplusplus +extern "C" { +#endif + +void +pipe_get_tile_raw(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + void *p, int dst_stride); + +void +pipe_put_tile_raw(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + const void *p, int src_stride); + + +void +pipe_get_tile_rgba(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + float *p); + +void +pipe_put_tile_rgba(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + const float *p); + + +void +pipe_get_tile_z(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + uint *z); + +void +pipe_put_tile_z(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + const uint *z); + +void +pipe_tile_raw_to_rgba(enum pipe_format format, + void *src, + uint w, uint h, + float *dst, unsigned dst_stride); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index f430e88b9c..6bace0bb11 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -34,7 +34,6 @@ #define CELL_COMMON_H #include "pipe/p_compiler.h" -#include "pipe/p_util.h" #include "pipe/p_format.h" #include "pipe/p_state.h" diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c index 3ffe09add6..cee0917b63 100644 --- a/src/gallium/drivers/cell/ppu/cell_clear.c +++ b/src/gallium/drivers/cell/ppu/cell_clear.c @@ -34,7 +34,7 @@ #include #include #include "pipe/p_inlines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "cell/common.h" #include "cell_clear.h" #include "cell_context.h" diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c index 12eb5aa254..5af95a3c10 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.c +++ b/src/gallium/drivers/cell/ppu/cell_context.c @@ -35,7 +35,7 @@ #include "pipe/p_defines.h" #include "pipe/p_format.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_winsys.h" #include "pipe/p_screen.h" diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c index 67b87f16d7..971d65d09e 100644 --- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c +++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c @@ -30,7 +30,7 @@ * Brian Paul */ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_inlines.h" #include "draw/draw_context.h" #include "cell_context.h" diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c index b663b37622..dd25ae880e 100644 --- a/src/gallium/drivers/cell/ppu/cell_render.c +++ b/src/gallium/drivers/cell/ppu/cell_render.c @@ -33,7 +33,7 @@ #include "cell_context.h" #include "cell_render.h" #include "cell_spu.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "draw/draw_private.h" diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c index 2bf441a0c5..139b3719b6 100644 --- a/src/gallium/drivers/cell/ppu/cell_screen.c +++ b/src/gallium/drivers/cell/ppu/cell_screen.c @@ -26,7 +26,7 @@ **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_winsys.h" #include "pipe/p_defines.h" #include "pipe/p_screen.h" diff --git a/src/gallium/drivers/cell/ppu/cell_state_derived.c b/src/gallium/drivers/cell/ppu/cell_state_derived.c index 5480534ad9..8ab938a02a 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_derived.c +++ b/src/gallium/drivers/cell/ppu/cell_state_derived.c @@ -25,7 +25,7 @@ * **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_shader_tokens.h" #include "draw/draw_context.h" #include "draw/draw_vertex.h" diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 9cae67f091..3646a0ee4f 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -25,7 +25,7 @@ * **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "cell_context.h" #include "cell_state.h" #include "cell_state_emit.h" diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c index f5707f2bb8..cd96b317fa 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_shader.c +++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c @@ -26,7 +26,7 @@ **************************************************************************/ #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_inlines.h" #include "pipe/p_winsys.h" #include "draw/draw_context.h" diff --git a/src/gallium/drivers/cell/ppu/cell_surface.c b/src/gallium/drivers/cell/ppu/cell_surface.c index 01ffa31c2c..2d31ad89a6 100644 --- a/src/gallium/drivers/cell/ppu/cell_surface.c +++ b/src/gallium/drivers/cell/ppu/cell_surface.c @@ -26,7 +26,7 @@ **************************************************************************/ #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_inlines.h" #include "pipe/p_winsys.h" #include "util/p_tile.h" diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c index 533b64227d..1add81373d 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.c +++ b/src/gallium/drivers/cell/ppu/cell_texture.c @@ -33,7 +33,7 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_winsys.h" #include "cell_context.h" diff --git a/src/gallium/drivers/cell/ppu/cell_winsys.c b/src/gallium/drivers/cell/ppu/cell_winsys.c index ebabce3c8f..d570bbd2f9 100644 --- a/src/gallium/drivers/cell/ppu/cell_winsys.c +++ b/src/gallium/drivers/cell/ppu/cell_winsys.c @@ -26,7 +26,7 @@ **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "cell_winsys.h" diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c index 42e5022f30..89c61136a4 100644 --- a/src/gallium/drivers/cell/spu/spu_exec.c +++ b/src/gallium/drivers/cell/spu/spu_exec.c @@ -63,7 +63,6 @@ #include "pipe/p_compiler.h" #include "pipe/p_state.h" -#include "pipe/p_util.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index ab4ff8160a..8944ef171e 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -32,7 +32,6 @@ #include #include "pipe/p_compiler.h" #include "pipe/p_format.h" -#include "pipe/p_util.h" #include "spu_colorpack.h" #include "spu_main.h" #include "spu_texture.h" diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c index 74ab2bbd1f..dbcf4b0eb9 100644 --- a/src/gallium/drivers/cell/spu/spu_util.c +++ b/src/gallium/drivers/cell/spu/spu_util.c @@ -1,4 +1,3 @@ -#include "pipe/p_util.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" //#include "tgsi_build.h" diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c index 219fd90cc0..26f2363749 100644 --- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c +++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c @@ -32,7 +32,6 @@ * Ian Romanick */ -#include "pipe/p_util.h" #include "pipe/p_state.h" #include "pipe/p_shader_tokens.h" #include "spu_exec.h" diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c index 3119a78c06..a1e81975e6 100644 --- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c +++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c @@ -34,7 +34,6 @@ #include -#include "pipe/p_util.h" #include "pipe/p_state.h" #include "pipe/p_shader_tokens.h" #include "spu_vertex_shader.h" diff --git a/src/gallium/drivers/failover/fo_context.c b/src/gallium/drivers/failover/fo_context.c index 014a3e31d5..10c4ffc209 100644 --- a/src/gallium/drivers/failover/fo_context.c +++ b/src/gallium/drivers/failover/fo_context.c @@ -28,7 +28,7 @@ #include "pipe/p_defines.h" #include "pipe/p_winsys.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_context.h" #include "fo_context.h" diff --git a/src/gallium/drivers/i915simple/i915_context.c b/src/gallium/drivers/i915simple/i915_context.c index e2bf5ab678..c6776716a2 100644 --- a/src/gallium/drivers/i915simple/i915_context.c +++ b/src/gallium/drivers/i915simple/i915_context.c @@ -35,7 +35,7 @@ #include "draw/draw_context.h" #include "pipe/p_defines.h" #include "pipe/p_winsys.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_screen.h" diff --git a/src/gallium/drivers/i915simple/i915_debug_fp.c b/src/gallium/drivers/i915simple/i915_debug_fp.c index c024a051a5..48be3e1472 100644 --- a/src/gallium/drivers/i915simple/i915_debug_fp.c +++ b/src/gallium/drivers/i915simple/i915_debug_fp.c @@ -29,7 +29,7 @@ #include "i915_reg.h" #include "i915_debug.h" #include "pipe/p_winsys.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" static void diff --git a/src/gallium/drivers/i915simple/i915_fpc.h b/src/gallium/drivers/i915simple/i915_fpc.h index 80a9576304..2f0f99d046 100644 --- a/src/gallium/drivers/i915simple/i915_fpc.h +++ b/src/gallium/drivers/i915simple/i915_fpc.h @@ -29,7 +29,6 @@ #ifndef I915_FPC_H #define I915_FPC_H -#include "pipe/p_util.h" #include "i915_context.h" #include "i915_reg.h" diff --git a/src/gallium/drivers/i915simple/i915_fpc_translate.c b/src/gallium/drivers/i915simple/i915_fpc_translate.c index 64432982c4..34b4a846c1 100644 --- a/src/gallium/drivers/i915simple/i915_fpc_translate.c +++ b/src/gallium/drivers/i915simple/i915_fpc_translate.c @@ -33,6 +33,8 @@ #include "i915_fpc.h" #include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "util/u_string.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_dump.h" diff --git a/src/gallium/drivers/i915simple/i915_prim_emit.c b/src/gallium/drivers/i915simple/i915_prim_emit.c index 9ffa460138..d194c2fb15 100644 --- a/src/gallium/drivers/i915simple/i915_prim_emit.c +++ b/src/gallium/drivers/i915simple/i915_prim_emit.c @@ -27,7 +27,9 @@ #include "draw/draw_pipe.h" -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_pack_color.h" #include "i915_context.h" #include "i915_winsys.h" diff --git a/src/gallium/drivers/i915simple/i915_prim_vbuf.c b/src/gallium/drivers/i915simple/i915_prim_vbuf.c index aef3682bbf..e4ece55098 100644 --- a/src/gallium/drivers/i915simple/i915_prim_vbuf.c +++ b/src/gallium/drivers/i915simple/i915_prim_vbuf.c @@ -41,9 +41,10 @@ #include "draw/draw_context.h" #include "draw/draw_vbuf.h" #include "pipe/p_debug.h" -#include "pipe/p_util.h" #include "pipe/p_inlines.h" #include "pipe/p_winsys.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "i915_context.h" #include "i915_reg.h" diff --git a/src/gallium/drivers/i915simple/i915_screen.c b/src/gallium/drivers/i915simple/i915_screen.c index 0afa17bed8..e9e40c3f0b 100644 --- a/src/gallium/drivers/i915simple/i915_screen.c +++ b/src/gallium/drivers/i915simple/i915_screen.c @@ -26,7 +26,7 @@ **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_winsys.h" #include "util/u_string.h" diff --git a/src/gallium/drivers/i915simple/i915_state.c b/src/gallium/drivers/i915simple/i915_state.c index e8521b385e..d2487d8277 100644 --- a/src/gallium/drivers/i915simple/i915_state.c +++ b/src/gallium/drivers/i915simple/i915_state.c @@ -31,8 +31,9 @@ #include "draw/draw_context.h" #include "pipe/p_winsys.h" -#include "pipe/p_util.h" #include "pipe/p_inlines.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "tgsi/tgsi_parse.h" #include "i915_context.h" diff --git a/src/gallium/drivers/i915simple/i915_state_derived.c b/src/gallium/drivers/i915simple/i915_state_derived.c index 4daccec6e0..488615067c 100644 --- a/src/gallium/drivers/i915simple/i915_state_derived.c +++ b/src/gallium/drivers/i915simple/i915_state_derived.c @@ -26,7 +26,7 @@ **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_shader_tokens.h" #include "draw/draw_context.h" #include "draw/draw_vertex.h" diff --git a/src/gallium/drivers/i915simple/i915_state_dynamic.c b/src/gallium/drivers/i915simple/i915_state_dynamic.c index 8cfbdddd19..86126a5a15 100644 --- a/src/gallium/drivers/i915simple/i915_state_dynamic.c +++ b/src/gallium/drivers/i915simple/i915_state_dynamic.c @@ -30,7 +30,9 @@ #include "i915_context.h" #include "i915_reg.h" #include "i915_state.h" -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_pack_color.h" #define FILE_DEBUG_FLAG DEBUG_STATE diff --git a/src/gallium/drivers/i915simple/i915_state_immediate.c b/src/gallium/drivers/i915simple/i915_state_immediate.c index 2501f2d7cb..8c16bb4e27 100644 --- a/src/gallium/drivers/i915simple/i915_state_immediate.c +++ b/src/gallium/drivers/i915simple/i915_state_immediate.c @@ -33,7 +33,7 @@ #include "i915_context.h" #include "i915_state.h" #include "i915_reg.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" /* All state expressable with the LOAD_STATE_IMMEDIATE_1 packet. diff --git a/src/gallium/drivers/i915simple/i915_state_sampler.c b/src/gallium/drivers/i915simple/i915_state_sampler.c index 7868f21ca6..c09c10601b 100644 --- a/src/gallium/drivers/i915simple/i915_state_sampler.c +++ b/src/gallium/drivers/i915simple/i915_state_sampler.c @@ -27,7 +27,7 @@ #include "pipe/p_context.h" #include "pipe/p_state.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "i915_state_inlines.h" #include "i915_context.h" diff --git a/src/gallium/drivers/i915simple/i915_surface.c b/src/gallium/drivers/i915simple/i915_surface.c index 17b5125e56..62f1926644 100644 --- a/src/gallium/drivers/i915simple/i915_surface.c +++ b/src/gallium/drivers/i915simple/i915_surface.c @@ -30,10 +30,9 @@ #include "i915_state.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "pipe/p_util.h" #include "pipe/p_inlines.h" #include "pipe/p_winsys.h" -#include "util/p_tile.h" +#include "util/u_tile.h" #include "util/u_rect.h" diff --git a/src/gallium/drivers/i915simple/i915_texture.c b/src/gallium/drivers/i915simple/i915_texture.c index ca0fb8761b..32344da4d5 100644 --- a/src/gallium/drivers/i915simple/i915_texture.c +++ b/src/gallium/drivers/i915simple/i915_texture.c @@ -34,8 +34,9 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "pipe/p_util.h" #include "pipe/p_winsys.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "i915_context.h" #include "i915_texture.h" diff --git a/src/gallium/drivers/i965simple/brw_cc.c b/src/gallium/drivers/i965simple/brw_cc.c index 337e4f95f6..79d4150383 100644 --- a/src/gallium/drivers/i965simple/brw_cc.c +++ b/src/gallium/drivers/i965simple/brw_cc.c @@ -29,7 +29,8 @@ * Keith Whitwell */ -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "brw_context.h" #include "brw_state.h" @@ -232,8 +233,7 @@ static void upload_cc_unit( struct brw_context *brw ) cc.cc3.alpha_test_func = brw_translate_compare_func(brw->attribs.DepthStencil->alpha.func); - UNCLAMPED_FLOAT_TO_UBYTE(cc.cc7.alpha_ref.ub[0], - brw->attribs.DepthStencil->alpha.ref); + cc.cc7.alpha_ref.ub[0] = float_to_ubyte(brw->attribs.DepthStencil->alpha.ref); cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8; } diff --git a/src/gallium/drivers/i965simple/brw_clip_state.c b/src/gallium/drivers/i965simple/brw_clip_state.c index ea5c05a279..8e78dd51be 100644 --- a/src/gallium/drivers/i965simple/brw_clip_state.c +++ b/src/gallium/drivers/i965simple/brw_clip_state.c @@ -32,7 +32,8 @@ #include "brw_context.h" #include "brw_state.h" #include "brw_defines.h" -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" static void upload_clip_unit( struct brw_context *brw ) diff --git a/src/gallium/drivers/i965simple/brw_context.c b/src/gallium/drivers/i965simple/brw_context.c index 8326f7b9c4..96920df008 100644 --- a/src/gallium/drivers/i965simple/brw_context.c +++ b/src/gallium/drivers/i965simple/brw_context.c @@ -39,7 +39,7 @@ #include "pipe/p_winsys.h" #include "pipe/p_context.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_screen.h" diff --git a/src/gallium/drivers/i965simple/brw_curbe.c b/src/gallium/drivers/i965simple/brw_curbe.c index 52bbd525c1..824ee7fd6d 100644 --- a/src/gallium/drivers/i965simple/brw_curbe.c +++ b/src/gallium/drivers/i965simple/brw_curbe.c @@ -39,7 +39,8 @@ #include "brw_wm.h" #include "pipe/p_state.h" #include "pipe/p_winsys.h" -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #define FILE_DEBUG_FLAG DEBUG_FALLBACKS diff --git a/src/gallium/drivers/i965simple/brw_draw_upload.c b/src/gallium/drivers/i965simple/brw_draw_upload.c index 9c0c78c236..7c20ea52af 100644 --- a/src/gallium/drivers/i965simple/brw_draw_upload.c +++ b/src/gallium/drivers/i965simple/brw_draw_upload.c @@ -33,6 +33,7 @@ #include "brw_context.h" #include "brw_state.h" + struct brw_array_state { union header_union header; diff --git a/src/gallium/drivers/i965simple/brw_gs_state.c b/src/gallium/drivers/i965simple/brw_gs_state.c index 3932e9e939..5b8016b2e9 100644 --- a/src/gallium/drivers/i965simple/brw_gs_state.c +++ b/src/gallium/drivers/i965simple/brw_gs_state.c @@ -34,7 +34,8 @@ #include "brw_context.h" #include "brw_state.h" #include "brw_defines.h" -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" diff --git a/src/gallium/drivers/i965simple/brw_screen.c b/src/gallium/drivers/i965simple/brw_screen.c index fadfbf94ab..ab7cd624b2 100644 --- a/src/gallium/drivers/i965simple/brw_screen.c +++ b/src/gallium/drivers/i965simple/brw_screen.c @@ -26,7 +26,7 @@ **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_winsys.h" #include "util/u_string.h" diff --git a/src/gallium/drivers/i965simple/brw_sf_state.c b/src/gallium/drivers/i965simple/brw_sf_state.c index 9acd3ea61b..2a5de61c21 100644 --- a/src/gallium/drivers/i965simple/brw_sf_state.c +++ b/src/gallium/drivers/i965simple/brw_sf_state.c @@ -30,11 +30,12 @@ */ - #include "brw_context.h" #include "brw_state.h" #include "brw_defines.h" -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" + static void upload_sf_vp(struct brw_context *brw) { diff --git a/src/gallium/drivers/i965simple/brw_shader_info.c b/src/gallium/drivers/i965simple/brw_shader_info.c index 30f37a99d4..86d877d7ef 100644 --- a/src/gallium/drivers/i965simple/brw_shader_info.c +++ b/src/gallium/drivers/i965simple/brw_shader_info.c @@ -1,7 +1,7 @@ #include "brw_context.h" #include "brw_state.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" diff --git a/src/gallium/drivers/i965simple/brw_state.c b/src/gallium/drivers/i965simple/brw_state.c index 27ca32843d..af46cb546f 100644 --- a/src/gallium/drivers/i965simple/brw_state.c +++ b/src/gallium/drivers/i965simple/brw_state.c @@ -31,7 +31,7 @@ #include "pipe/p_winsys.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_inlines.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_dump.h" diff --git a/src/gallium/drivers/i965simple/brw_state_batch.c b/src/gallium/drivers/i965simple/brw_state_batch.c index 35db76b594..43a1c89fc4 100644 --- a/src/gallium/drivers/i965simple/brw_state_batch.c +++ b/src/gallium/drivers/i965simple/brw_state_batch.c @@ -32,7 +32,7 @@ #include "brw_state.h" #include "brw_winsys.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" /* A facility similar to the data caching code above, which aims to * prevent identical commands being issued repeatedly. diff --git a/src/gallium/drivers/i965simple/brw_state_cache.c b/src/gallium/drivers/i965simple/brw_state_cache.c index b3a5124461..094248fa69 100644 --- a/src/gallium/drivers/i965simple/brw_state_cache.c +++ b/src/gallium/drivers/i965simple/brw_state_cache.c @@ -38,7 +38,7 @@ #include "brw_sf.h" #include "brw_gs.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" diff --git a/src/gallium/drivers/i965simple/brw_state_pool.c b/src/gallium/drivers/i965simple/brw_state_pool.c index f3174bfe0a..78d4c0e411 100644 --- a/src/gallium/drivers/i965simple/brw_state_pool.c +++ b/src/gallium/drivers/i965simple/brw_state_pool.c @@ -43,7 +43,8 @@ */ #include "pipe/p_winsys.h" -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "pipe/p_inlines.h" #include "brw_context.h" #include "brw_state.h" diff --git a/src/gallium/drivers/i965simple/brw_state_upload.c b/src/gallium/drivers/i965simple/brw_state_upload.c index e727601e1e..bac9161b5f 100644 --- a/src/gallium/drivers/i965simple/brw_state_upload.c +++ b/src/gallium/drivers/i965simple/brw_state_upload.c @@ -33,7 +33,7 @@ #include "brw_context.h" #include "brw_state.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" /* This is used to initialize brw->state.atoms[]. We could use this * list directly except for a single atom, brw_constant_buffer, which diff --git a/src/gallium/drivers/i965simple/brw_surface.c b/src/gallium/drivers/i965simple/brw_surface.c index 69da252285..b89756c47b 100644 --- a/src/gallium/drivers/i965simple/brw_surface.c +++ b/src/gallium/drivers/i965simple/brw_surface.c @@ -29,10 +29,9 @@ #include "brw_context.h" #include "brw_state.h" #include "pipe/p_defines.h" -#include "pipe/p_util.h" #include "pipe/p_inlines.h" #include "pipe/p_winsys.h" -#include "util/p_tile.h" +#include "util/u_tile.h" #include "util/u_rect.h" diff --git a/src/gallium/drivers/i965simple/brw_tex_layout.c b/src/gallium/drivers/i965simple/brw_tex_layout.c index 9b6cf81723..05eda9d1f2 100644 --- a/src/gallium/drivers/i965simple/brw_tex_layout.c +++ b/src/gallium/drivers/i965simple/brw_tex_layout.c @@ -33,16 +33,16 @@ /* Code to layout images in a mipmap tree for i965. */ -#include "brw_tex_layout.h" - #include "pipe/p_state.h" #include "pipe/p_context.h" #include "pipe/p_defines.h" -#include "pipe/p_util.h" #include "pipe/p_inlines.h" #include "pipe/p_winsys.h" - +#include "util/u_math.h" +#include "util/u_memory.h" #include "brw_context.h" +#include "brw_tex_layout.h" + #define FILE_DEBUG_FLAG DEBUG_TEXTURE diff --git a/src/gallium/drivers/i965simple/brw_vs_state.c b/src/gallium/drivers/i965simple/brw_vs_state.c index c73469929c..1eaff87892 100644 --- a/src/gallium/drivers/i965simple/brw_vs_state.c +++ b/src/gallium/drivers/i965simple/brw_vs_state.c @@ -34,7 +34,8 @@ #include "brw_state.h" #include "brw_defines.h" -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" static void upload_vs_unit( struct brw_context *brw ) { diff --git a/src/gallium/drivers/i965simple/brw_wm.c b/src/gallium/drivers/i965simple/brw_wm.c index 7fc5f59a98..8de565b96c 100644 --- a/src/gallium/drivers/i965simple/brw_wm.c +++ b/src/gallium/drivers/i965simple/brw_wm.c @@ -35,7 +35,7 @@ #include "brw_wm.h" #include "brw_eu.h" #include "brw_state.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" diff --git a/src/gallium/drivers/i965simple/brw_wm_decl.c b/src/gallium/drivers/i965simple/brw_wm_decl.c index e6f1a44817..d50e66f613 100644 --- a/src/gallium/drivers/i965simple/brw_wm_decl.c +++ b/src/gallium/drivers/i965simple/brw_wm_decl.c @@ -2,7 +2,8 @@ #include "brw_context.h" #include "brw_eu.h" #include "brw_wm.h" -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" diff --git a/src/gallium/drivers/i965simple/brw_wm_glsl.c b/src/gallium/drivers/i965simple/brw_wm_glsl.c index 6a4a5aef09..ab6410aa60 100644 --- a/src/gallium/drivers/i965simple/brw_wm_glsl.c +++ b/src/gallium/drivers/i965simple/brw_wm_glsl.c @@ -2,7 +2,8 @@ #include "brw_context.h" #include "brw_eu.h" #include "brw_wm.h" -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" diff --git a/src/gallium/drivers/i965simple/brw_wm_sampler_state.c b/src/gallium/drivers/i965simple/brw_wm_sampler_state.c index b9eaee56ee..52b2909a65 100644 --- a/src/gallium/drivers/i965simple/brw_wm_sampler_state.c +++ b/src/gallium/drivers/i965simple/brw_wm_sampler_state.c @@ -34,7 +34,8 @@ #include "brw_state.h" #include "brw_defines.h" -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #define COMPAREFUNC_ALWAYS 0 diff --git a/src/gallium/drivers/i965simple/brw_wm_state.c b/src/gallium/drivers/i965simple/brw_wm_state.c index f3aa36b07f..37a9bf919c 100644 --- a/src/gallium/drivers/i965simple/brw_wm_state.c +++ b/src/gallium/drivers/i965simple/brw_wm_state.c @@ -34,7 +34,8 @@ #include "brw_state.h" #include "brw_defines.h" #include "brw_wm.h" -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" /*********************************************************************** * WM unit - fragment programs and rasterization diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c index 9b1313bc83..dda90f760a 100644 --- a/src/gallium/drivers/softpipe/sp_context.c +++ b/src/gallium/drivers/softpipe/sp_context.c @@ -32,8 +32,8 @@ #include "draw/draw_context.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "pipe/p_util.h" #include "util/u_math.h" +#include "util/u_memory.h" #include "sp_clear.h" #include "sp_context.h" #include "sp_flush.h" diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c index cc171bbc39..d0456731be 100644 --- a/src/gallium/drivers/softpipe/sp_fs_exec.c +++ b/src/gallium/drivers/softpipe/sp_fs_exec.c @@ -34,7 +34,7 @@ #include "pipe/p_state.h" #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_inlines.h" #include "tgsi/tgsi_exec.h" #include "tgsi/tgsi_parse.h" diff --git a/src/gallium/drivers/softpipe/sp_fs_llvm.c b/src/gallium/drivers/softpipe/sp_fs_llvm.c index 20226da78c..34adac5226 100644 --- a/src/gallium/drivers/softpipe/sp_fs_llvm.c +++ b/src/gallium/drivers/softpipe/sp_fs_llvm.c @@ -36,7 +36,7 @@ #include "pipe/p_state.h" #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_inlines.h" #include "tgsi/tgsi_sse2.h" diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c index 8b7da7c747..35653a8e48 100644 --- a/src/gallium/drivers/softpipe/sp_fs_sse.c +++ b/src/gallium/drivers/softpipe/sp_fs_sse.c @@ -34,7 +34,7 @@ #include "pipe/p_state.h" #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_inlines.h" #include "tgsi/tgsi_exec.h" #include "tgsi/tgsi_sse2.h" diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.c b/src/gallium/drivers/softpipe/sp_prim_setup.c index 941ab62e00..038ff04d4f 100644 --- a/src/gallium/drivers/softpipe/sp_prim_setup.c +++ b/src/gallium/drivers/softpipe/sp_prim_setup.c @@ -41,7 +41,7 @@ #include "sp_prim_setup.h" #include "draw/draw_pipe.h" #include "draw/draw_vertex.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" /** * Triangle setup info (derived from draw_stage). diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c index e9fae951e0..425e13cd28 100644 --- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c +++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c @@ -43,6 +43,7 @@ #include "sp_setup.h" #include "draw/draw_context.h" #include "draw/draw_vbuf.h" +#include "util/u_memory.h" #define SP_MAX_VBUF_INDEXES 1024 diff --git a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c index 7a42b08ef5..7d3580fb4f 100644 --- a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c +++ b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c @@ -7,7 +7,7 @@ #include "sp_headers.h" #include "sp_quad.h" #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" static void diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c index 74c6bff84a..a834accb86 100644 --- a/src/gallium/drivers/softpipe/sp_quad_blend.c +++ b/src/gallium/drivers/softpipe/sp_quad_blend.c @@ -31,7 +31,8 @@ */ #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "sp_context.h" #include "sp_headers.h" #include "sp_surface.h" @@ -128,15 +129,15 @@ logicop_quad(struct quad_stage *qs, struct quad_header *quad) /* convert to ubyte */ for (j = 0; j < 4; j++) { /* loop over R,G,B,A channels */ - UNCLAMPED_FLOAT_TO_UBYTE(dst[j][0], dest[j][0]); /* P0 */ - UNCLAMPED_FLOAT_TO_UBYTE(dst[j][1], dest[j][1]); /* P1 */ - UNCLAMPED_FLOAT_TO_UBYTE(dst[j][2], dest[j][2]); /* P2 */ - UNCLAMPED_FLOAT_TO_UBYTE(dst[j][3], dest[j][3]); /* P3 */ - - UNCLAMPED_FLOAT_TO_UBYTE(src[j][0], quadColor[j][0]); /* P0 */ - UNCLAMPED_FLOAT_TO_UBYTE(src[j][1], quadColor[j][1]); /* P1 */ - UNCLAMPED_FLOAT_TO_UBYTE(src[j][2], quadColor[j][2]); /* P2 */ - UNCLAMPED_FLOAT_TO_UBYTE(src[j][3], quadColor[j][3]); /* P3 */ + dst[j][0] = float_to_ubyte(dest[j][0]); /* P0 */ + dst[j][1] = float_to_ubyte(dest[j][1]); /* P1 */ + dst[j][2] = float_to_ubyte(dest[j][2]); /* P2 */ + dst[j][3] = float_to_ubyte(dest[j][3]); /* P3 */ + + src[j][0] = float_to_ubyte(quadColor[j][0]); /* P0 */ + src[j][1] = float_to_ubyte(quadColor[j][1]); /* P1 */ + src[j][2] = float_to_ubyte(quadColor[j][2]); /* P2 */ + src[j][3] = float_to_ubyte(quadColor[j][3]); /* P3 */ } switch (softpipe->blend->logicop_func) { @@ -209,10 +210,10 @@ logicop_quad(struct quad_stage *qs, struct quad_header *quad) } for (j = 0; j < 4; j++) { - quadColor[j][0] = UBYTE_TO_FLOAT(res[j][0]); - quadColor[j][1] = UBYTE_TO_FLOAT(res[j][1]); - quadColor[j][2] = UBYTE_TO_FLOAT(res[j][2]); - quadColor[j][3] = UBYTE_TO_FLOAT(res[j][3]); + quadColor[j][0] = ubyte_to_float(res[j][0]); + quadColor[j][1] = ubyte_to_float(res[j][1]); + quadColor[j][2] = ubyte_to_float(res[j][2]); + quadColor[j][3] = ubyte_to_float(res[j][3]); } } diff --git a/src/gallium/drivers/softpipe/sp_quad_bufloop.c b/src/gallium/drivers/softpipe/sp_quad_bufloop.c index b3db428ef1..92e9af09c1 100644 --- a/src/gallium/drivers/softpipe/sp_quad_bufloop.c +++ b/src/gallium/drivers/softpipe/sp_quad_bufloop.c @@ -1,5 +1,5 @@ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "sp_context.h" #include "sp_headers.h" #include "sp_surface.h" diff --git a/src/gallium/drivers/softpipe/sp_quad_colormask.c b/src/gallium/drivers/softpipe/sp_quad_colormask.c index 7fe080990b..f72f31db97 100644 --- a/src/gallium/drivers/softpipe/sp_quad_colormask.c +++ b/src/gallium/drivers/softpipe/sp_quad_colormask.c @@ -31,7 +31,8 @@ */ #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "sp_context.h" #include "sp_headers.h" #include "sp_surface.h" diff --git a/src/gallium/drivers/softpipe/sp_quad_coverage.c b/src/gallium/drivers/softpipe/sp_quad_coverage.c index dd5ebb2296..ad907ec25f 100644 --- a/src/gallium/drivers/softpipe/sp_quad_coverage.c +++ b/src/gallium/drivers/softpipe/sp_quad_coverage.c @@ -33,7 +33,7 @@ #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "sp_context.h" #include "sp_headers.h" #include "sp_quad.h" diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c index 0c82692c6e..227cb2014e 100644 --- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c +++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c @@ -30,7 +30,7 @@ */ #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "sp_context.h" #include "sp_headers.h" #include "sp_surface.h" diff --git a/src/gallium/drivers/softpipe/sp_quad_earlyz.c b/src/gallium/drivers/softpipe/sp_quad_earlyz.c index 22ea99049f..5a66a86699 100644 --- a/src/gallium/drivers/softpipe/sp_quad_earlyz.c +++ b/src/gallium/drivers/softpipe/sp_quad_earlyz.c @@ -30,7 +30,7 @@ */ #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "sp_headers.h" #include "sp_quad.h" diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c index 8c88c192f8..5499ba5361 100644 --- a/src/gallium/drivers/softpipe/sp_quad_fs.c +++ b/src/gallium/drivers/softpipe/sp_quad_fs.c @@ -35,7 +35,8 @@ * all the enabled attributes run contiguously. */ -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "pipe/p_defines.h" #include "pipe/p_shader_tokens.h" diff --git a/src/gallium/drivers/softpipe/sp_quad_occlusion.c b/src/gallium/drivers/softpipe/sp_quad_occlusion.c index 54254df1f1..db13e73ae3 100644 --- a/src/gallium/drivers/softpipe/sp_quad_occlusion.c +++ b/src/gallium/drivers/softpipe/sp_quad_occlusion.c @@ -33,7 +33,7 @@ #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "sp_context.h" #include "sp_headers.h" #include "sp_surface.h" diff --git a/src/gallium/drivers/softpipe/sp_quad_output.c b/src/gallium/drivers/softpipe/sp_quad_output.c index 40083138a4..b64646a449 100644 --- a/src/gallium/drivers/softpipe/sp_quad_output.c +++ b/src/gallium/drivers/softpipe/sp_quad_output.c @@ -25,7 +25,7 @@ * **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "sp_context.h" #include "sp_headers.h" #include "sp_surface.h" diff --git a/src/gallium/drivers/softpipe/sp_quad_stencil.c b/src/gallium/drivers/softpipe/sp_quad_stencil.c index b4c7e942fa..ce9562e07c 100644 --- a/src/gallium/drivers/softpipe/sp_quad_stencil.c +++ b/src/gallium/drivers/softpipe/sp_quad_stencil.c @@ -10,7 +10,7 @@ #include "sp_tile_cache.h" #include "sp_quad.h" #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" /** Only 8-bit stencil supported */ diff --git a/src/gallium/drivers/softpipe/sp_quad_stipple.c b/src/gallium/drivers/softpipe/sp_quad_stipple.c index f1e9b80e09..a39ecc2e9d 100644 --- a/src/gallium/drivers/softpipe/sp_quad_stipple.c +++ b/src/gallium/drivers/softpipe/sp_quad_stipple.c @@ -7,7 +7,7 @@ #include "sp_headers.h" #include "sp_quad.h" #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" /** diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c index adf9ccf64c..2106ee1d23 100644 --- a/src/gallium/drivers/softpipe/sp_query.c +++ b/src/gallium/drivers/softpipe/sp_query.c @@ -32,7 +32,7 @@ #include "draw/draw_context.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "sp_context.h" #include "sp_query.h" diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index f6b3d7ac24..9644dbd168 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -26,7 +26,7 @@ **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_winsys.h" #include "pipe/p_defines.h" #include "pipe/p_screen.h" diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c index c8c55fa6e8..87336ab6e3 100644 --- a/src/gallium/drivers/softpipe/sp_setup.c +++ b/src/gallium/drivers/softpipe/sp_setup.c @@ -42,9 +42,9 @@ #include "draw/draw_context.h" #include "draw/draw_private.h" #include "draw/draw_vertex.h" -#include "pipe/p_util.h" #include "pipe/p_shader_tokens.h" #include "util/u_math.h" +#include "util/u_memory.h" #define DEBUG_VERTS 0 diff --git a/src/gallium/drivers/softpipe/sp_state_blend.c b/src/gallium/drivers/softpipe/sp_state_blend.c index 2d40d6bd8f..384fe559af 100644 --- a/src/gallium/drivers/softpipe/sp_state_blend.c +++ b/src/gallium/drivers/softpipe/sp_state_blend.c @@ -28,7 +28,7 @@ /* Authors: Keith Whitwell */ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "sp_context.h" #include "sp_state.h" diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c index f10a1fa471..6b6a4c3ff3 100644 --- a/src/gallium/drivers/softpipe/sp_state_derived.c +++ b/src/gallium/drivers/softpipe/sp_state_derived.c @@ -25,7 +25,8 @@ * **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "pipe/p_shader_tokens.h" #include "draw/draw_context.h" #include "draw/draw_vertex.h" diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c index 76fe6bfef9..1be461b3a4 100644 --- a/src/gallium/drivers/softpipe/sp_state_fs.c +++ b/src/gallium/drivers/softpipe/sp_state_fs.c @@ -30,7 +30,7 @@ #include "sp_fs.h" #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_inlines.h" #include "pipe/p_winsys.h" #include "pipe/p_shader_tokens.h" diff --git a/src/gallium/drivers/softpipe/sp_state_rasterizer.c b/src/gallium/drivers/softpipe/sp_state_rasterizer.c index 98e04352db..87b7219683 100644 --- a/src/gallium/drivers/softpipe/sp_state_rasterizer.c +++ b/src/gallium/drivers/softpipe/sp_state_rasterizer.c @@ -26,7 +26,7 @@ **************************************************************************/ #include "pipe/p_defines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "sp_context.h" #include "sp_state.h" #include "draw/draw_context.h" diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c index 033288a0aa..99a28c0d7e 100644 --- a/src/gallium/drivers/softpipe/sp_state_sampler.c +++ b/src/gallium/drivers/softpipe/sp_state_sampler.c @@ -29,7 +29,7 @@ * Brian Paul */ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_inlines.h" #include "draw/draw_context.h" diff --git a/src/gallium/drivers/softpipe/sp_surface.c b/src/gallium/drivers/softpipe/sp_surface.c index bfbae234f1..389aceb27c 100644 --- a/src/gallium/drivers/softpipe/sp_surface.c +++ b/src/gallium/drivers/softpipe/sp_surface.c @@ -26,10 +26,9 @@ **************************************************************************/ #include "pipe/p_defines.h" -#include "pipe/p_util.h" #include "pipe/p_inlines.h" #include "pipe/p_winsys.h" -#include "util/p_tile.h" +#include "util/u_tile.h" #include "util/u_rect.h" #include "sp_context.h" #include "sp_surface.h" diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c index 58a95d13e1..49250ec084 100644 --- a/src/gallium/drivers/softpipe/sp_tex_sample.c +++ b/src/gallium/drivers/softpipe/sp_tex_sample.c @@ -39,9 +39,9 @@ #include "sp_tile_cache.h" #include "pipe/p_context.h" #include "pipe/p_defines.h" -#include "pipe/p_util.h" #include "tgsi/tgsi_exec.h" #include "util/u_math.h" +#include "util/u_memory.h" /* diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c index f775591352..3a737d6f72 100644 --- a/src/gallium/drivers/softpipe/sp_texture.c +++ b/src/gallium/drivers/softpipe/sp_texture.c @@ -33,8 +33,9 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "pipe/p_util.h" #include "pipe/p_winsys.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "sp_context.h" #include "sp_state.h" diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c index 57c12ffe33..b50c984513 100644 --- a/src/gallium/drivers/softpipe/sp_tile_cache.c +++ b/src/gallium/drivers/softpipe/sp_tile_cache.c @@ -32,9 +32,9 @@ * Brian Paul */ -#include "pipe/p_util.h" #include "pipe/p_inlines.h" -#include "util/p_tile.h" +#include "util/u_memory.h" +#include "util/u_tile.h" #include "sp_context.h" #include "sp_surface.h" #include "sp_texture.h" diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c index f16359e8ad..1dd7719379 100644 --- a/src/gallium/drivers/trace/tr_context.c +++ b/src/gallium/drivers/trace/tr_context.c @@ -25,7 +25,7 @@ * **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_screen.h" #include "tr_dump.h" diff --git a/src/gallium/drivers/trace/tr_dump.c b/src/gallium/drivers/trace/tr_dump.c index 1613a626df..48032c1617 100644 --- a/src/gallium/drivers/trace/tr_dump.c +++ b/src/gallium/drivers/trace/tr_dump.c @@ -45,6 +45,8 @@ #endif #include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "util/u_memory.h" #include "util/u_string.h" #include "tr_stream.h" diff --git a/src/gallium/drivers/trace/tr_dump.h b/src/gallium/drivers/trace/tr_dump.h index 6ddc8fc15c..76a53731b3 100644 --- a/src/gallium/drivers/trace/tr_dump.h +++ b/src/gallium/drivers/trace/tr_dump.h @@ -35,7 +35,6 @@ #include "pipe/p_compiler.h" -#include "pipe/p_util.h" boolean trace_dump_trace_begin(void); diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c index a6467ec35f..8789f86b1a 100644 --- a/src/gallium/drivers/trace/tr_screen.c +++ b/src/gallium/drivers/trace/tr_screen.c @@ -25,7 +25,7 @@ * **************************************************************************/ -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "tr_dump.h" #include "tr_state.h" diff --git a/src/gallium/drivers/trace/tr_state.c b/src/gallium/drivers/trace/tr_state.c index 30ab5a8fdc..986d939e0c 100644 --- a/src/gallium/drivers/trace/tr_state.c +++ b/src/gallium/drivers/trace/tr_state.c @@ -27,6 +27,7 @@ #include "pipe/p_compiler.h" +#include "util/u_memory.h" #include "tgsi/tgsi_dump.h" #include "tr_dump.h" diff --git a/src/gallium/drivers/trace/tr_stream_stdc.c b/src/gallium/drivers/trace/tr_stream_stdc.c index 4c77e1c995..4c19ec0b24 100644 --- a/src/gallium/drivers/trace/tr_stream_stdc.c +++ b/src/gallium/drivers/trace/tr_stream_stdc.c @@ -36,7 +36,7 @@ #include -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "tr_stream.h" diff --git a/src/gallium/drivers/trace/tr_stream_wd.c b/src/gallium/drivers/trace/tr_stream_wd.c index b3b65f0971..704eb15bd7 100644 --- a/src/gallium/drivers/trace/tr_stream_wd.c +++ b/src/gallium/drivers/trace/tr_stream_wd.c @@ -37,7 +37,7 @@ #include #include -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "util/u_string.h" #include "tr_stream.h" diff --git a/src/gallium/drivers/trace/tr_texture.c b/src/gallium/drivers/trace/tr_texture.c index 99ba74d366..440a78704a 100644 --- a/src/gallium/drivers/trace/tr_texture.c +++ b/src/gallium/drivers/trace/tr_texture.c @@ -25,9 +25,9 @@ * **************************************************************************/ -#include "pipe/p_util.h" #include "pipe/p_inlines.h" #include "util/u_hash_table.h" +#include "util/u_memory.h" #include "tr_screen.h" #include "tr_texture.h" diff --git a/src/gallium/drivers/trace/tr_winsys.c b/src/gallium/drivers/trace/tr_winsys.c index 2c7a6f893b..177835854e 100644 --- a/src/gallium/drivers/trace/tr_winsys.c +++ b/src/gallium/drivers/trace/tr_winsys.c @@ -25,8 +25,7 @@ * **************************************************************************/ -#include "pipe/p_util.h" -#include "pipe/p_state.h" +#include "util/u_memory.h" #include "util/u_hash_table.h" #include "tr_dump.h" diff --git a/src/gallium/include/pipe/p_util.h b/src/gallium/include/pipe/p_util.h deleted file mode 100644 index 4a3fca5962..0000000000 --- a/src/gallium/include/pipe/p_util.h +++ /dev/null @@ -1,460 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -#ifndef P_UTIL_H -#define P_UTIL_H - -#include "p_config.h" -#include "p_compiler.h" -#include "p_debug.h" -#include "p_pointer.h" - -#if defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) -__inline double ceil(double val) -{ - double ceil_val; - - if((val - (long) val) == 0) { - ceil_val = val; - } else { - if(val > 0) { - ceil_val = (long) val + 1; - } else { - ceil_val = (long) val; - } - } - - return ceil_val; -} - -#ifndef PIPE_SUBSYSTEM_WINDOWS_CE -__inline double floor(double val) -{ - double floor_val; - - if((val - (long) val) == 0) { - floor_val = val; - } else { - if(val > 0) { - floor_val = (long) val; - } else { - floor_val = (long) val - 1; - } - } - - return floor_val; -} -#endif - -#pragma function(pow) -__inline double __cdecl pow(double val, double exponent) -{ - /* XXX */ - assert(0); - return 0; -} - -#pragma function(log) -__inline double __cdecl log(double val) -{ - /* XXX */ - assert(0); - return 0; -} - -#pragma function(atan2) -__inline double __cdecl atan2(double val) -{ - /* XXX */ - assert(0); - return 0; -} -#else -#include -#include -#endif - - /* Define ENOMEM for WINCE */ -#if (_WIN32_WCE < 600) -#ifndef ENOMEM -#define ENOMEM 12 -#endif -#endif - -#ifdef __cplusplus -extern "C" { -#endif - - -#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) && defined(DEBUG) - -/* memory debugging */ - -#include "p_debug.h" - -#define MALLOC( _size ) \ - debug_malloc( __FILE__, __LINE__, __FUNCTION__, _size ) -#define CALLOC( _count, _size ) \ - debug_calloc(__FILE__, __LINE__, __FUNCTION__, _count, _size ) -#define FREE( _ptr ) \ - debug_free( __FILE__, __LINE__, __FUNCTION__, _ptr ) -#define REALLOC( _ptr, _old_size, _size ) \ - debug_realloc( __FILE__, __LINE__, __FUNCTION__, _ptr, _old_size, _size ) - -#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) - -void * __stdcall -EngAllocMem( - unsigned long Flags, - unsigned long MemSize, - unsigned long Tag ); - -void __stdcall -EngFreeMem( - void *Mem ); - -#define MALLOC( _size ) EngAllocMem( 0, _size, 'D3AG' ) -#define _FREE( _ptr ) EngFreeMem( _ptr ) - -#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) - -void * -ExAllocatePool( - unsigned long PoolType, - size_t NumberOfBytes); - -void -ExFreePool(void *P); - -#define MALLOC(_size) ExAllocatePool(0, _size) -#define _FREE(_ptr) ExFreePool(_ptr) - -#else - -#define MALLOC( SIZE ) malloc( SIZE ) -#define CALLOC( COUNT, SIZE ) calloc( COUNT, SIZE ) -#define FREE( PTR ) free( PTR ) -#define REALLOC( OLDPTR, OLDSIZE, NEWSIZE ) realloc( OLDPTR, NEWSIZE ) - -#endif - - -#ifndef CALLOC -static INLINE void * -CALLOC( unsigned count, unsigned size ) -{ - void *ptr = MALLOC( count * size ); - if( ptr ) { - memset( ptr, 0, count * size ); - } - return ptr; -} -#endif /* !CALLOC */ - -#ifndef FREE -static INLINE void -FREE( void *ptr ) -{ - if( ptr ) { - _FREE( ptr ); - } -} -#endif /* !FREE */ - -#ifndef REALLOC -static INLINE void * -REALLOC( void *old_ptr, unsigned old_size, unsigned new_size ) -{ - void *new_ptr = NULL; - - if (new_size != 0) { - unsigned copy_size = old_size < new_size ? old_size : new_size; - new_ptr = MALLOC( new_size ); - if (new_ptr && old_ptr && copy_size) { - memcpy( new_ptr, old_ptr, copy_size ); - } - } - - FREE( old_ptr ); - return new_ptr; -} -#endif /* !REALLOC */ - - -#define MALLOC_STRUCT(T) (struct T *) MALLOC(sizeof(struct T)) - -#define CALLOC_STRUCT(T) (struct T *) CALLOC(1, sizeof(struct T)) - - -/** - * Return memory on given byte alignment - */ -static INLINE void * -align_malloc(size_t bytes, uint alignment) -{ -#if defined(HAVE_POSIX_MEMALIGN) - void *mem; - alignment = (alignment + (uint)sizeof(void*) - 1) & ~((uint)sizeof(void*) - 1); - if(posix_memalign(& mem, alignment, bytes) != 0) - return NULL; - return mem; -#else - char *ptr, *buf; - - assert( alignment > 0 ); - - ptr = (char *) MALLOC(bytes + alignment + sizeof(void *)); - if (!ptr) - return NULL; - - buf = (char *) align_pointer( ptr + sizeof(void *), alignment ); - *(char **)(buf - sizeof(void *)) = ptr; - - return buf; -#endif /* defined(HAVE_POSIX_MEMALIGN) */ -} - -/** - * Free memory returned by align_malloc(). - */ -static INLINE void -align_free(void *ptr) -{ -#if defined(HAVE_POSIX_MEMALIGN) - FREE(ptr); -#else - void **cubbyHole = (void **) ((char *) ptr - sizeof(void *)); - void *realAddr = *cubbyHole; - FREE(realAddr); -#endif /* defined(HAVE_POSIX_MEMALIGN) */ -} - - - -/** - * Duplicate a block of memory. - */ -static INLINE void * -mem_dup(const void *src, uint size) -{ - void *dup = MALLOC(size); - if (dup) - memcpy(dup, src, size); - return dup; -} - - - -#define CLAMP( X, MIN, MAX ) ( (X)<(MIN) ? (MIN) : ((X)>(MAX) ? (MAX) : (X)) ) -#define MIN2( A, B ) ( (A)<(B) ? (A) : (B) ) -#define MAX2( A, B ) ( (A)>(B) ? (A) : (B) ) - -#ifndef Elements -#define Elements(x) (sizeof(x)/sizeof((x)[0])) -#endif -#define Offset(TYPE, MEMBER) ((unsigned)&(((TYPE *)NULL)->MEMBER)) - -/** - * Return a pointer aligned to next multiple of 16 bytes. - */ -static INLINE void * -align16( void *unaligned ) -{ - return align_pointer( unaligned, 16 ); -} - - -static INLINE int align(int value, int alignment) -{ - return (value + alignment - 1) & ~(alignment - 1); -} - - - - -#if defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86) -static INLINE unsigned ffs( unsigned u ) -{ - unsigned i; - - if( u == 0 ) { - return 0; - } - - __asm bsf eax, [u] - __asm inc eax - __asm mov [i], eax - - return i; -} -#endif - -union fi { - float f; - int i; - unsigned ui; -}; - -#define UBYTE_TO_FLOAT( ub ) ((float)(ub) / 255.0F) - -#define IEEE_0996 0x3f7f0000 /* 0.996 or so */ - -/* This function/macro is sensitive to precision. Test very carefully - * if you change it! - */ -#define UNCLAMPED_FLOAT_TO_UBYTE(UB, F) \ - do { \ - union fi __tmp; \ - __tmp.f = (F); \ - if (__tmp.i < 0) \ - UB = (ubyte) 0; \ - else if (__tmp.i >= IEEE_0996) \ - UB = (ubyte) 255; \ - else { \ - __tmp.f = __tmp.f * (255.0f/256.0f) + 32768.0f; \ - UB = (ubyte) __tmp.i; \ - } \ - } while (0) - - - -static INLINE unsigned pack_ub4( unsigned char b0, - unsigned char b1, - unsigned char b2, - unsigned char b3 ) -{ - return ((((unsigned int)b0) << 0) | - (((unsigned int)b1) << 8) | - (((unsigned int)b2) << 16) | - (((unsigned int)b3) << 24)); -} - -static INLINE unsigned fui( float f ) -{ - union fi fi; - fi.f = f; - return fi.ui; -} - -static INLINE unsigned char float_to_ubyte( float f ) -{ - unsigned char ub; - UNCLAMPED_FLOAT_TO_UBYTE(ub, f); - return ub; -} - -static INLINE unsigned pack_ui32_float4( float a, - float b, - float c, - float d ) -{ - return pack_ub4( float_to_ubyte(a), - float_to_ubyte(b), - float_to_ubyte(c), - float_to_ubyte(d) ); -} - -#define COPY_4V( DST, SRC ) \ -do { \ - (DST)[0] = (SRC)[0]; \ - (DST)[1] = (SRC)[1]; \ - (DST)[2] = (SRC)[2]; \ - (DST)[3] = (SRC)[3]; \ -} while (0) - - -#define COPY_4FV( DST, SRC ) COPY_4V(DST, SRC) - - -#define ASSIGN_4V( DST, V0, V1, V2, V3 ) \ -do { \ - (DST)[0] = (V0); \ - (DST)[1] = (V1); \ - (DST)[2] = (V2); \ - (DST)[3] = (V3); \ -} while (0) - - - -#if defined(_MSC_VER) -#if _MSC_VER < 1400 && !defined(__cplusplus) || defined(PIPE_SUBSYSTEM_WINDOWS_CE) - -static INLINE float cosf( float f ) -{ - return (float) cos( (double) f ); -} - -static INLINE float sinf( float f ) -{ - return (float) sin( (double) f ); -} - -static INLINE float ceilf( float f ) -{ - return (float) ceil( (double) f ); -} - -static INLINE float floorf( float f ) -{ - return (float) floor( (double) f ); -} - -static INLINE float powf( float f, float g ) -{ - return (float) pow( (double) f, (double) g ); -} - -static INLINE float sqrtf( float f ) -{ - return (float) sqrt( (double) f ); -} - -static INLINE float fabsf( float f ) -{ - return (float) fabs( (double) f ); -} - -static INLINE float logf( float f ) -{ - return (float) log( (double) f ); -} - -#else -/* Work-around an extra semi-colon in VS 2005 logf definition */ -#ifdef logf -#undef logf -#define logf(x) ((float)log((double)(x))) -#endif /* logf */ -#endif -#endif /* _MSC_VER */ - - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/gallium/state_trackers/python/gallium.i b/src/gallium/state_trackers/python/gallium.i index 641b19e940..a67372c623 100644 --- a/src/gallium/state_trackers/python/gallium.i +++ b/src/gallium/state_trackers/python/gallium.i @@ -42,7 +42,7 @@ #include "pipe/p_screen.h" #include "pipe/p_context.h" #include "pipe/p_inlines.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_shader_tokens.h" #include "cso_cache/cso_context.h" #include "util/u_draw_quad.h" diff --git a/src/gallium/state_trackers/python/st_device.c b/src/gallium/state_trackers/python/st_device.c index a1889539dc..f71d85dd9b 100644 --- a/src/gallium/state_trackers/python/st_device.c +++ b/src/gallium/state_trackers/python/st_device.c @@ -26,12 +26,13 @@ **************************************************************************/ -#include "pipe/p_util.h" #include "pipe/p_winsys.h" #include "pipe/p_context.h" #include "pipe/p_shader_tokens.h" #include "pipe/p_inlines.h" #include "cso_cache/cso_context.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "util/u_simple_shaders.h" #include "trace/tr_screen.h" #include "trace/tr_context.h" diff --git a/src/gallium/state_trackers/python/st_sample.c b/src/gallium/state_trackers/python/st_sample.c index b47c7be293..7765df3c4a 100644 --- a/src/gallium/state_trackers/python/st_sample.c +++ b/src/gallium/state_trackers/python/st_sample.c @@ -29,9 +29,10 @@ #include "pipe/p_compiler.h" #include "pipe/p_format.h" #include "pipe/p_state.h" -#include "pipe/p_util.h" #include "pipe/p_inlines.h" -#include "util/p_tile.h" +#include "util/u_tile.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "st_sample.h" diff --git a/src/gallium/state_trackers/python/st_softpipe_winsys.c b/src/gallium/state_trackers/python/st_softpipe_winsys.c index 6ea3c9a5cf..2d4f5434b3 100644 --- a/src/gallium/state_trackers/python/st_softpipe_winsys.c +++ b/src/gallium/state_trackers/python/st_softpipe_winsys.c @@ -39,8 +39,9 @@ #include "pipe/p_winsys.h" #include "pipe/p_format.h" #include "pipe/p_context.h" -#include "pipe/p_util.h" #include "pipe/p_inlines.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "softpipe/sp_winsys.h" #include "st_winsys.h" diff --git a/src/gallium/winsys/drm/intel/common/intel_be_device.c b/src/gallium/winsys/drm/intel/common/intel_be_device.c index 8db0329615..019ee5cbd2 100644 --- a/src/gallium/winsys/drm/intel/common/intel_be_device.c +++ b/src/gallium/winsys/drm/intel/common/intel_be_device.c @@ -13,8 +13,8 @@ #include "pipe/p_winsys.h" #include "pipe/p_defines.h" #include "pipe/p_state.h" -#include "pipe/p_util.h" #include "pipe/p_inlines.h" +#include "util/u_memory.h" #include "i915simple/i915_screen.h" diff --git a/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.c b/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.c index 0d98d16cf1..20920a2052 100644 --- a/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.c +++ b/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.c @@ -32,8 +32,8 @@ #include "intel_context.h" #include "intel_winsys_softpipe.h" #include "pipe/p_defines.h" -#include "pipe/p_util.h" #include "pipe/p_format.h" +#include "util/u_memory.h" #include "softpipe/sp_winsys.h" diff --git a/src/gallium/winsys/egl_xlib/egl_xlib.c b/src/gallium/winsys/egl_xlib/egl_xlib.c index 829732eea8..e9f821d276 100644 --- a/src/gallium/winsys/egl_xlib/egl_xlib.c +++ b/src/gallium/winsys/egl_xlib/egl_xlib.c @@ -38,8 +38,8 @@ #include "pipe/p_compiler.h" #include "pipe/p_format.h" #include "pipe/p_state.h" -#include "pipe/p_util.h" #include "pipe/p_winsys.h" +#include "util/u_memory.h" #include "softpipe/sp_winsys.h" #include "eglconfig.h" diff --git a/src/gallium/winsys/egl_xlib/sw_winsys.c b/src/gallium/winsys/egl_xlib/sw_winsys.c index f4199e6f89..ae81d7f801 100644 --- a/src/gallium/winsys/egl_xlib/sw_winsys.c +++ b/src/gallium/winsys/egl_xlib/sw_winsys.c @@ -37,8 +37,9 @@ #include "pipe/p_winsys.h" #include "pipe/p_state.h" -#include "pipe/p_util.h" #include "pipe/p_inlines.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "sw_winsys.h" diff --git a/src/gallium/winsys/gdi/wmesa.c b/src/gallium/winsys/gdi/wmesa.c index ff52ceb8c4..730fb1b541 100644 --- a/src/gallium/winsys/gdi/wmesa.c +++ b/src/gallium/winsys/gdi/wmesa.c @@ -12,8 +12,8 @@ #include "pipe/p_winsys.h" #include "pipe/p_format.h" #include "pipe/p_context.h" -#include "pipe/p_util.h" #include "pipe/p_inlines.h" +#include "util/u_memory.h" #include "softpipe/sp_winsys.h" #include "glapi/glapi.h" #include "colors.h" diff --git a/src/gallium/winsys/xlib/brw_aub.c b/src/gallium/winsys/xlib/brw_aub.c index 6e814ce5d1..f319802962 100644 --- a/src/gallium/winsys/xlib/brw_aub.c +++ b/src/gallium/winsys/xlib/brw_aub.c @@ -34,7 +34,6 @@ #include "brw_aub.h" #include "pipe/p_context.h" #include "pipe/p_state.h" -#include "pipe/p_util.h" #include "pipe/p_debug.h" diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c index 4b4dc56e84..68ead7f528 100644 --- a/src/gallium/winsys/xlib/xm_winsys.c +++ b/src/gallium/winsys/xlib/xm_winsys.c @@ -42,8 +42,9 @@ #include "pipe/p_winsys.h" #include "pipe/p_format.h" #include "pipe/p_context.h" -#include "pipe/p_util.h" #include "pipe/p_inlines.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "softpipe/sp_winsys.h" #ifdef GALLIUM_CELL diff --git a/src/gallium/winsys/xlib/xm_winsys_aub.c b/src/gallium/winsys/xlib/xm_winsys_aub.c index 7fc9debdd5..3439367636 100644 --- a/src/gallium/winsys/xlib/xm_winsys_aub.c +++ b/src/gallium/winsys/xlib/xm_winsys_aub.c @@ -37,7 +37,7 @@ #include "xmesaP.h" #include "pipe/p_winsys.h" -#include "pipe/p_util.h" +#include "util/u_memory.h" #include "pipe/p_inlines.h" #include "i965simple/brw_winsys.h" #include "i965simple/brw_screen.h" diff --git a/src/mesa/state_tracker/acc2.c b/src/mesa/state_tracker/acc2.c new file mode 100644 index 0000000000..fa5de2b764 --- /dev/null +++ b/src/mesa/state_tracker/acc2.c @@ -0,0 +1,319 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Brian Paul + */ + +#include "main/imports.h" +#include "main/image.h" +#include "main/macros.h" + +#include "st_context.h" +#include "st_cb_accum.h" +#include "st_cb_fbo.h" +#include "st_draw.h" +#include "st_format.h" +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "util/p_tile.h" + + +#define UNCLAMPED_FLOAT_TO_SHORT(us, f) \ + us = ( (short) ( CLAMP((f), -1.0, 1.0) * 32767.0F) ) + + +/** + * For hardware that supports deep color buffers, we could accelerate + * most/all the accum operations with blending/texturing. + * For now, just use the get/put_tile() functions and do things in software. + */ + + +static void +acc_get_tile_rgba(struct pipe_context *pipe, struct pipe_surface *acc_ps, + uint x, uint y, uint w, uint h, float *p) +{ + const enum pipe_format f = acc_ps->format; + const int cpp = acc_ps->cpp; + + acc_ps->format = PIPE_FORMAT_R16G16B16A16_SNORM; + acc_ps->cpp = 8; + + pipe_get_tile_rgba(pipe, acc_ps, x, y, w, h, p); + + acc_ps->format = f; + acc_ps->cpp = cpp; +} + + +static void +acc_put_tile_rgba(struct pipe_context *pipe, struct pipe_surface *acc_ps, + uint x, uint y, uint w, uint h, const float *p) +{ + enum pipe_format f = acc_ps->format; + const int cpp = acc_ps->cpp; + + acc_ps->format = PIPE_FORMAT_R16G16B16A16_SNORM; + acc_ps->cpp = 8; + + pipe_put_tile_rgba(pipe, acc_ps, x, y, w, h, p); + + acc_ps->format = f; + acc_ps->cpp = cpp; +} + + + +void +st_clear_accum_buffer(GLcontext *ctx, struct gl_renderbuffer *rb) +{ + struct pipe_context *pipe = ctx->st->pipe; + struct st_renderbuffer *acc_strb = st_renderbuffer(rb); + struct pipe_surface *acc_ps = acc_strb->surface; + const GLint xpos = ctx->DrawBuffer->_Xmin; + const GLint ypos = ctx->DrawBuffer->_Ymin; + const GLint width = ctx->DrawBuffer->_Xmax - xpos; + const GLint height = ctx->DrawBuffer->_Ymax - ypos; + const GLfloat r = ctx->Accum.ClearColor[0]; + const GLfloat g = ctx->Accum.ClearColor[1]; + const GLfloat b = ctx->Accum.ClearColor[2]; + const GLfloat a = ctx->Accum.ClearColor[3]; + GLfloat *accBuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat)); + int i; + +#if 1 + GLvoid *map; + + map = pipe_surface_map(acc_ps); + switch (acc_strb->format) { + case PIPE_FORMAT_R16G16B16A16_SNORM: + { + GLshort r = FLOAT_TO_SHORT(ctx->Accum.ClearColor[0]); + GLshort g = FLOAT_TO_SHORT(ctx->Accum.ClearColor[1]); + GLshort b = FLOAT_TO_SHORT(ctx->Accum.ClearColor[2]); + GLshort a = FLOAT_TO_SHORT(ctx->Accum.ClearColor[3]); + int i, j; + for (i = 0; i < height; i++) { + GLshort *dst = ((GLshort *) map + + ((ypos + i) * acc_ps->pitch + xpos) * 4); + for (j = 0; j < width; j++) { + dst[0] = r; + dst[1] = g; + dst[2] = b; + dst[3] = a; + dst += 4; + } + } + } + break; + default: + _mesa_problem(ctx, "unexpected format in st_clear_accum_buffer()"); + } + + pipe_surface_unmap(acc_ps); + +#else + for (i = 0; i < width * height; i++) { + accBuf[i*4+0] = r; + accBuf[i*4+1] = g; + accBuf[i*4+2] = b; + accBuf[i*4+3] = a; + } + + acc_put_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, accBuf); +#endif +} + + +/** For ADD/MULT */ +static void +accum_mad(struct pipe_context *pipe, GLfloat scale, GLfloat bias, + GLint xpos, GLint ypos, GLint width, GLint height, + struct pipe_surface *acc_ps) +{ + GLfloat *accBuf; + GLint i; + + accBuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat)); + + pipe_get_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, accBuf); + + for (i = 0; i < 4 * width * height; i++) { + accBuf[i] = accBuf[i] * scale + bias; + } + + pipe_put_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, accBuf); + + free(accBuf); +} + + +static void +accum_accum(struct pipe_context *pipe, GLfloat value, + GLint xpos, GLint ypos, GLint width, GLint height, + struct pipe_surface *acc_ps, + struct pipe_surface *color_ps) +{ + GLfloat *colorBuf, *accBuf; + GLint i; + + colorBuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat)); + accBuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat)); + + pipe_get_tile_rgba(pipe, color_ps, xpos, ypos, width, height, colorBuf); + acc_get_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, accBuf); + + for (i = 0; i < 4 * width * height; i++) { + accBuf[i] = accBuf[i] + colorBuf[i] * value; + } + + acc_put_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, accBuf); + + free(colorBuf); + free(accBuf); +} + + +static void +accum_load(struct pipe_context *pipe, GLfloat value, + GLint xpos, GLint ypos, GLint width, GLint height, + struct pipe_surface *acc_ps, + struct pipe_surface *color_ps) +{ + GLfloat *buf; + GLint i; + + buf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat)); + + pipe_get_tile_rgba(pipe, color_ps, xpos, ypos, width, height, buf); + + for (i = 0; i < 4 * width * height; i++) { + buf[i] = buf[i] * value; + } + + acc_put_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, buf); + + free(buf); +} + + +static void +accum_return(GLcontext *ctx, GLfloat value, + GLint xpos, GLint ypos, GLint width, GLint height, + struct pipe_surface *acc_ps, + struct pipe_surface *color_ps) +{ + struct pipe_context *pipe = ctx->st->pipe; + const GLubyte *colormask = ctx->Color.ColorMask; + GLfloat *abuf, *cbuf = NULL; + GLint i, ch; + + abuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat)); + + acc_get_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, abuf); + + if (!colormask[0] || !colormask[1] || !colormask[2] || !colormask[3]) { + cbuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat)); + pipe_get_tile_rgba(pipe, color_ps, xpos, ypos, width, height, cbuf); + } + + for (i = 0; i < width * height; i++) { + for (ch = 0; ch < 4; ch++) { + if (colormask[ch]) { + GLfloat val = abuf[i * 4 + ch] * value; + abuf[i * 4 + ch] = CLAMP(val, 0.0, 1.0); + } + else { + abuf[i * 4 + ch] = cbuf[i * 4 + ch]; + } + } + } + + pipe_put_tile_rgba(pipe, color_ps, xpos, ypos, width, height, abuf); + + free(abuf); + if (cbuf) + free(cbuf); +} + + +static void +st_Accum(GLcontext *ctx, GLenum op, GLfloat value) +{ + struct st_context *st = ctx->st; + struct pipe_context *pipe = st->pipe; + struct st_renderbuffer *acc_strb + = st_renderbuffer(ctx->DrawBuffer->Attachment[BUFFER_ACCUM].Renderbuffer); + struct st_renderbuffer *color_strb + = st_renderbuffer(ctx->ReadBuffer->_ColorReadBuffer); + struct pipe_surface *acc_ps = acc_strb->surface; + struct pipe_surface *color_ps = color_strb->surface; + + const GLint xpos = ctx->DrawBuffer->_Xmin; + const GLint ypos = ctx->DrawBuffer->_Ymin; + const GLint width = ctx->DrawBuffer->_Xmax - xpos; + const GLint height = ctx->DrawBuffer->_Ymax - ypos; + + /* make sure color bufs aren't cached */ + pipe->flush(pipe, PIPE_FLUSH_RENDER_CACHE, NULL); + + switch (op) { + case GL_ADD: + if (value != 0.0F) { + accum_mad(pipe, 1.0, value, xpos, ypos, width, height, acc_ps); + } + break; + case GL_MULT: + if (value != 1.0F) { + accum_mad(pipe, value, 0.0, xpos, ypos, width, height, acc_ps); + } + break; + case GL_ACCUM: + if (value != 0.0F) { + accum_accum(pipe, value, xpos, ypos, width, height, acc_ps, color_ps); + } + break; + case GL_LOAD: + accum_load(pipe, value, xpos, ypos, width, height, acc_ps, color_ps); + break; + case GL_RETURN: + accum_return(ctx, value, xpos, ypos, width, height, acc_ps, color_ps); + break; + default: + assert(0); + } +} + + + +void st_init_accum_functions(struct dd_function_table *functions) +{ + functions->Accum = st_Accum; +} diff --git a/src/mesa/state_tracker/st_cb_accum.c b/src/mesa/state_tracker/st_cb_accum.c index a992e08ff6..cf3a99e7e9 100644 --- a/src/mesa/state_tracker/st_cb_accum.c +++ b/src/mesa/state_tracker/st_cb_accum.c @@ -42,7 +42,7 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "util/p_tile.h" +#include "util/u_tile.h" #define UNCLAMPED_FLOAT_TO_SHORT(us, f) \ diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c index d5696a909f..a0c305d66f 100644 --- a/src/mesa/state_tracker/st_cb_bitmap.c +++ b/src/mesa/state_tracker/st_cb_bitmap.c @@ -50,7 +50,7 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "util/p_tile.h" +#include "util/u_tile.h" #include "util/u_draw_quad.h" #include "util/u_simple_shaders.h" #include "shader/prog_instruction.h" diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c index 0c5e21d4ff..4ec7c752df 100644 --- a/src/mesa/state_tracker/st_cb_drawpixels.c +++ b/src/mesa/state_tracker/st_cb_drawpixels.c @@ -55,7 +55,7 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "util/p_tile.h" +#include "util/u_tile.h" #include "util/u_draw_quad.h" #include "shader/prog_instruction.h" #include "cso_cache/cso_context.h" diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c index 39f5856f94..c801532788 100644 --- a/src/mesa/state_tracker/st_cb_readpixels.c +++ b/src/mesa/state_tracker/st_cb_readpixels.c @@ -41,7 +41,7 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "util/p_tile.h" +#include "util/u_tile.h" #include "st_context.h" #include "st_cb_bitmap.h" #include "st_cb_readpixels.h" diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c index 6177ac63f0..16bbf3d80f 100644 --- a/src/mesa/state_tracker/st_cb_texture.c +++ b/src/mesa/state_tracker/st_cb_texture.c @@ -51,7 +51,7 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "util/p_tile.h" +#include "util/u_tile.h" #include "util/u_blit.h" diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index 325d95e865..936a6e32ea 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -55,7 +55,7 @@ #define TGSI_DEBUG 0 -/** XXX we should use the version of this from p_util.h but including +/** XXX we should use the version of this from u_memory.h but including * that header causes symbol collisions. */ static INLINE void * diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c index 63046a0ecc..73cebff33f 100644 --- a/src/mesa/state_tracker/st_texture.c +++ b/src/mesa/state_tracker/st_texture.c @@ -36,7 +36,6 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "pipe/p_util.h" #include "pipe/p_inlines.h" #include "util/u_rect.h" -- cgit v1.2.3 From 5cf2e226548f08c4b79a4eb289fd636a00079fb3 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 4 Sep 2008 18:36:22 -0600 Subject: cell: implement CELL_DEBUG env/options var Options so far: "checker" module tile clear color by SPU ID to see where the tiles are "sync" to do synchronous DMA (only partially implemented) --- src/gallium/drivers/cell/common.h | 4 ++ src/gallium/drivers/cell/ppu/cell_context.c | 13 ++++++ src/gallium/drivers/cell/ppu/cell_context.h | 2 + src/gallium/drivers/cell/ppu/cell_spu.c | 1 + src/gallium/drivers/cell/spu/spu_main.c | 71 +++++++++++++++++++---------- 5 files changed, 66 insertions(+), 25 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 6bace0bb11..c0ca201e1d 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -106,6 +106,9 @@ #define CELL_BUFFER_STATUS_USED 20 +#define CELL_DEBUG_CHECKER (1 << 0) +#define CELL_DEBUG_SYNC (1 << 1) + /** */ @@ -263,6 +266,7 @@ struct cell_init_info { unsigned id; unsigned num_spus; + unsigned debug_flags; /**< mask of CELL_DEBUG_x flags */ struct cell_command *cmd; /** Buffers for command batches, vertex/index data */ diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c index 9ff4e86943..c8828e644c 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.c +++ b/src/gallium/drivers/cell/ppu/cell_context.c @@ -85,6 +85,15 @@ cell_draw_create(struct cell_context *cell) } +#ifdef DEBUG +static const struct debug_named_value cell_debug_flags[] = { + {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */ + {"sync", CELL_DEBUG_SYNC}, /**< SPUs do synchronous DMA */ + {NULL, 0} +}; +#endif + + struct pipe_context * cell_create_context(struct pipe_screen *screen, struct cell_winsys *cws) @@ -136,6 +145,10 @@ cell_create_context(struct pipe_screen *screen, draw_wide_point_threshold(cell->draw, 0.0); draw_wide_line_threshold(cell->draw, 0.0); + cell->debug_flags = debug_get_flags_option("CELL_DEBUG", + cell_debug_flags, + 0 ); + /* * SPU stuff */ diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h index 9ca153d52f..8cec9f45b2 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.h +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -163,6 +163,8 @@ struct cell_context struct spe_function attrib_fetch; unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS]; + + unsigned debug_flags; }; diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c index 5e75f409a3..2df90fdcb7 100644 --- a/src/gallium/drivers/cell/ppu/cell_spu.c +++ b/src/gallium/drivers/cell/ppu/cell_spu.c @@ -131,6 +131,7 @@ cell_start_spus(struct cell_context *cell) for (i = 0; i < cell->num_spus; i++) { cell_global.inits[i].id = i; cell_global.inits[i].num_spus = cell->num_spus; + cell_global.inits[i].debug_flags = cell->debug_flags; cell_global.inits[i].cmd = &cell_global.command[i]; for (j = 0; j < CELL_NUM_BUFFERS; j++) { cell_global.inits[i].buffers[j] = cell->buffer[j]; diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index e04ffeb9b1..0d4cdfae85 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -136,54 +136,75 @@ really_clear_tiles(uint surfaceIndex) static void cmd_clear_surface(const struct cell_command_clear_surface *clear) { - const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; - uint i; - if (Debug) printf("SPU %u: CLEAR SURF %u to 0x%08x\n", spu.init.id, clear->surface, clear->value); -#define CLEAR_OPT 1 -#if CLEAR_OPT - /* set all tile's status to CLEAR */ if (clear->surface == 0) { - memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status)); spu.fb.color_clear_value = clear->value; + if (spu.init.debug_flags & CELL_DEBUG_CHECKER) { + uint x = (spu.init.id << 4) | (spu.init.id << 12) | + (spu.init.id << 20) | (spu.init.id << 28); + spu.fb.color_clear_value ^= x; + } } else { - memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status)); spu.fb.depth_clear_value = clear->value; } - return; -#endif +#define CLEAR_OPT 1 +#if CLEAR_OPT + + /* Simply set all tiles' status to CLEAR. + * When we actually begin rendering into a tile, we'll initialize it to + * the clear value. If any tiles go untouched during the frame, + * really_clear_tiles() will set them to the clear value. + */ if (clear->surface == 0) { - spu.fb.color_clear_value = clear->value; - clear_c_tile(&spu.ctile); + memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status)); } else { - spu.fb.depth_clear_value = clear->value; - clear_z_tile(&spu.ztile); + memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status)); } +#else + + /* + * This path clears the whole framebuffer to the clear color right now. + */ + /* printf("SPU: %s num=%d w=%d h=%d\n", __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles); */ - for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { - uint tx = i % spu.fb.width_tiles; - uint ty = i / spu.fb.width_tiles; - if (clear->surface == 0) - put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); - else - put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1); - /* XXX we don't want this here, but it fixes bad tile results */ + /* init a single tile to the clear value */ + if (clear->surface == 0) { + clear_c_tile(&spu.ctile); + } + else { + clear_z_tile(&spu.ztile); } -#if 0 - wait_on_mask(1 << TAG_SURFACE_CLEAR); -#endif + /* walk over my tiles, writing the 'clear' tile's data */ + { + const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; + uint i; + for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { + uint tx = i % spu.fb.width_tiles; + uint ty = i / spu.fb.width_tiles; + if (clear->surface == 0) + put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); + else + put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1); + } + } + + if (spu.init.debug_flags & CELL_DEBUG_SYNC) { + wait_on_mask(1 << TAG_SURFACE_CLEAR); + } + +#endif /* CLEAR_OPT */ if (Debug) printf("SPU %u: CLEAR SURF done\n", spu.init.id); -- cgit v1.2.3 From 284ab5a6127f8b452acaa0e10ac1d9ebc87fac3e Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 10 Sep 2008 18:22:00 -0600 Subject: cell: checkpoint commit of new per-fragment processing Do code generation for alpha test, z test, stencil, blend, colormask and framebuffer/tile read/write as a single code block. Ian's previous blend/z/stencil test code is still there but mostly disabled and will be removed soon. --- src/gallium/drivers/cell/common.h | 20 +- src/gallium/drivers/cell/ppu/Makefile | 1 + src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 530 +++++++++++++++++++++ src/gallium/drivers/cell/ppu/cell_gen_fragment.h | 38 ++ src/gallium/drivers/cell/ppu/cell_state_emit.c | 31 +- .../drivers/cell/ppu/cell_state_per_fragment.c | 2 +- src/gallium/drivers/cell/spu/Makefile | 2 +- src/gallium/drivers/cell/spu/spu_main.c | 53 ++- src/gallium/drivers/cell/spu/spu_main.h | 23 + src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 231 ++++++++- src/gallium/drivers/cell/spu/spu_per_fragment_op.h | 11 + src/gallium/drivers/cell/spu/spu_tri.c | 30 ++ src/gallium/winsys/xlib/xm_api.c | 7 +- src/gallium/winsys/xlib/xm_winsys.c | 35 ++ 14 files changed, 998 insertions(+), 16 deletions(-) create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fragment.c create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fragment.h (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index c0ca201e1d..a62530c64d 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -97,6 +97,7 @@ #define CELL_CMD_STATE_LOGICOP 21 #define CELL_CMD_VS_EXECUTE 22 #define CELL_CMD_FLUSH_BUFFER_RANGE 23 +#define CELL_CMD_STATE_FRAGMENT_OPS 24 #define CELL_NUM_BUFFERS 4 @@ -112,30 +113,43 @@ /** */ -struct cell_command_depth_stencil_alpha_test { +struct cell_command_depth_stencil_alpha_test +{ uint64_t base; /**< Effective address of code start. */ unsigned size; /**< Size in bytes of SPE code. */ unsigned read_depth; /**< Flag: should depth be read? */ unsigned read_stencil; /**< Flag: should stencil be read? */ + struct pipe_depth_stencil_alpha_state state; }; /** * Upload code to perform framebuffer blend operation */ -struct cell_command_blend { +struct cell_command_blend +{ uint64_t base; /**< Effective address of code start. */ unsigned size; /**< Size in bytes of SPE code. */ unsigned read_fb; /**< Flag: should framebuffer be read? */ }; -struct cell_command_logicop { +struct cell_command_logicop +{ uint64_t base; /**< Effective address of code start. */ unsigned size; /**< Size in bytes of SPE code. */ }; +#define SPU_MAX_FRAGMENT_OPS_INSTS 64 + +struct cell_command_fragment_ops +{ + uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_OPS */ + unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS]; +}; + + /** * Tell SPUs about the framebuffer size, location */ diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile index 25473e200c..b5a6fcb8de 100644 --- a/src/gallium/drivers/cell/ppu/Makefile +++ b/src/gallium/drivers/cell/ppu/Makefile @@ -25,6 +25,7 @@ SOURCES = \ cell_context.c \ cell_draw_arrays.c \ cell_flush.c \ + cell_gen_fragment.c \ cell_state_derived.c \ cell_state_emit.c \ cell_state_per_fragment.c \ diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c new file mode 100644 index 0000000000..df29476be6 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -0,0 +1,530 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +/** + * Generate SPU per-fragment code (actually per-quad code). + * \author Brian Paul + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "rtasm/rtasm_ppc_spe.h" +#include "cell_context.h" +#include "cell_gen_fragment.h" + + + +/** Do extra optimizations? */ +#define OPTIMIZATIONS 1 + + +/** + * Generate SPE code to perform Z/depth testing. + * + * \param dsa Gallium depth/stencil/alpha state to gen code for + * \param f SPE function to append instruction onto. + * \param mask_reg register containing quad/pixel "alive" mask (in/out) + * \param ifragZ_reg register containing integer fragment Z values (in) + * \param ifbZ_reg register containing integer frame buffer Z values (in/out) + * \param zmask_reg register containing result of Z test/comparison (out) + */ +static void +gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa, + struct spe_function *f, + int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg) +{ + ASSERT(dsa->depth.enabled); + + switch (dsa->depth.func) { + case PIPE_FUNC_EQUAL: + /* zmask = (ifragZ == ref) */ + spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg); + /* mask = (mask & zmask) */ + spe_and(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_NOTEQUAL: + /* zmask = (ifragZ == ref) */ + spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg); + /* mask = (mask & ~zmask) */ + spe_andc(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_GREATER: + /* zmask = (ifragZ > ref) */ + spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); + /* mask = (mask & zmask) */ + spe_and(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_LESS: + /* zmask = (ref > ifragZ) */ + spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); + /* mask = (mask & zmask) */ + spe_and(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_LEQUAL: + /* zmask = (ifragZ > ref) */ + spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); + /* mask = (mask & ~zmask) */ + spe_andc(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_GEQUAL: + /* zmask = (ref > ifragZ) */ + spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); + /* mask = (mask & ~zmask) */ + spe_andc(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_NEVER: + spe_il(f, mask_reg, 0); /* mask = {0,0,0,0} */ + spe_move(f, zmask_reg, mask_reg); /* zmask = mask */ + break; + + case PIPE_FUNC_ALWAYS: + /* mask unchanged */ + spe_il(f, zmask_reg, ~0); /* zmask = {~0,~0,~0,~0} */ + break; + + default: + ASSERT(0); + break; + } + + if (dsa->depth.writemask) { + /* + * If (ztest passed) { + * framebufferZ = fragmentZ; + * } + * OR, + * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ; + */ + spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg); + } +} + + +/** + * Generate SPE code to perform alpha testing. + * + * \param dsa Gallium depth/stencil/alpha state to gen code for + * \param f SPE function to append instruction onto. + * \param mask_reg register containing quad/pixel "alive" mask (in/out) + * \param fragA_reg register containing four fragment alpha values (in) + */ +static void +gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, + struct spe_function *f, int mask_reg, int fragA_reg) +{ + int ref_reg = spe_allocate_available_register(f); + int amask_reg = spe_allocate_available_register(f); + + ASSERT(dsa->alpha.enabled); + + if ((dsa->alpha.func != PIPE_FUNC_NEVER) && + (dsa->alpha.func != PIPE_FUNC_ALWAYS)) { + /* load/splat the alpha reference float value */ + spe_load_float(f, ref_reg, dsa->alpha.ref); + } + + /* emit code to do the alpha comparison, updating 'mask' */ + switch (dsa->alpha.func) { + case PIPE_FUNC_EQUAL: + /* amask = (fragA == ref) */ + spe_fceq(f, amask_reg, fragA_reg, ref_reg); + /* mask = (mask & amask) */ + spe_and(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_NOTEQUAL: + /* amask = (fragA == ref) */ + spe_fceq(f, amask_reg, fragA_reg, ref_reg); + /* mask = (mask & ~amask) */ + spe_andc(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_GREATER: + /* amask = (fragA > ref) */ + spe_fcgt(f, amask_reg, fragA_reg, ref_reg); + /* mask = (mask & amask) */ + spe_and(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_LESS: + /* amask = (ref > fragA) */ + spe_fcgt(f, amask_reg, ref_reg, fragA_reg); + /* mask = (mask & amask) */ + spe_and(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_LEQUAL: + /* amask = (fragA > ref) */ + spe_fcgt(f, amask_reg, fragA_reg, ref_reg); + /* mask = (mask & ~amask) */ + spe_andc(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_GEQUAL: + /* amask = (ref > fragA) */ + spe_fcgt(f, amask_reg, ref_reg, fragA_reg); + /* mask = (mask & ~amask) */ + spe_andc(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_NEVER: + spe_il(f, mask_reg, 0); /* mask = [0,0,0,0] */ + break; + + case PIPE_FUNC_ALWAYS: + /* no-op, mask unchanged */ + break; + + default: + ASSERT(0); + break; + } + +#if OPTIMIZATIONS + /* if mask == {0,0,0,0} we're all done, return */ + { + /* re-use amask reg here */ + int tmp_reg = amask_reg; + /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */ + spe_orx(f, tmp_reg, mask_reg); + /* if tmp[0] == 0 then return from function call */ + spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0); + } +#endif + + spe_release_register(f, ref_reg); + spe_release_register(f, amask_reg); +} + + + +/** + * Generate SPE code to implement the fragment operations (alpha test, + * depth test, stencil test, blending, colormask, and final + * framebuffer write) as specified by the current context state. + * + * Logically, this code will be called after running the fragment + * shader. But under some circumstances we could run some of this + * code before the fragment shader to cull fragments/quads that are + * totally occluded/discarded. + * + * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now. + * + * See the spu_default_fragment_ops() function to see how the per-fragment + * operations would be done with ordinary C code. + * The code we generate here though has no branches, is SIMD, etc and + * should be much faster. + * + * \param cell the rendering context (in) + * \param f the generated function (out) + */ +void +gen_fragment_function(struct cell_context *cell, struct spe_function *f) +{ + const struct pipe_depth_stencil_alpha_state *dsa = + &cell->depth_stencil->base; + const struct pipe_blend_state *blend = &cell->blend->base; + + /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */ + const int x_reg = 3; /* uint */ + const int y_reg = 4; /* uint */ + const int color_tile_reg = 5; /* tile_t * */ + const int depth_tile_reg = 6; /* tile_t * */ + const int fragZ_reg = 7; /* vector float */ + const int fragR_reg = 8; /* vector float */ + const int fragG_reg = 9; /* vector float */ + const int fragB_reg = 10; /* vector float */ + const int fragA_reg = 11; /* vector float */ + const int mask_reg = 12; /* vector uint */ + + /* offset of quad from start of tile + * XXX assuming 4-byte pixels for color AND Z/stencil!!!! + */ + int quad_offset_reg; + + int fbRGBA_reg; /**< framebuffer's RGBA colors for quad */ + int fbZS_reg; /**< framebuffer's combined z/stencil values for quad */ + + spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); + spe_allocate_register(f, x_reg); + spe_allocate_register(f, y_reg); + spe_allocate_register(f, color_tile_reg); + spe_allocate_register(f, depth_tile_reg); + spe_allocate_register(f, fragZ_reg); + spe_allocate_register(f, fragR_reg); + spe_allocate_register(f, fragG_reg); + spe_allocate_register(f, fragB_reg); + spe_allocate_register(f, fragA_reg); + spe_allocate_register(f, mask_reg); + + quad_offset_reg = spe_allocate_available_register(f); + fbRGBA_reg = spe_allocate_available_register(f); + fbZS_reg = spe_allocate_available_register(f); + + /* compute offset of quad from start of tile, in bytes */ + { + int x2_reg = spe_allocate_available_register(f); + int y2_reg = spe_allocate_available_register(f); + + ASSERT(TILE_SIZE == 32); + + spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */ + spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */ + spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */ + spe_a(f, quad_offset_reg, y2_reg, x2_reg); /* offset = y2 + x2 */ + spe_shli(f, quad_offset_reg, quad_offset_reg, 4); /* offset *= 16 */ + + spe_release_register(f, x2_reg); + spe_release_register(f, y2_reg); + } + + + if (dsa->alpha.enabled) { + gen_alpha_test(dsa, f, mask_reg, fragA_reg); + } + + if (dsa->depth.enabled || dsa->stencil[0].enabled) { + const enum pipe_format zs_format = cell->framebuffer.zsbuf->format; + boolean write_depth_stencil; + + int fbZ_reg = spe_allocate_available_register(f); /* Z values */ + int fbS_reg = spe_allocate_available_register(f); /* Stencil values */ + + /* fetch quad of depth/stencil values from tile at (x,y) */ + /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ + spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + + if (dsa->depth.enabled) { + /* Extract Z bits from fbZS_reg into fbZ_reg */ + if (zs_format == PIPE_FORMAT_S8Z24_UNORM || + zs_format == PIPE_FORMAT_X8Z24_UNORM) { + int mask_reg = spe_allocate_available_register(f); + spe_fsmbi(f, mask_reg, 0x7777); /* mask[0,1,2,3] = 0x00ffffff */ + spe_and(f, fbZ_reg, fbZS_reg, mask_reg); /* fbZ = fbZS & mask */ + spe_release_register(f, mask_reg); + /* OK, fbZ_reg has four 24-bit Z values now */ + } + else { + /* XXX handle other z/stencil formats */ + ASSERT(0); + } + + /* Convert fragZ values from float[4] to uint[4] */ + if (zs_format == PIPE_FORMAT_S8Z24_UNORM || + zs_format == PIPE_FORMAT_X8Z24_UNORM || + zs_format == PIPE_FORMAT_Z24S8_UNORM || + zs_format == PIPE_FORMAT_Z24X8_UNORM) { + /* 24-bit Z values */ + int scale_reg = spe_allocate_available_register(f); + + /* scale_reg[0,1,2,3] = float(2^24-1) */ + spe_load_float(f, scale_reg, (float) 0xffffff); + + /* XXX these two instructions might be combined */ + spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 0); /* fragZ = (int) fragZ */ + + spe_release_register(f, scale_reg); + } + else { + /* XXX handle 16-bit Z format */ + ASSERT(0); + } + } + + if (dsa->stencil[0].enabled) { + /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */ + if (zs_format == PIPE_FORMAT_S8Z24_UNORM || + zs_format == PIPE_FORMAT_X8Z24_UNORM) { + /* XXX extract with a shift */ + ASSERT(0); + } + else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || + zs_format == PIPE_FORMAT_Z24X8_UNORM) { + /* XXX extract with a mask */ + ASSERT(0); + } + } + + + if (dsa->stencil[0].enabled) { + /* XXX this may involve depth testing too */ + // gen_stencil_test(dsa, f, ... ); + ASSERT(0); + } + else if (dsa->depth.enabled) { + int zmask_reg = spe_allocate_available_register(f); + gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg); + spe_release_register(f, zmask_reg); + } + + /* do we need to write Z and/or Stencil back into framebuffer? */ + write_depth_stencil = (dsa->depth.writemask | + dsa->stencil[0].write_mask | + dsa->stencil[1].write_mask); + + if (write_depth_stencil) { + /* Merge latest Z and Stencil values into fbZS_reg. + * fbZ_reg has four Z vals in bits [23..0] or bits [15..0]. + * fbS_reg has four 8-bit Z values in bits [7..0]. + */ + if (zs_format == PIPE_FORMAT_S8Z24_UNORM || + zs_format == PIPE_FORMAT_X8Z24_UNORM) { + spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ + } + else if (zs_format == PIPE_FORMAT_S8Z24_UNORM || + zs_format == PIPE_FORMAT_X8Z24_UNORM) { + /* XXX to do */ + ASSERT(0); + } + else if (zs_format == PIPE_FORMAT_Z16_UNORM) { + /* XXX to do */ + ASSERT(0); + } + else if (zs_format == PIPE_FORMAT_S8_UNORM) { + /* XXX to do */ + ASSERT(0); + } + else { + /* bad zs_format */ + ASSERT(0); + } + + /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */ + spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + } + + spe_release_register(f, fbZ_reg); + spe_release_register(f, fbS_reg); + } + + + /* Get framebuffer quad/colors. We'll need these for blending, + * color masking, and to obey the quad/pixel mask. + * Load: fbRGBA_reg = memory[color_tile + quad_offset] + * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking + * we could skip this load. + */ + spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg); + + + if (blend->blend_enable) { + /* convert packed tile colors in fbRGBA_reg to float[4] vectors */ + + // gen_blend_code(blend, f, mask_reg, ... ); + + } + + + + /* + * Write fragment colors to framebuffer/tile. + * This involves converting the fragment colors from float[4] to the + * tile's specific format and obeying the quad/pixel mask. + */ + { + const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format; + int rgba_reg = spe_allocate_available_register(f); + + /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */ + spe_cfltu(f, fragR_reg, fragR_reg, 32); + spe_cfltu(f, fragG_reg, fragG_reg, 32); + spe_cfltu(f, fragB_reg, fragB_reg, 32); + spe_cfltu(f, fragA_reg, fragA_reg, 32); + + /* Shift most the significant bytes to least the significant positions. + * I.e.: reg = reg >> 24 + */ + spe_rotmi(f, fragR_reg, fragR_reg, -24); + spe_rotmi(f, fragG_reg, fragG_reg, -24); + spe_rotmi(f, fragB_reg, fragB_reg, -24); + spe_rotmi(f, fragA_reg, fragA_reg, -24); + + /* Shift the color bytes according to the surface format */ + if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) { + spe_roti(f, fragG_reg, fragG_reg, 8); /* green <<= 8 */ + spe_roti(f, fragR_reg, fragR_reg, 16); /* red <<= 16 */ + spe_roti(f, fragA_reg, fragA_reg, 24); /* alpha <<= 24 */ + } + else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) { + spe_roti(f, fragR_reg, fragR_reg, 8); /* red <<= 8 */ + spe_roti(f, fragG_reg, fragG_reg, 16); /* green <<= 16 */ + spe_roti(f, fragB_reg, fragB_reg, 24); /* blue <<= 24 */ + } + else { + ASSERT(0); + } + + /* Merge red, green, blue, alpha registers to make packed RGBA colors. + * Eg: after shifting according to color_format we might have: + * R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000} + * G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600} + * B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099} + * A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000} + * OR-ing all those together gives us four packed colors: + * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699} + */ + spe_or(f, rgba_reg, fragR_reg, fragG_reg); + spe_or(f, rgba_reg, rgba_reg, fragB_reg); + spe_or(f, rgba_reg, rgba_reg, fragA_reg); + + /* Mix fragment colors with framebuffer colors using the quad/pixel mask: + * if (mask[i]) + * rgba[i] = rgba[i]; + * else + * rgba[i] = framebuffer[i]; + */ + spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg); + + /* Store updated quad in tile: + * memory[color_tile + quad_offset] = rgba_reg; + */ + spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg); + + spe_release_register(f, rgba_reg); + } + + printf("gen_fragment_ops nr instructions: %u\n", f->num_inst); + + spe_bi(f, SPE_REG_RA, 0, 0); /* return from function call */ + + + spe_release_register(f, fbRGBA_reg); + spe_release_register(f, fbZS_reg); + spe_release_register(f, quad_offset_reg); +} + diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h new file mode 100644 index 0000000000..0ea0fc690c --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h @@ -0,0 +1,38 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_GEN_FRAGMENT_H +#define CELL_GEN_FRAGMENT_H + + +extern void +gen_fragment_function(struct cell_context *cell, struct spe_function *f); + + +#endif /* CELL_GEN_FRAGMENT_H */ + diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index f2feaa329a..06777aac14 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -27,6 +27,7 @@ #include "util/u_memory.h" #include "cell_context.h" +#include "cell_gen_fragment.h" #include "cell_state.h" #include "cell_state_emit.h" #include "cell_state_per_fragment.h" @@ -83,6 +84,29 @@ cell_emit_state(struct cell_context *cell) fb->depth_format = zbuf ? zbuf->format : PIPE_FORMAT_NONE; fb->width = cell->framebuffer.width; fb->height = cell->framebuffer.height; +#if 0 + printf("EMIT color format %s\n", pf_name(fb->color_format)); + printf("EMIT depth format %s\n", pf_name(fb->depth_format)); +#endif + } + + + if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_DEPTH_STENCIL)) { + /* XXX we don't want to always do codegen here. We should have + * a hash/lookup table to cache previous results... + */ + struct cell_command_fragment_ops *fops + = cell_batch_alloc(cell, sizeof(*fops)); + struct spe_function spe_code; + + /* generate new code */ + gen_fragment_function(cell, &spe_code); + /* put the new code into the batch buffer */ + fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS; + memcpy(&fops->code, spe_code.store, + SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); + /* free codegen buffer */ + spe_release_func(&spe_code); } if (cell->dirty & CELL_NEW_BLEND) { @@ -90,8 +114,7 @@ cell_emit_state(struct cell_context *cell) if (cell->blend != NULL) { blend.base = (intptr_t) cell->blend->code.store; - blend.size = (char *) cell->blend->code.csr - - (char *) cell->blend->code.store; + blend.size = cell->blend->code.num_inst * SPE_INST_SIZE; blend.read_fb = TRUE; } else { @@ -108,10 +131,10 @@ cell_emit_state(struct cell_context *cell) if (cell->depth_stencil != NULL) { dsat.base = (intptr_t) cell->depth_stencil->code.store; - dsat.size = (char *) cell->depth_stencil->code.csr - - (char *) cell->depth_stencil->code.store; + dsat.size = cell->depth_stencil->code.num_inst * SPE_INST_SIZE; dsat.read_depth = TRUE; dsat.read_stencil = FALSE; + dsat.state = cell->depth_stencil->base; } else { dsat.base = 0; diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c index 705867107b..78cb446c14 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c @@ -1158,7 +1158,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb) static int PC_OFFSET(const struct spe_function *f, const void *d) { - const intptr_t pc = (intptr_t) f->csr; + const intptr_t pc = (intptr_t) &f->store[f->num_inst]; const intptr_t ea = ~0x0f & (intptr_t) d; return (ea - pc) >> 2; diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile index d49abb2e82..e285ae9fdb 100644 --- a/src/gallium/drivers/cell/spu/Makefile +++ b/src/gallium/drivers/cell/spu/Makefile @@ -43,7 +43,7 @@ INCLUDE_DIRS = \ $(SPU_CC) $(SPU_CFLAGS) -c $< .c.s: - $(SPU_CC) $(SPU_CFLAGS) -S $< + $(SPU_CC) $(SPU_CFLAGS) -O3 -S $< # The .a file will be linked into the main/PPU executable diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index c4236817a9..4e0ec15925 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -34,6 +34,7 @@ #include "spu_main.h" #include "spu_render.h" +#include "spu_per_fragment_op.h" #include "spu_texture.h" #include "spu_tile.h" //#include "spu_test.h" @@ -46,7 +47,7 @@ /* helpful headers: /usr/lib/gcc/spu/4.1.1/include/spu_mfcio.h -/opt/ibm/cell-sdk/prototype/sysroot/usr/include/libmisc.h +/opt/cell/sdk/usr/include/libmisc.h */ boolean Debug = FALSE; @@ -226,6 +227,24 @@ cmd_release_verts(const struct cell_command_release_verts *release) } +/** + * Process a CELL_CMD_STATE_FRAGMENT_OPS command. + * This involves installing new fragment ops SPU code. + * If this function is never called, we'll use a regular C fallback function + * for fragment processing. + */ +static void +cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) +{ + if (Debug) + printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id); + /* Copy SPU code from batch buffer to spu buffer */ + memcpy(spu.fragment_ops.code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); + /* Point function pointer at new code */ + spu.fragment_ops.func = (spu_fragment_ops_func) spu.fragment_ops.code; +} + + static void cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) { @@ -257,6 +276,8 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) break; case PIPE_FORMAT_Z24S8_UNORM: case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: spu.fb.zsize = 4; spu.fb.zscale = (float) 0x00ffffffu; break; @@ -282,6 +303,8 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) } +#define NEW_FRAGMENT_FUNCTION 01 + static void cmd_state_blend(const struct cell_command_blend *state) { @@ -302,7 +325,9 @@ cmd_state_blend(const struct cell_command_blend *state) wait_on_mask(1 << TAG_BATCH_BUFFER); spu.blend = (blend_func) fb_blend_code_buffer; spu.read_fb = state->read_fb; - } else { + } + else + { spu.read_fb = FALSE; } } @@ -326,7 +351,9 @@ cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *stat 0, /* tid */ 0 /* rid */); wait_on_mask(1 << TAG_BATCH_BUFFER); - } else { + } + else + { /* If there is no code, emit a return instruction. */ depth_stencil_code_buffer[0] = 0x35; @@ -338,12 +365,14 @@ cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *stat spu.frag_test = (frag_test_func) depth_stencil_code_buffer; spu.read_depth = state->read_depth; spu.read_stencil = state->read_stencil; + spu.depth_stencil_alpha = state->state; } static void cmd_state_logicop(const struct cell_command_logicop * code) { +#if !NEW_FRAGMENT_FUNCTION mfc_get(logicop_code_buffer, (unsigned int) code->base, /* src */ code->size, @@ -353,6 +382,7 @@ cmd_state_logicop(const struct cell_command_logicop * code) wait_on_mask(1 << TAG_BATCH_BUFFER); spu.logicop = (logicop_func) logicop_code_buffer; +#endif } @@ -455,7 +485,9 @@ cmd_finish(void) /** - * Execute a batch of commands + * Execute a batch of commands which was sent to us by the PPU. + * See the cell_emit_state.c code to see where the commands come from. + * * The opcode param encodes the location of the buffer and its size. */ static void @@ -519,6 +551,14 @@ cmd_batch(uint opcode) pos += pos_incr; } break; + case CELL_CMD_STATE_FRAGMENT_OPS: + { + struct cell_command_fragment_ops *fops + = (struct cell_command_fragment_ops *) &buffer[pos]; + cmd_state_fragment_ops(fops); + pos += sizeof(*fops) / 8; + } + break; case CELL_CMD_RELEASE_VERTS: { struct cell_command_release_verts *release @@ -680,6 +720,11 @@ one_time_init(void) memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status)); memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status)); invalidate_tex_cache(); + + /* Install default/fallback fragment processing function. + * This will normally be overriden by a code-gen'd function. + */ + spu.fragment_ops.func = spu_fallback_fragment_ops; } diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index c2a53c9dcf..7ab34f5222 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -91,6 +91,24 @@ typedef struct spu_blend_results (*logicop_func)( typedef vector float (*sample_texture_func)(uint unit, vector float texcoord); + +typedef void (*spu_fragment_ops_func)(uint x, uint y, + tile_t *colorTile, + tile_t *depthStencilTile, + vector float fragZ, + vector float fragRed, + vector float fragGreen, + vector float fragBlue, + vector float fragAlpha, + vector unsigned int mask); + +struct spu_fragment_ops +{ + uint code[SPU_MAX_FRAGMENT_OPS_INSTS]; + spu_fragment_ops_func func; /**< Current fragment ops function */ +} ALIGN16_ATTRIB; + + struct spu_framebuffer { void *color_start; /**< addr of color surface in main memory */ void *depth_start; /**< addr of depth surface in main memory */ @@ -127,6 +145,9 @@ struct spu_global struct cell_init_info init; struct spu_framebuffer fb; + + struct pipe_depth_stencil_alpha_state depth_stencil_alpha; + boolean read_depth; boolean read_stencil; frag_test_func frag_test; /**< Current depth/stencil test code */ @@ -142,6 +163,8 @@ struct spu_global struct vertex_info vertex_info; + struct spu_fragment_ops fragment_ops; + /* XXX more state to come */ diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c index 29dc07a2e8..ffc596aa62 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -29,8 +29,11 @@ * \author Ian Romanick */ + +#include #include "pipe/p_format.h" #include "spu_main.h" +#include "spu_colorpack.h" #include "spu_per_fragment_op.h" #define ZERO 0x80 @@ -90,7 +93,8 @@ read_ds_quad(tile_t *tile, unsigned x, unsigned y, break; } - case PIPE_FORMAT_S8Z24_UNORM: { + case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: { qword *ptr = (qword *) &tile->ui4[iy][ix]; *depth = si_and(*ptr, si_fsmbi(0x7777)); @@ -153,7 +157,8 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y, break; } - case PIPE_FORMAT_S8Z24_UNORM: { + case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: { qword *ptr = (qword *) &buffer->ui4[iy][ix]; /* form select mask = 0111,0111,0111,0111 */ qword mask = si_fsmbi(0x7777); @@ -217,3 +222,225 @@ spu_do_depth_stencil(int x, int y, return result.mask; } + + + + +/** + * Called by rasterizer for each quad after the shader has run. This + * is a fallback/debug function. In reality we'll use a generated + * function produced by the PPU. But this function is useful for + * debug/validation. + */ +void +spu_fallback_fragment_ops(uint x, uint y, + tile_t *colorTile, + tile_t *depthStencilTile, + vector float fragZ, + vector float fragRed, + vector float fragGreen, + vector float fragBlue, + vector float fragAlpha, + vector unsigned int mask) +{ + vector float frag_soa[4], frag_aos[4]; + unsigned int c0, c1, c2, c3; + + /* do alpha test */ + if (spu.depth_stencil_alpha.alpha.enabled) { + vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref); + vector unsigned int amask; + + switch (spu.depth_stencil_alpha.alpha.func) { + case PIPE_FUNC_LESS: + amask = spu_cmpgt(ref, fragAlpha); /* mask = (fragAlpha < ref) */ + break; + case PIPE_FUNC_GREATER: + amask = spu_cmpgt(fragAlpha, ref); /* mask = (fragAlpha > ref) */ + break; + case PIPE_FUNC_GEQUAL: + amask = spu_cmpgt(ref, fragAlpha); + amask = spu_nor(amask, amask); + break; + case PIPE_FUNC_LEQUAL: + amask = spu_cmpgt(fragAlpha, ref); + amask = spu_nor(amask, amask); + break; + case PIPE_FUNC_EQUAL: + amask = spu_cmpeq(ref, fragAlpha); + break; + case PIPE_FUNC_NOTEQUAL: + amask = spu_cmpeq(ref, fragAlpha); + amask = spu_nor(amask, amask); + break; + case PIPE_FUNC_ALWAYS: + amask = spu_splats(0xffffffffU); + break; + case PIPE_FUNC_NEVER: + amask = spu_splats( 0x0U); + break; + default: + ; + } + + mask = spu_and(mask, amask); + } + + /* Z and/or stencil testing... */ + if (spu.depth_stencil_alpha.depth.enabled || + spu.depth_stencil_alpha.stencil[0].enabled) { + + /* get four Z/Stencil values from tile */ + vector unsigned int mask24 = spu_splats((unsigned int)0x00ffffffU); + vector unsigned int ifbZS = depthStencilTile->ui4[y/2][x/2]; + vector unsigned int ifbZ = spu_and(ifbZS, mask24); + vector unsigned int ifbS = spu_andc(ifbZS, mask24); + + if (spu.depth_stencil_alpha.stencil[0].enabled) { + /* do stencil test */ + ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM); + + } + else if (spu.depth_stencil_alpha.depth.enabled) { + /* do depth test */ + + ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM || + spu.fb.depth_format == PIPE_FORMAT_X8Z24_UNORM); + + vector unsigned int ifragZ; + vector unsigned int zmask; + + /* convert four fragZ from float to uint */ + fragZ = spu_mul(fragZ, spu_splats((float) 0xffffff)); + ifragZ = spu_convtu(fragZ, 0); + + /* do depth comparison, setting zmask with results */ + switch (spu.depth_stencil_alpha.depth.func) { + case PIPE_FUNC_LESS: + zmask = spu_cmpgt(ifbZ, ifragZ); /* mask = (ifragZ < ifbZ) */ + break; + case PIPE_FUNC_GREATER: + zmask = spu_cmpgt(ifragZ, ifbZ); /* mask = (ifbZ > ifragZ) */ + break; + case PIPE_FUNC_GEQUAL: + zmask = spu_cmpgt(ifbZ, ifragZ); + zmask = spu_nor(zmask, zmask); + break; + case PIPE_FUNC_LEQUAL: + zmask = spu_cmpgt(ifragZ, ifbZ); + zmask = spu_nor(zmask, zmask); + break; + case PIPE_FUNC_EQUAL: + zmask = spu_cmpeq(ifbZ, ifragZ); + break; + case PIPE_FUNC_NOTEQUAL: + zmask = spu_cmpeq(ifbZ, ifragZ); + zmask = spu_nor(zmask, zmask); + break; + case PIPE_FUNC_ALWAYS: + zmask = spu_splats(0xffffffffU); + break; + case PIPE_FUNC_NEVER: + zmask = spu_splats( 0x0U); + break; + default: + ; + } + + mask = spu_and(mask, zmask); + + /* merge framebuffer Z and fragment Z according to the mask */ + ifbZ = spu_or(spu_and(ifragZ, mask), + spu_andc(ifbZ, mask)); + } + + if (spu_extract(spu_orx(mask), 0)) { + /* put new fragment Z/Stencil values back into Z/Stencil tile */ + depthStencilTile->ui4[y/2][x/2] = spu_or(ifbZ, ifbS); + + spu.cur_ztile_status = TILE_STATUS_DIRTY; + } + } + + /* XXX do blending here */ + + /* XXX do colormask test here */ + + + if (spu_extract(spu_orx(mask), 0)) { + spu.cur_ctile_status = TILE_STATUS_DIRTY; + } + else { + return; + } + + /* convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA */ +#if 0 + { + vector float frag_soa[4]; + frag_soa[0] = fragRed; + frag_soa[1] = fragGreen; + frag_soa[2] = fragBlue; + frag_soa[3] = fragAlpha; + _transpose_matrix4x4(frag_aos, frag_soa); + } +#else + /* short-cut relying on function parameter layout: */ + _transpose_matrix4x4(frag_aos, &fragRed); + (void) fragGreen; + (void) fragBlue; +#endif + + switch (spu.fb.color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + c0 = spu_pack_A8R8G8B8(frag_aos[0]); + c1 = spu_pack_A8R8G8B8(frag_aos[1]); + c2 = spu_pack_A8R8G8B8(frag_aos[2]); + c3 = spu_pack_A8R8G8B8(frag_aos[3]); + break; + + case PIPE_FORMAT_B8G8R8A8_UNORM: + c0 = spu_pack_B8G8R8A8(frag_aos[0]); + c1 = spu_pack_B8G8R8A8(frag_aos[1]); + c2 = spu_pack_B8G8R8A8(frag_aos[2]); + c3 = spu_pack_B8G8R8A8(frag_aos[3]); + break; + default: + fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n"); + ASSERT(0); + } + +#if 0 + /* + * Quad layout: + * +--+--+ + * |p0|p1| + * +--+--+ + * |p2|p3| + * +--+--+ + */ + if (spu_extract(mask, 0)) + colorTile->ui[y+0][x+0] = c0; + if (spu_extract(mask, 1)) + colorTile->ui[y+0][x+1] = c1; + if (spu_extract(mask, 2)) + colorTile->ui[y+1][x+0] = c2; + if (spu_extract(mask, 3)) + colorTile->ui[y+1][x+1] = c3; +#else + /* + * Quad layout: + * +--+--+--+--+ + * |p0|p1|p2|p3| + * +--+--+--+--+ + */ + if (spu_extract(mask, 0)) + colorTile->ui[y][x*2] = c0; + if (spu_extract(mask, 1)) + colorTile->ui[y][x*2+1] = c1; + if (spu_extract(mask, 2)) + colorTile->ui[y][x*2+2] = c2; + if (spu_extract(mask, 3)) + colorTile->ui[y][x*2+3] = c3; +#endif +} diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h index 6571258699..ffadf0661c 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h @@ -29,4 +29,15 @@ extern qword spu_do_depth_stencil(int x, int y, qword frag_mask, qword frag_depth, qword frag_alpha, qword facing); +extern void +spu_fallback_fragment_ops(uint x, uint y, + tile_t *colorTile, + tile_t *depthStencilTile, + vector float fragZ, + vector float fragRed, + vector float fragGreen, + vector float fragBlue, + vector float fragAlpha, + vector unsigned int mask); + #endif /* SPU_PER_FRAGMENT_OP */ diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index a3ea0a3e69..71ef6ca24f 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -297,9 +297,12 @@ emit_quad( int x, int y, mask_t mask ) sp->quad.first->run(sp->quad.first, &setup.quad); #else +#define NEW_FRAGMENT_FUNCTION 01 +#if !NEW_FRAGMENT_FUNCTION if (spu.read_depth) { mask = do_depth_test(x, y, mask); } +#endif /* If any bits in mask are set... */ if (spu_extract(spu_orx(mask), 0)) { @@ -308,6 +311,7 @@ emit_quad( int x, int y, mask_t mask ) vector float colors[4]; spu.cur_ctile_status = TILE_STATUS_DIRTY; + spu.cur_ztile_status = TILE_STATUS_DIRTY; if (spu.texture[0].start) { /* texture mapping */ @@ -355,6 +359,29 @@ emit_quad( int x, int y, mask_t mask ) } +#if NEW_FRAGMENT_FUNCTION + { + /* Convert fragment data from AoS to SoA format. + * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA) + * This is temporary! + */ + vector float soa_frag[4]; + _transpose_matrix4x4(soa_frag, colors); + + float4 fragZ; + + fragZ.v = eval_z((float) x, (float) y); + + /* Do all per-fragment/quad operations here, including: + * alpha test, z test, stencil test, blend and framebuffer writing. + */ + spu.fragment_ops.func(ix, iy, &spu.ctile, &spu.ztile, + fragZ.v, + soa_frag[0], soa_frag[1], + soa_frag[2], soa_frag[3], + mask); + } +#else /* Convert fragment data from AoS to SoA format. * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA) */ @@ -405,6 +432,9 @@ emit_quad( int x, int y, mask_t mask ) spu.ctile.ui[iy+0][ix+1] = spu_extract((vec_uint4) result.g, 0); spu.ctile.ui[iy+1][ix+0] = spu_extract((vec_uint4) result.b, 0); spu.ctile.ui[iy+1][ix+1] = spu_extract((vec_uint4) result.a, 0); + +#endif /* NEW_FRAGMENT_FUNCTION */ + } #endif } diff --git a/src/gallium/winsys/xlib/xm_api.c b/src/gallium/winsys/xlib/xm_api.c index b010513107..28bd6ceab4 100644 --- a/src/gallium/winsys/xlib/xm_api.c +++ b/src/gallium/winsys/xlib/xm_api.c @@ -349,12 +349,17 @@ create_xmesa_buffer(XMesaDrawable d, BufferType type, if (vis->mesa_visual.depthBits == 0) depthFormat = PIPE_FORMAT_NONE; +#ifdef GALLIUM_CELL /* XXX temporary for Cell! */ + else + depthFormat = PIPE_FORMAT_S8Z24_UNORM; +#else else if (vis->mesa_visual.depthBits <= 16) - depthFormat = PIPE_FORMAT_Z16_UNORM; + depthFormat = PIPE_FORMAT_Z16UNORM; else if (vis->mesa_visual.depthBits <= 24) depthFormat = PIPE_FORMAT_S8Z24_UNORM; else depthFormat = PIPE_FORMAT_Z32_UNORM; +#endif if (vis->mesa_visual.stencilBits == 8) { if (depthFormat == PIPE_FORMAT_S8Z24_UNORM) diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c index 5e9a1f92f1..c4a30d3702 100644 --- a/src/gallium/winsys/xlib/xm_winsys.c +++ b/src/gallium/winsys/xlib/xm_winsys.c @@ -275,6 +275,39 @@ xm_buffer_destroy(struct pipe_winsys *pws, } +/** + * For Cell. Basically, rearrange the pixels/quads from this layout: + * +--+--+--+--+ + * |p0|p1|p2|p3|.... + * +--+--+--+--+ + * + * to this layout: + * +--+--+ + * |p0|p1|.... + * +--+--+ + * |p2|p3| + * +--+--+ + */ +static void +twiddle_tile(uint *tile) +{ + uint tile2[TILE_SIZE * TILE_SIZE]; + int y, x; + + for (y = 0; y < TILE_SIZE; y+=2) { + for (x = 0; x < TILE_SIZE; x+=2) { + int k = 4 * (y/2 * TILE_SIZE/2 + x/2); + tile2[y * TILE_SIZE + (x + 0)] = tile[k]; + tile2[y * TILE_SIZE + (x + 1)] = tile[k+1]; + tile2[(y + 1) * TILE_SIZE + (x + 0)] = tile[k+2]; + tile2[(y + 1) * TILE_SIZE + (x + 1)] = tile[k+3]; + } + } + memcpy(tile, tile2, sizeof(tile2)); +} + + + /** * Display a surface that's in a tiled configuration. That is, all the * pixels for a TILE_SIZExTILE_SIZE block are contiguous in memory. @@ -321,6 +354,8 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf) ximage->data = (char *) xm_buf->data + offset; + twiddle_tile((uint *) ximage->data); + if (XSHM_ENABLED(xm_buf)) { #if defined(USE_XSHM) && !defined(XFree86Server) XShmPutImage(b->xm_visual->display, b->drawable, b->gc, -- cgit v1.2.3 From 283ffdf99605c536d00e03ad6ec91a6f8e006fc2 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 10:13:20 -0600 Subject: cell: checkpoint: remove more of the old per-fragment code --- src/gallium/drivers/cell/common.h | 2 + src/gallium/drivers/cell/ppu/Makefile | 1 - src/gallium/drivers/cell/ppu/cell_state_emit.c | 60 ++----------- src/gallium/drivers/cell/spu/spu_main.c | 115 +++---------------------- src/gallium/drivers/cell/spu/spu_main.h | 37 +------- 5 files changed, 19 insertions(+), 196 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index a62530c64d..61d2b7d1ae 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -146,6 +146,8 @@ struct cell_command_logicop struct cell_command_fragment_ops { uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_OPS */ + struct pipe_depth_stencil_alpha_state dsa; + struct pipe_blend_state blend; unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS]; }; diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile index b5a6fcb8de..8699f3f8ec 100644 --- a/src/gallium/drivers/cell/ppu/Makefile +++ b/src/gallium/drivers/cell/ppu/Makefile @@ -28,7 +28,6 @@ SOURCES = \ cell_gen_fragment.c \ cell_state_derived.c \ cell_state_emit.c \ - cell_state_per_fragment.c \ cell_state_shader.c \ cell_pipe_state.c \ cell_screen.c \ diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 06777aac14..2bfb976c59 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -55,23 +55,6 @@ emit_state_cmd(struct cell_context *cell, uint cmd, void cell_emit_state(struct cell_context *cell) { - if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_BLEND)) { - struct cell_command_logicop logicop; - - if (cell->logic_op.store != NULL) { - spe_release_func(& cell->logic_op); - } - - cell_generate_logic_op(& cell->logic_op, - & cell->blend->base, - cell->framebuffer.cbufs[0]); - - logicop.base = (intptr_t) cell->logic_op.store; - logicop.size = 64 * 4; - emit_state_cmd(cell, CELL_CMD_STATE_LOGICOP, &logicop, - sizeof(logicop)); - } - if (cell->dirty & CELL_NEW_FRAMEBUFFER) { struct pipe_surface *cbuf = cell->framebuffer.cbufs[0]; struct pipe_surface *zbuf = cell->framebuffer.zsbuf; @@ -91,7 +74,9 @@ cell_emit_state(struct cell_context *cell) } - if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_DEPTH_STENCIL)) { + if (cell->dirty & (CELL_NEW_FRAMEBUFFER | + CELL_NEW_DEPTH_STENCIL | + CELL_NEW_BLEND)) { /* XXX we don't want to always do codegen here. We should have * a hash/lookup table to cache previous results... */ @@ -105,47 +90,12 @@ cell_emit_state(struct cell_context *cell) fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS; memcpy(&fops->code, spe_code.store, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); + fops->dsa = cell->depth_stencil->base; + fops->blend = cell->blend->base; /* free codegen buffer */ spe_release_func(&spe_code); } - if (cell->dirty & CELL_NEW_BLEND) { - struct cell_command_blend blend; - - if (cell->blend != NULL) { - blend.base = (intptr_t) cell->blend->code.store; - blend.size = cell->blend->code.num_inst * SPE_INST_SIZE; - blend.read_fb = TRUE; - } - else { - blend.base = 0; - blend.size = 0; - blend.read_fb = FALSE; - } - - emit_state_cmd(cell, CELL_CMD_STATE_BLEND, &blend, sizeof(blend)); - } - - if (cell->dirty & CELL_NEW_DEPTH_STENCIL) { - struct cell_command_depth_stencil_alpha_test dsat; - - if (cell->depth_stencil != NULL) { - dsat.base = (intptr_t) cell->depth_stencil->code.store; - dsat.size = cell->depth_stencil->code.num_inst * SPE_INST_SIZE; - dsat.read_depth = TRUE; - dsat.read_stencil = FALSE; - dsat.state = cell->depth_stencil->base; - } - else { - dsat.base = 0; - dsat.size = 0; - dsat.read_depth = FALSE; - dsat.read_stencil = FALSE; - } - - emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL, &dsat, sizeof(dsat)); - } - if (cell->dirty & CELL_NEW_SAMPLER) { uint i; for (i = 0; i < CELL_MAX_SAMPLERS; i++) { diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 4e0ec15925..6afca19dfd 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -63,14 +63,6 @@ struct spu_vs_context draw; static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS] ALIGN16_ATTRIB; -static unsigned char depth_stencil_code_buffer[4 * 64] - ALIGN16_ATTRIB; - -static unsigned char fb_blend_code_buffer[4 * 64] - ALIGN16_ATTRIB; - -static unsigned char logicop_code_buffer[4 * 64] - ALIGN16_ATTRIB; /** @@ -240,8 +232,15 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id); /* Copy SPU code from batch buffer to spu buffer */ memcpy(spu.fragment_ops.code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); + /* Copy state info */ + memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); + memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); + /* Point function pointer at new code */ spu.fragment_ops.func = (spu_fragment_ops_func) spu.fragment_ops.code; + + spu.read_depth = spu.depth_stencil_alpha.depth.enabled; + spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled; } @@ -303,89 +302,6 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) } -#define NEW_FRAGMENT_FUNCTION 01 - -static void -cmd_state_blend(const struct cell_command_blend *state) -{ - if (Debug) - printf("SPU %u: BLEND: enabled %d\n", - spu.init.id, - (state->size != 0)); - - ASSERT_ALIGN16(state->base); - - if (state->size != 0) { - mfc_get(fb_blend_code_buffer, - (unsigned int) state->base, /* src */ - ROUNDUP16(state->size), - TAG_BATCH_BUFFER, - 0, /* tid */ - 0 /* rid */); - wait_on_mask(1 << TAG_BATCH_BUFFER); - spu.blend = (blend_func) fb_blend_code_buffer; - spu.read_fb = state->read_fb; - } - else - { - spu.read_fb = FALSE; - } -} - - -static void -cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *state) -{ - if (Debug) - printf("SPU %u: DEPTH_STENCIL: ztest %d\n", - spu.init.id, - state->read_depth); - - ASSERT_ALIGN16(state->base); - - if (state->size != 0) { - mfc_get(depth_stencil_code_buffer, - (unsigned int) state->base, /* src */ - ROUNDUP16(state->size), - TAG_BATCH_BUFFER, - 0, /* tid */ - 0 /* rid */); - wait_on_mask(1 << TAG_BATCH_BUFFER); - } - else - { - /* If there is no code, emit a return instruction. - */ - depth_stencil_code_buffer[0] = 0x35; - depth_stencil_code_buffer[1] = 0x00; - depth_stencil_code_buffer[2] = 0x00; - depth_stencil_code_buffer[3] = 0x00; - } - - spu.frag_test = (frag_test_func) depth_stencil_code_buffer; - spu.read_depth = state->read_depth; - spu.read_stencil = state->read_stencil; - spu.depth_stencil_alpha = state->state; -} - - -static void -cmd_state_logicop(const struct cell_command_logicop * code) -{ -#if !NEW_FRAGMENT_FUNCTION - mfc_get(logicop_code_buffer, - (unsigned int) code->base, /* src */ - code->size, - TAG_BATCH_BUFFER, - 0, /* tid */ - 0 /* rid */); - wait_on_mask(1 << TAG_BATCH_BUFFER); - - spu.logicop = (logicop_func) logicop_code_buffer; -#endif -} - - static void cmd_state_sampler(const struct cell_command_sampler *sampler) { @@ -571,15 +487,6 @@ cmd_batch(uint opcode) cmd_finish(); pos += 1; break; - case CELL_CMD_STATE_BLEND: - cmd_state_blend((struct cell_command_blend *) &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct cell_command_blend)) / 8); - break; - case CELL_CMD_STATE_DEPTH_STENCIL: - cmd_state_depth_stencil((struct cell_command_depth_stencil_alpha_test *) - &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct cell_command_depth_stencil_alpha_test)) / 8); - break; case CELL_CMD_STATE_SAMPLER: { struct cell_command_sampler *sampler @@ -614,19 +521,17 @@ cmd_batch(uint opcode) pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8); break; case CELL_CMD_STATE_BIND_VS: +#if 01 spu_bind_vertex_shader(&draw, (struct cell_shader_info *) &buffer[pos+1]); pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8); +#endif break; case CELL_CMD_STATE_ATTRIB_FETCH: cmd_state_attrib_fetch((struct cell_attribute_fetch_code *) &buffer[pos+1]); pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8); break; - case CELL_CMD_STATE_LOGICOP: - cmd_state_logicop((struct cell_command_logicop *) &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct cell_command_logicop)) / 8); - break; case CELL_CMD_FLUSH_BUFFER_RANGE: { struct cell_buffer_range *br = (struct cell_buffer_range *) &buffer[pos+1]; @@ -695,7 +600,9 @@ main_loop(void) exitFlag = 1; break; case CELL_CMD_VS_EXECUTE: +#if 01 spu_execute_vertex_shader(&draw, &cmd.vs); +#endif break; case CELL_CMD_BATCH: cmd_batch(opcode); diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 7ab34f5222..f0f8be47db 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -60,35 +60,6 @@ typedef union { #define TILE_STATUS_GETTING 5 /**< mfc_get() called but not yet arrived */ -struct spu_frag_test_results { - qword mask; - qword depth; - qword stencil; -}; - -typedef struct spu_frag_test_results (*frag_test_func)(qword frag_mask, - qword pixel_depth, qword pixel_stencil, qword frag_depth, - qword frag_alpha, qword facing); - - -struct spu_blend_results { - qword r; - qword g; - qword b; - qword a; -}; - -typedef struct spu_blend_results (*blend_func)( - qword frag_r, qword frag_g, qword frag_b, qword frag_a, - qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a, - qword const_r, qword const_g, qword const_b, qword const_a); - -typedef struct spu_blend_results (*logicop_func)( - qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a, - qword frag_r, qword frag_g, qword frag_b, qword frag_a, - qword frag_mask); - - typedef vector float (*sample_texture_func)(uint unit, vector float texcoord); @@ -147,16 +118,10 @@ struct spu_global struct spu_framebuffer fb; struct pipe_depth_stencil_alpha_state depth_stencil_alpha; + struct pipe_blend_state blend; boolean read_depth; boolean read_stencil; - frag_test_func frag_test; /**< Current depth/stencil test code */ - - boolean read_fb; /**< Does current blend mode require framebuffer read? */ - blend_func blend; /**< Current blend code */ - qword const_blend_color[4] ALIGN16_ATTRIB; - - logicop_func logicop; /**< Current logicop code **/ struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS]; struct spu_texture texture[PIPE_MAX_SAMPLERS]; -- cgit v1.2.3 From f19903aa83e9b6e18930cbda14cfec3cca2e1bf2 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 10:26:00 -0600 Subject: cell: remove old blend/depth/stencil/logicop structs --- src/gallium/drivers/cell/common.h | 29 ----------------------------- 1 file changed, 29 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 61d2b7d1ae..8aa2b23ec0 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -111,35 +111,6 @@ #define CELL_DEBUG_SYNC (1 << 1) -/** - */ -struct cell_command_depth_stencil_alpha_test -{ - uint64_t base; /**< Effective address of code start. */ - unsigned size; /**< Size in bytes of SPE code. */ - unsigned read_depth; /**< Flag: should depth be read? */ - unsigned read_stencil; /**< Flag: should stencil be read? */ - struct pipe_depth_stencil_alpha_state state; -}; - - -/** - * Upload code to perform framebuffer blend operation - */ -struct cell_command_blend -{ - uint64_t base; /**< Effective address of code start. */ - unsigned size; /**< Size in bytes of SPE code. */ - unsigned read_fb; /**< Flag: should framebuffer be read? */ -}; - - -struct cell_command_logicop -{ - uint64_t base; /**< Effective address of code start. */ - unsigned size; /**< Size in bytes of SPE code. */ -}; - #define SPU_MAX_FRAGMENT_OPS_INSTS 64 -- cgit v1.2.3 From 73c6ae98c1c60635883a733f36d59d246e74aa2a Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 10:38:37 -0600 Subject: cell: remove old state CMDs, added comments --- src/gallium/drivers/cell/common.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 8aa2b23ec0..e989d8c2e5 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -84,7 +84,7 @@ #define CELL_CMD_BATCH 5 #define CELL_CMD_RELEASE_VERTS 6 #define CELL_CMD_STATE_FRAMEBUFFER 10 -#define CELL_CMD_STATE_DEPTH_STENCIL 11 +#define CELL_CMD_STATE_FRAGMENT_OPS 11 #define CELL_CMD_STATE_SAMPLER 12 #define CELL_CMD_STATE_TEXTURE 13 #define CELL_CMD_STATE_VERTEX_INFO 14 @@ -92,12 +92,9 @@ #define CELL_CMD_STATE_UNIFORMS 16 #define CELL_CMD_STATE_VS_ARRAY_INFO 17 #define CELL_CMD_STATE_BIND_VS 18 -#define CELL_CMD_STATE_BLEND 19 #define CELL_CMD_STATE_ATTRIB_FETCH 20 -#define CELL_CMD_STATE_LOGICOP 21 #define CELL_CMD_VS_EXECUTE 22 #define CELL_CMD_FLUSH_BUFFER_RANGE 23 -#define CELL_CMD_STATE_FRAGMENT_OPS 24 #define CELL_NUM_BUFFERS 4 @@ -112,8 +109,13 @@ +/** Max instructions for doing per-fragment operations */ #define SPU_MAX_FRAGMENT_OPS_INSTS 64 + +/** + * Command to specify per-fragment operations state and generated code. + */ struct cell_command_fragment_ops { uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_OPS */ @@ -159,13 +161,15 @@ struct cell_array_info }; -struct cell_attribute_fetch_code { +struct cell_attribute_fetch_code +{ uint64_t base; uint size; }; -struct cell_buffer_range { +struct cell_buffer_range +{ uint64_t base; unsigned size; }; -- cgit v1.2.3 From aa66f08a21b791f338b519f0c2162cd8f7b3aeb0 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 17:59:52 -0600 Subject: cell: initial support for fragment shader code generation. TGSI shaders are translated into SPE instructions which are then sent to the SPEs for execution. Only a few opcodes work, no swizzling yet, no support for constants/immediates, etc. --- src/gallium/drivers/cell/common.h | 15 + src/gallium/drivers/cell/ppu/Makefile | 1 + src/gallium/drivers/cell/ppu/cell_context.h | 1 + src/gallium/drivers/cell/ppu/cell_gen_fp.c | 523 +++++++++++++++++++++++ src/gallium/drivers/cell/ppu/cell_gen_fp.h | 42 ++ src/gallium/drivers/cell/ppu/cell_state_emit.c | 16 + src/gallium/drivers/cell/ppu/cell_state_shader.c | 8 +- src/gallium/drivers/cell/spu/spu_main.c | 25 +- src/gallium/drivers/cell/spu/spu_main.h | 15 + src/gallium/drivers/cell/spu/spu_tri.c | 35 ++ 10 files changed, 678 insertions(+), 3 deletions(-) create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fp.c create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fp.h (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index e989d8c2e5..cb0631baf5 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -92,6 +92,7 @@ #define CELL_CMD_STATE_UNIFORMS 16 #define CELL_CMD_STATE_VS_ARRAY_INFO 17 #define CELL_CMD_STATE_BIND_VS 18 +#define CELL_CMD_STATE_FRAGMENT_PROGRAM 19 #define CELL_CMD_STATE_ATTRIB_FETCH 20 #define CELL_CMD_VS_EXECUTE 22 #define CELL_CMD_FLUSH_BUFFER_RANGE 23 @@ -125,6 +126,20 @@ struct cell_command_fragment_ops }; +/** Max instructions for fragment programs */ +#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128 + +/** + * Command to send a fragment progra to SPUs. + */ +struct cell_command_fragment_program +{ + uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */ + uint num_inst; /**< Number of instructions */ + unsigned code[SPU_MAX_FRAGMENT_PROGRAM_INSTS]; +}; + + /** * Tell SPUs about the framebuffer size, location */ diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile index 8699f3f8ec..b28f4c5c31 100644 --- a/src/gallium/drivers/cell/ppu/Makefile +++ b/src/gallium/drivers/cell/ppu/Makefile @@ -26,6 +26,7 @@ SOURCES = \ cell_draw_arrays.c \ cell_flush.c \ cell_gen_fragment.c \ + cell_gen_fp.c \ cell_state_derived.c \ cell_state_emit.c \ cell_state_shader.c \ diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h index 8cec9f45b2..14914b9c6f 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.h +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -61,6 +61,7 @@ struct cell_fragment_shader_state { struct pipe_shader_state shader; struct tgsi_shader_info info; + struct spe_function code; void *data; }; diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c new file mode 100644 index 0000000000..6ffe94eb14 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -0,0 +1,523 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +/** + * Generate SPU fragment program/shader code. + * + * Note that we generate SOA-style code here. So each TGSI instruction + * operates on four pixels (and is translated into four SPU instructions, + * generally speaking). + * + * \author Brian Paul + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_dump.h" +#include "rtasm/rtasm_ppc_spe.h" +#include "util/u_memory.h" +#include "cell_context.h" +#include "cell_gen_fp.h" + + +/** Set to 1 to enable debug/disassembly printfs */ +#define DISASSEM 01 + + +/** + * Context needed during code generation. + */ +struct codegen +{ + int inputs_reg; /**< 1st function parameter */ + int outputs_reg; /**< 2nd function parameter */ + int constants_reg; /**< 3rd function parameter */ + int temp_regs[8][4]; /**< maps TGSI temps to SPE registers */ + + int one_reg; /**< register containing {1.0, 1.0, 1.0, 1.0} */ + + /** Per-instruction temps / intermediate temps */ + int num_itemps; + int itemps[3]; + + struct spe_function *f; + boolean error; +}; + + +/** + * Allocate an intermediate temporary register. + */ +static int +get_itemp(struct codegen *gen) +{ + int t = spe_allocate_available_register(gen->f); + assert(gen->num_itemps < Elements(gen->itemps)); + gen->itemps[gen->num_itemps++] = t; + return t; +} + +/** + * Free all intermediate temporary registers. To be called after each + * instruction has been emitted. + */ +static void +free_itemps(struct codegen *gen) +{ + int i; + for (i = 0; i < gen->num_itemps; i++) { + spe_release_register(gen->f, gen->itemps[i]); + } + gen->num_itemps = 0; +} + + +/** + * Return index of an SPE register containing {1.0, 1.0, 1.0, 1.0}. + * The register is allocated and initialized upon the first call. + */ +static int +get_const_one_reg(struct codegen *gen) +{ + if (gen->one_reg <= 0) { + gen->one_reg = spe_allocate_available_register(gen->f); + } + + /* one = {1.0, 1.0, 1.0, 1.0} */ + spe_load_float(gen->f, gen->one_reg, 1.0f); +#if DISASSEM + printf("il\tr%d, 1.0f\n", gen->one_reg); +#endif + + return gen->one_reg; +} + + +/** + * Return the index of the SPU temporary containing the named TGSI + * source register. If the TGSI register is a TGSI_FILE_TEMPORARY we + * just return the corresponding SPE register. If the TGIS register + * is TGSI_FILE_INPUT/CONSTANT/IMMEDIATE we allocate a new SPE register + * and emit an SPE load instruction. + */ +static int +get_src_reg(struct codegen *gen, + int channel, + const struct tgsi_full_src_register *src) +{ + int reg; + + /* XXX need to examine src swizzle info here. + * That will involve changing the channel var... + */ + + + switch (src->SrcRegister.File) { + case TGSI_FILE_TEMPORARY: + reg = gen->temp_regs[src->SrcRegister.Index][channel]; + break; + case TGSI_FILE_INPUT: + { + /* offset is measured in quadwords, not bytes */ + int offset = src->SrcRegister.Index * 4 + channel; + reg = get_itemp(gen); + /* Load: reg = memory[(machine_reg) + offset] */ + spe_lqd(gen->f, reg, gen->inputs_reg, offset); +#if DISASSEM + printf("lqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset); +#endif + } + break; + case TGSI_FILE_IMMEDIATE: + /* xxx fall-through for now / fix */ + case TGSI_FILE_CONSTANT: + /* xxx fall-through for now / fix */ + default: + assert(0); + } + + return reg; +} + + +/** + * Return the index of an SPE register to use for the given TGSI register. + * If the TGSI register is TGSI_FILE_TEMPORARAY, the index of the + * corresponding SPE register is returned. If the TGSI register is + * TGSI_FILE_OUTPUT we allocate an intermediate temporary register. + * See store_dest_reg() below... + */ +static int +get_dst_reg(struct codegen *gen, + int channel, + const struct tgsi_full_dst_register *dest) +{ + int reg; + + switch (dest->DstRegister.File) { + case TGSI_FILE_TEMPORARY: + reg = gen->temp_regs[dest->DstRegister.Index][channel]; + break; + case TGSI_FILE_OUTPUT: + reg = get_itemp(gen); + break; + default: + assert(0); + } + + return reg; +} + + +/** + * When a TGSI instruction is writing to an output register, this + * function emits the SPE store instruction to store the value_reg. + * \param value_reg the SPE register containing the value to store. + * This would have been returned by get_dst_reg(). + */ +static void +store_dest_reg(struct codegen *gen, + int value_reg, int channel, + const struct tgsi_full_dst_register *dest) +{ + switch (dest->DstRegister.File) { + case TGSI_FILE_TEMPORARY: + /* no-op */ + break; + case TGSI_FILE_OUTPUT: + { + /* offset is measured in quadwords, not bytes */ + int offset = dest->DstRegister.Index * 4 + channel; + /* Store: memory[(machine_reg) + offset] = reg */ + spe_stqd(gen->f, value_reg, gen->outputs_reg, offset); +#if DISASSEM + printf("stqd\tr%d, r%d + %d\n", value_reg, gen->outputs_reg, offset); +#endif + } + break; + default: + assert(0); + } +} + + +static boolean +emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* XXX we don't always need to actually emit a mov instruction here */ + spe_move(gen->f, dst_reg, src_reg); +#if DISASSEM + printf("mov\tr%d, r%d\n", dst_reg, src_reg); +#endif + store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + + +/** + * Emit addition instructions. Recall that a single TGSI_OPCODE_ADD + * becomes (up to) four SPU "fa" instructions because we're doing SOA + * processing. + */ +static boolean +emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + /* Loop over Red/Green/Blue/Alpha channels */ + for (ch = 0; ch < 4; ch++) { + /* If the dest R, G, B or A writemask is enabled... */ + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + /* get indexes of the two src, one dest SPE registers */ + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* Emit actual SPE instruction: d = s1 + s2 */ + spe_fa(gen->f, d_reg, s1_reg, s2_reg); +#if DISASSEM + printf("fa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); +#endif + + /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + /* Free any intermediate temps we allocated */ + free_itemps(gen); + } + } + return true; +} + + +/** + * Emit multiply. See emit_ADD for comments. + */ +static boolean +emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* d = s1 * s2 */ + spe_fm(gen->f, d_reg, s1_reg, s2_reg); +#if DISASSEM + printf("fm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); +#endif + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + + +/** + * Emit set-if-greater-than. + * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as + * the result but OpenGL/TGSI needs 0.0 and 1.0 results. + * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND. + */ +static boolean +emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 > s2) */ + spe_fcgt(gen->f, d_reg, s1_reg, s2_reg); +#if DISASSEM + printf("fcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); +#endif + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = d & one_reg */ + spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); +#if DISASSEM + printf("and\tr%d, r%d, r%d\n", d_reg, d_reg, get_const_one_reg(gen)); +#endif + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + + +/** + * Emit END instruction. + * We just return from the shader function at this point. + * + * Note that there may be more code after this that would be + * called by TGSI_OPCODE_CALL. + */ +static boolean +emit_END(struct codegen *gen) +{ + /* return from function call */ + spe_bi(gen->f, SPE_REG_RA, 0, 0); +#if DISASSEM + printf("bi\trRA\n"); +#endif + return true; +} + + +/** + * Emit code for the given instruction. Just a big switch stmt. + */ +static boolean +emit_instruction(struct codegen *gen, + const struct tgsi_full_instruction *inst) +{ + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_MOV: + return emit_MOV(gen, inst); + case TGSI_OPCODE_MUL: + return emit_MUL(gen, inst); + case TGSI_OPCODE_ADD: + return emit_ADD(gen, inst); + case TGSI_OPCODE_SGT: + return emit_SGT(gen, inst); + case TGSI_OPCODE_END: + return emit_END(gen); + + /* XXX lots more cases to do... */ + + default: + return false; + } + + return true; +} + + + +/** + * Emit "code" for a TGSI declaration. + * We only care about TGSI TEMPORARY register declarations at this time. + * For each TGSI TEMPORARY we allocate four SPE registers. + */ +static void +emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl) +{ + int i, ch; + + switch (decl->Declaration.File) { + case TGSI_FILE_TEMPORARY: +#if DISASSEM + printf("Declare temp reg %d .. %d\n", + decl->DeclarationRange.First, + decl->DeclarationRange.Last); +#endif + for (i = decl->DeclarationRange.First; + i <= decl->DeclarationRange.Last; + i++) { + for (ch = 0; ch < 4; ch++) { + gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f); + } + + /* XXX if we run out of SPE registers, we need to spill + * to SPU memory. someday... + */ + +#if DISASSEM + printf(" SPE regs: %d %d %d %d\n", + gen->temp_regs[i][0], + gen->temp_regs[i][1], + gen->temp_regs[i][2], + gen->temp_regs[i][3]); +#endif + } + break; + default: + ; /* ignore */ + } +} + + +/** + * Translate TGSI shader code to SPE instructions. This is done when + * the state tracker gives us a new shader (via pipe->create_fs_state()). + * + * \param cell the rendering context (in) + * \param tokens the TGSI shader (in) + * \param f the generated function (out) + */ +boolean +cell_gen_fragment_program(struct cell_context *cell, + const struct tgsi_token *tokens, + struct spe_function *f) +{ + struct tgsi_parse_context parse; + struct codegen gen; + + memset(&gen, 0, sizeof(gen)); + gen.f = f; + + /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */ + gen.inputs_reg = 3; /* pointer to inputs array */ + gen.outputs_reg = 4; /* pointer to outputs array */ + gen.constants_reg = 5; /* pointer to constants array */ + + spe_init_func(f, SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE); + spe_allocate_register(f, gen.inputs_reg); + spe_allocate_register(f, gen.outputs_reg); + spe_allocate_register(f, gen.constants_reg); + +#if DISASSEM + printf("Begin %s\n", __FUNCTION__); + tgsi_dump(tokens, 0); +#endif + + tgsi_parse_init(&parse, tokens); + + while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: +#if 0 + if (!note_immediate(&gen, &parse.FullToken.FullImmediate )) + goto fail; +#endif + break; + + case TGSI_TOKEN_TYPE_DECLARATION: + emit_declaration(&gen, &parse.FullToken.FullDeclaration); + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + if (!emit_instruction(&gen, &parse.FullToken.FullInstruction )) { + gen.error = true; + } + break; + + default: + assert(0); + + } + } + + + if (gen.error) { + /* terminate the SPE code */ + return emit_END(&gen); + } + +#if DISASSEM + printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst); + printf("End %s\n", __FUNCTION__); +#endif + + tgsi_parse_free( &parse ); + + return !gen.error; +} diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.h b/src/gallium/drivers/cell/ppu/cell_gen_fp.h new file mode 100644 index 0000000000..99faea7046 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.h @@ -0,0 +1,42 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +#ifndef CELL_GEN_FP_H +#define CELL_GEN_FP_H + + + +extern boolean +cell_gen_fragment_program(struct cell_context *cell, + const struct tgsi_token *tokens, + struct spe_function *f); + + +#endif /* CELL_GEN_FP_H */ + diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 3ebf0749ad..2da3097983 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -73,6 +73,22 @@ cell_emit_state(struct cell_context *cell) #endif } + if (cell->dirty & (CELL_NEW_FS)) { + /* Send new fragment program to SPUs */ + struct cell_command_fragment_program *fp + = cell_batch_alloc(cell, sizeof(*fp)); + fp->opcode = CELL_CMD_STATE_FRAGMENT_PROGRAM; + fp->num_inst = cell->fs->code.num_inst; + memcpy(&fp->code, cell->fs->code.store, + SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE); + if (0) { + int i; + printf("PPU Emit CELL_CMD_STATE_FRAGMENT_PROGRAM:\n"); + for (i = 0; i < fp->num_inst; i++) { + printf(" %3d: 0x%08x\n", i, fp->code[i]); + } + } + } if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_DEPTH_STENCIL | diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c index 97e44eeb1a..3a0d066da2 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_shader.c +++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c @@ -34,7 +34,7 @@ #include "cell_context.h" #include "cell_state.h" - +#include "cell_gen_fp.h" /** cast wrapper */ @@ -61,7 +61,7 @@ static void * cell_create_fs_state(struct pipe_context *pipe, const struct pipe_shader_state *templ) { - /*struct cell_context *cell = cell_context(pipe);*/ + struct cell_context *cell = cell_context(pipe); struct cell_fragment_shader_state *cfs; cfs = CALLOC_STRUCT(cell_fragment_shader_state); @@ -76,6 +76,8 @@ cell_create_fs_state(struct pipe_context *pipe, tgsi_scan_shader(templ->tokens, &cfs->info); + cell_gen_fragment_program(cell, cfs->shader.tokens, &cfs->code); + return cfs; } @@ -102,6 +104,8 @@ cell_delete_fs_state(struct pipe_context *pipe, void *fs) { struct cell_fragment_shader_state *cfs = cell_fragment_shader_state(fs); + spe_release_func(&cfs->code); + FREE((void *) cfs->shader.tokens); FREE(cfs); } diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 2a7cb75f59..78260c4259 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -232,7 +232,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id); /* Copy SPU code from batch buffer to spu buffer */ memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); - /* Copy state info */ + /* Copy state info (for fallback case only) */ memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); @@ -244,6 +244,21 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) } +static void +cmd_state_fragment_program(const struct cell_command_fragment_program *fp) +{ + if (Debug) + printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id); + /* Copy SPU code from batch buffer to spu buffer */ + memcpy(spu.fragment_program_code, fp->code, + SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4); +#if 01 + /* Point function pointer at new code */ + spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code; +#endif +} + + static void cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) { @@ -473,6 +488,14 @@ cmd_batch(uint opcode) pos += sizeof(*fops) / 8; } break; + case CELL_CMD_STATE_FRAGMENT_PROGRAM: + { + struct cell_command_fragment_program *fp + = (struct cell_command_fragment_program *) &buffer[pos]; + cmd_state_fragment_program(fp); + pos += sizeof(*fp) / 8; + } + break; case CELL_CMD_STATE_SAMPLER: { struct cell_command_sampler *sampler diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index d40539da83..2c7b625840 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -75,6 +75,12 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y, vector float fragAlpha, vector unsigned int mask); +/** Function for running fragment program */ +typedef void (*spu_fragment_program_func)(vector float *inputs, + vector float *outputs, + vector float *constants); + + struct spu_framebuffer { void *color_start; /**< addr of color surface in main memory */ @@ -142,9 +148,18 @@ struct spu_global /** Current fragment ops function */ spu_fragment_ops_func fragment_ops; + /** Current fragment program machine code */ + uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS]; + /** Current fragment ops function */ + spu_fragment_program_func fragment_program; + /** Current texture sampler function */ spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS]; + /** Fragment program constants (XXX preliminary/used) */ +#define MAX_CONSTANTS 32 + vector float constants[MAX_CONSTANTS]; + } ALIGN16_ATTRIB; diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index f02cdd1f76..8b93878192 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -314,7 +314,42 @@ emit_quad( int x, int y, mask_t mask ) } else { /* simple shading */ +#if 0 eval_coeff(1, (float) x, (float) y, colors); + +#else + /* XXX new fragment program code */ + + if (spu.fragment_program) { + vector float inputs[4*4], outputs[2*4]; + + /* setup inputs */ + eval_coeff(1, (float) x, (float) y, inputs); + + /* Execute the current fragment program */ + spu.fragment_program(inputs, outputs, spu.constants); + + /* Copy outputs */ + colors[0] = outputs[0*4+0]; + colors[1] = outputs[0*4+1]; + colors[2] = outputs[0*4+2]; + colors[3] = outputs[0*4+3]; + + if (0 && spu.init.id==0 && y == 48) { + printf("colors[0] = %f %f %f %f\n", + spu_extract(colors[0], 0), + spu_extract(colors[0], 1), + spu_extract(colors[0], 2), + spu_extract(colors[0], 3)); + printf("colors[1] = %f %f %f %f\n", + spu_extract(colors[1], 0), + spu_extract(colors[1], 1), + spu_extract(colors[1], 2), + spu_extract(colors[1], 3)); + } + + } +#endif } -- cgit v1.2.3 From 32250eb959b1355b2f6984ea892a86a6ecf9d3c3 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 15 Sep 2008 19:38:39 -0600 Subject: cell: export CELL_DEBUG=asm to dump SPU assembly code --- src/gallium/drivers/cell/common.h | 3 +- src/gallium/drivers/cell/ppu/cell_context.c | 1 + src/gallium/drivers/cell/ppu/cell_gen_fp.c | 56 ++++++++++++++--------------- 3 files changed, 31 insertions(+), 29 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index cb0631baf5..8f08854117 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -106,7 +106,8 @@ #define CELL_DEBUG_CHECKER (1 << 0) -#define CELL_DEBUG_SYNC (1 << 1) +#define CELL_DEBUG_ASM (1 << 1) +#define CELL_DEBUG_SYNC (1 << 2) diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c index 0a5c0baa47..b418271dca 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.c +++ b/src/gallium/drivers/cell/ppu/cell_context.c @@ -87,6 +87,7 @@ cell_draw_create(struct cell_context *cell) static const struct debug_named_value cell_debug_flags[] = { {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */ + {"asm", CELL_DEBUG_ASM}, /**< dump SPU asm code */ {"sync", CELL_DEBUG_SYNC}, /**< SPUs do synchronous DMA */ {NULL, 0} }; diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c index 7a672478c5..98ee5af279 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -51,10 +51,6 @@ #include "cell_gen_fp.h" -/** Set to 1 to enable debug/disassembly printfs */ -#define DISASSEM 0 - - #define MAX_TEMPS 16 #define MAX_IMMED 8 @@ -864,6 +860,8 @@ emit_instruction(struct codegen *gen, /* XXX lots more cases to do... */ default: + fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n", + inst->Instruction.Opcode); return false; } @@ -914,17 +912,19 @@ emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed) * For each TGSI TEMPORARY we allocate four SPE registers. */ static boolean -emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl) +emit_declaration(struct cell_context *cell, + struct codegen *gen, const struct tgsi_full_declaration *decl) { int i, ch; switch (decl->Declaration.File) { case TGSI_FILE_TEMPORARY: -#if DISASSEM - printf("Declare temp reg %d .. %d\n", - decl->DeclarationRange.First, - decl->DeclarationRange.Last); -#endif + if (cell->debug_flags & CELL_DEBUG_ASM) { + printf("Declare temp reg %d .. %d\n", + decl->DeclarationRange.First, + decl->DeclarationRange.Last); + } + for (i = decl->DeclarationRange.First; i <= decl->DeclarationRange.Last; i++) { @@ -939,13 +939,13 @@ emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl) * to SPU memory. someday... */ -#if DISASSEM - printf(" SPE regs: %d %d %d %d\n", - gen->temp_regs[i][0], - gen->temp_regs[i][1], - gen->temp_regs[i][2], - gen->temp_regs[i][3]); -#endif + if (cell->debug_flags & CELL_DEBUG_ASM) { + printf(" SPE regs: %d %d %d %d\n", + gen->temp_regs[i][0], + gen->temp_regs[i][1], + gen->temp_regs[i][2], + gen->temp_regs[i][3]); + } } break; default: @@ -985,12 +985,12 @@ cell_gen_fragment_program(struct cell_context *cell, spe_allocate_register(f, gen.outputs_reg); spe_allocate_register(f, gen.constants_reg); -#if DISASSEM - spe_print_code(f, true); - spe_indent(f, 8); - printf("Begin %s\n", __FUNCTION__); - tgsi_dump(tokens, 0); -#endif + if (cell->debug_flags & CELL_DEBUG_ASM) { + spe_print_code(f, true); + spe_indent(f, 8); + printf("Begin %s\n", __FUNCTION__); + tgsi_dump(tokens, 0); + } tgsi_parse_init(&parse, tokens); @@ -1004,7 +1004,7 @@ cell_gen_fragment_program(struct cell_context *cell, break; case TGSI_TOKEN_TYPE_DECLARATION: - if (!emit_declaration(&gen, &parse.FullToken.FullDeclaration)) + if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration)) gen.error = true; break; @@ -1024,10 +1024,10 @@ cell_gen_fragment_program(struct cell_context *cell, return emit_END(&gen); } -#if DISASSEM - printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst); - printf("End %s\n", __FUNCTION__); -#endif + if (cell->debug_flags & CELL_DEBUG_ASM) { + printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst); + printf("End %s\n", __FUNCTION__); + } tgsi_parse_free( &parse ); -- cgit v1.2.3 From 858ced051551aa5d0ddd41936253d3a4ee5c142f Mon Sep 17 00:00:00 2001 From: Robert Ellison Date: Wed, 17 Sep 2008 02:30:20 -0600 Subject: CELL: fleshing out the blending fragment ops - Added two new debug flags (to be used with the CELL_DEBUG environment variable). The first, "CELL_DEBUG=fragops", activates SPE fragment ops debug messages. The second, "CELL_DEBUG=fragopfallback", will eventually be used to disable the use of generated SPE code for fragment ops in favor of the default fallback reference routine. (During development, though, the parity of this flag is reversed: all users will get the reference code *unless* CELL_DEBUG=fragopfallback is set. This will prevent hiccups in code generation from affecting the other developers.) - Formalized debug message usage and macros in spu/spu_main.c. - Added lots of new code to ppu/cell_gen_fragment.c to extend the number of supported source RGB factors from 4 to 15, and to complete the list of supported blend equations. More coming, to complete the source and destination RGB and alpha factors, and to complete the rest of the fragment operations... --- src/gallium/drivers/cell/common.h | 11 +- src/gallium/drivers/cell/ppu/cell_context.c | 2 + src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 272 ++++++++++++++++++++++- src/gallium/drivers/cell/ppu/cell_state_emit.c | 5 + src/gallium/drivers/cell/spu/spu_main.c | 115 +++++----- 5 files changed, 337 insertions(+), 68 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 8f08854117..f0ff96eb47 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -104,12 +104,11 @@ #define CELL_BUFFER_STATUS_FREE 10 #define CELL_BUFFER_STATUS_USED 20 - -#define CELL_DEBUG_CHECKER (1 << 0) -#define CELL_DEBUG_ASM (1 << 1) -#define CELL_DEBUG_SYNC (1 << 2) - - +#define CELL_DEBUG_CHECKER (1 << 0) +#define CELL_DEBUG_ASM (1 << 1) +#define CELL_DEBUG_SYNC (1 << 2) +#define CELL_DEBUG_FRAGMENT_OPS (1 << 3) +#define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4) /** Max instructions for doing per-fragment operations */ #define SPU_MAX_FRAGMENT_OPS_INSTS 64 diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c index b418271dca..62e213ea35 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.c +++ b/src/gallium/drivers/cell/ppu/cell_context.c @@ -89,6 +89,8 @@ static const struct debug_named_value cell_debug_flags[] = { {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */ {"asm", CELL_DEBUG_ASM}, /**< dump SPU asm code */ {"sync", CELL_DEBUG_SYNC}, /**< SPUs do synchronous DMA */ + {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/ + {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/ {NULL, 0} }; diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 06219d4e98..2c8c9e0d2c 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -229,7 +229,36 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, spe_release_register(f, amask_reg); } +/* This is a convenient and oft-used sequence. It chooses + * the smaller of each element of reg1 and reg2, and combines them + * into the result register, as follows: + * + * The Float Compare Greater Than (fcgt) instruction will put + * 1s into compare_reg where reg1 > reg2, and 0s where reg1 <= reg2. + * + * Then the Select Bits (selb) instruction will take bits from + * reg1 where compare_reg is 0, and from reg2 where compare_reg is + * 1. Ergo, result_reg will have the bits from reg1 where reg1 <= reg2, + * and the bits from reg2 where reg1 > reg2, which is exactly the + * MIN operation. + */ +#define FLOAT_VECTOR_MIN(f, result_reg, reg1, reg2) {\ + int compare_reg = spe_allocate_available_register(f); \ + spe_fcgt(f, compare_reg, reg1, reg2); \ + spe_selb(f, result_reg, reg1, reg2, compare_reg); \ + spe_release_register(f, compare_reg); \ +} +/* The FLOAT_VECTOR_MAX sequence is similar to the FLOAT_VECTOR_MIN + * sequence above, except that the registers specified when selecting + * bits are reversed. + */ +#define FLOAT_VECTOR_MAX(f, result_reg, reg1, reg2) {\ + int compare_reg = spe_allocate_available_register(f); \ + spe_fcgt(f, compare_reg, reg1, reg2); \ + spe_selb(f, result_reg, reg2, reg1, compare_reg); \ + spe_release_register(f, compare_reg); \ +} /** * Generate SPE code to implement the given blend mode for a quad of pixels. @@ -242,6 +271,7 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, */ static void gen_blend(const struct pipe_blend_state *blend, + const struct pipe_blend_color *blend_color, struct spe_function *f, enum pipe_format color_format, int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg, @@ -262,10 +292,53 @@ gen_blend(const struct pipe_blend_state *blend, int fbB_reg = spe_allocate_available_register(f); int fbA_reg = spe_allocate_available_register(f); - int one_reg = spe_allocate_available_register(f); int tmp_reg = spe_allocate_available_register(f); - boolean one_reg_set = false; /* avoid setting one_reg more than once */ + /* These values might or might not eventually get put into + * registers. We avoid allocating them and setting them until + * they're actually needed; then we avoid setting them more than + * once, and release them at the end of code generation. + */ + boolean one_reg_set = false; + int one_reg; +#define SET_ONE_REG_IF_UNSET(f) if (!one_reg_set) {\ + one_reg = spe_allocate_available_register(f); \ + spe_load_float(f, one_reg, 1.0f); \ + one_reg_set = true; \ +} +#define RELEASE_ONE_REG_IF_USED(f) if (one_reg_set) {\ + spe_release_register(f, one_reg); \ +} + + boolean const_color_set = false; + int constR_reg, constG_reg, constB_reg; +#define SET_CONST_COLOR_IF_UNSET(f, blend_color) if (!const_color_set) {\ + constR_reg = spe_allocate_available_register(f); \ + constG_reg = spe_allocate_available_register(f); \ + constG_reg = spe_allocate_available_register(f); \ + spe_load_float(f, constR_reg, blend_color->color[0]); \ + spe_load_float(f, constG_reg, blend_color->color[1]); \ + spe_load_float(f, constB_reg, blend_color->color[2]); \ + const_color_set = true;\ +} +#define RELEASE_CONST_COLOR_IF_USED(f) if (const_color_set) {\ + spe_release_register(f, constR_reg); \ + spe_release_register(f, constG_reg); \ + spe_release_register(f, constB_reg); \ +} + + boolean const_alpha_set = false; + int constA_reg; +#define SET_CONST_ALPHA_IF_UNSET(f, blend_color) if (!const_alpha_set) {\ + constA_reg = spe_allocate_available_register(f); \ + spe_load_float(f, constA_reg, blend_color->color[3]); \ + const_alpha_set = true; \ +} +#define RELEASE_CONST_ALPHA_IF_USED(f) if (const_alpha_set) {\ + spe_release_register(f, constA_reg); \ +} + + /* Real code starts here */ ASSERT(blend->blend_enable); @@ -348,30 +421,161 @@ gen_blend(const struct pipe_blend_state *blend, /* - * Compute Src RGB terms + * Compute Src RGB terms. We're actually looking for the value + * of (the appropriate RGB factors) * (the incoming source RGB color). */ switch (blend->rgb_src_factor) { case PIPE_BLENDFACTOR_ONE: + /* factors = (1,1,1), so term = (R,G,B) */ spe_move(f, term1R_reg, fragR_reg); spe_move(f, term1G_reg, fragG_reg); spe_move(f, term1B_reg, fragB_reg); break; case PIPE_BLENDFACTOR_ZERO: - spe_zero(f, term1R_reg); - spe_zero(f, term1G_reg); - spe_zero(f, term1B_reg); + /* factors = (0,0,0), so term = (0,0,0) */ + spe_load_float(f, term1R_reg, 0.0f); + spe_load_float(f, term1G_reg, 0.0f); + spe_load_float(f, term1B_reg, 0.0f); break; case PIPE_BLENDFACTOR_SRC_COLOR: + /* factors = (R,G,B), so term = (R*R, G*G, B*B) */ spe_fm(f, term1R_reg, fragR_reg, fragR_reg); spe_fm(f, term1G_reg, fragG_reg, fragG_reg); spe_fm(f, term1B_reg, fragB_reg, fragB_reg); break; case PIPE_BLENDFACTOR_SRC_ALPHA: + /* factors = (A,A,A), so term = (R*A, G*A, B*A) */ spe_fm(f, term1R_reg, fragR_reg, fragA_reg); spe_fm(f, term1G_reg, fragG_reg, fragA_reg); spe_fm(f, term1B_reg, fragB_reg, fragA_reg); break; - /* XXX more cases */ + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) */ + /* we'll need the optional constant {1,1,1,1} register */ + SET_ONE_REG_IF_UNSET(f) + /* tmp = 1 - R */ + spe_fs(f, tmp_reg, one_reg, fragR_reg); + /* term = R * tmp */ + spe_fm(f, term1R_reg, fragR_reg, tmp_reg); + /* repeat for G and B */ + spe_fs(f, tmp_reg, one_reg, fragG_reg); + spe_fm(f, term1G_reg, fragG_reg, tmp_reg); + spe_fs(f, tmp_reg, one_reg, fragB_reg); + spe_fm(f, term1B_reg, fragB_reg, tmp_reg); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */ + spe_fm(f, term1R_reg, fragR_reg, fbR_reg); + spe_fm(f, term1G_reg, fragG_reg, fbG_reg); + spe_fm(f, term1B_reg, fragB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb)) */ + /* we'll need the optional constant {1,1,1,1} register */ + SET_ONE_REG_IF_UNSET(f) + /* tmp = 1 - Rfb */ + spe_fs(f, tmp_reg, one_reg, fbR_reg); + /* term = R * tmp */ + spe_fm(f, term1R_reg, fragR_reg, tmp_reg); + /* repeat for G and B */ + spe_fs(f, tmp_reg, one_reg, fbG_reg); + spe_fm(f, term1G_reg, fragG_reg, tmp_reg); + spe_fs(f, tmp_reg, one_reg, fbB_reg); + spe_fm(f, term1B_reg, fragB_reg, tmp_reg); + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A)) */ + /* we'll need the optional constant {1,1,1,1} register */ + SET_ONE_REG_IF_UNSET(f) + /* tmp = 1 - A */ + spe_fs(f, tmp_reg, one_reg, fragA_reg); + /* term = R * tmp */ + spe_fm(f, term1R_reg, fragR_reg, tmp_reg); + /* repeat for G and B with the same (1-A) factor */ + spe_fm(f, term1G_reg, fragG_reg, tmp_reg); + spe_fm(f, term1B_reg, fragB_reg, tmp_reg); + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */ + spe_fm(f, term1R_reg, fragR_reg, fbA_reg); + spe_fm(f, term1G_reg, fragG_reg, fbA_reg); + spe_fm(f, term1B_reg, fragB_reg, fbA_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) */ + /* we'll need the optional constant {1,1,1,1} register */ + SET_ONE_REG_IF_UNSET(f) + /* tmp = 1 - A */ + spe_fs(f, tmp_reg, one_reg, fbA_reg); + /* term = R * tmp, G*tmp, and B*tmp */ + spe_fm(f, term1R_reg, fragR_reg, tmp_reg); + spe_fm(f, term1G_reg, fragG_reg, tmp_reg); + spe_fm(f, term1B_reg, fragB_reg, tmp_reg); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We'll need the optional blend color registers */ + SET_CONST_COLOR_IF_UNSET(f,blend_color) + /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */ + spe_fm(f, term1R_reg, fragR_reg, constR_reg); + spe_fm(f, term1G_reg, fragG_reg, constG_reg); + spe_fm(f, term1B_reg, fragB_reg, constB_reg); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + /* we'll need the optional constant alpha register */ + SET_CONST_ALPHA_IF_UNSET(f, blend_color) + /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */ + spe_fm(f, term1R_reg, fragR_reg, constA_reg); + spe_fm(f, term1G_reg, fragG_reg, constA_reg); + spe_fm(f, term1B_reg, fragB_reg, constA_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need both the optional {1,1,1,1} register, and the optional + * constant color registers + */ + SET_ONE_REG_IF_UNSET(f) + SET_CONST_COLOR_IF_UNSET(f, blend_color) + /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) */ + spe_fs(f, tmp_reg, one_reg, constR_reg); + spe_fm(f, term1R_reg, fragR_reg, tmp_reg); + spe_fs(f, tmp_reg, one_reg, constG_reg); + spe_fm(f, term1G_reg, fragG_reg, tmp_reg); + spe_fs(f, tmp_reg, one_reg, constB_reg); + spe_fm(f, term1B_reg, fragB_reg, tmp_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + /* We need the optional {1,1,1,1} register and the optional + * constant alpha register + */ + SET_ONE_REG_IF_UNSET(f) + SET_CONST_ALPHA_IF_UNSET(f, blend_color) + /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac)) */ + spe_fs(f, tmp_reg, one_reg, constA_reg); + spe_fm(f, term1R_reg, fragR_reg, tmp_reg); + spe_fm(f, term1G_reg, fragG_reg, tmp_reg); + spe_fm(f, term1B_reg, fragB_reg, tmp_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + /* We'll need the optional {1,1,1,1} register */ + SET_ONE_REG_IF_UNSET(f) + /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so + * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb)) + */ + /* tmp = 1 - Afb */ + spe_fs(f, tmp_reg, one_reg, fbA_reg); + /* tmp = min(A,tmp) */ + FLOAT_VECTOR_MIN(f, tmp_reg, fragA_reg, tmp_reg) + /* term = R*tmp */ + spe_fm(f, term1R_reg, fragR_reg, tmp_reg); + spe_fm(f, term1G_reg, fragG_reg, tmp_reg); + spe_fm(f, term1B_reg, fragB_reg, tmp_reg); + break; + + /* non-OpenGL cases? */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: ASSERT(0); } @@ -421,6 +625,7 @@ gen_blend(const struct pipe_blend_state *blend, case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* one = {1.0, 1.0, 1.0, 1.0} */ if (!one_reg_set) { + one_reg = spe_allocate_available_register(f); spe_load_float(f, one_reg, 1.0f); one_reg_set = true; } @@ -432,6 +637,14 @@ gen_blend(const struct pipe_blend_state *blend, spe_fm(f, term2B_reg, fbB_reg, tmp_reg); break; /* XXX more cases */ + // GL_ONE_MINUS_SRC_COLOR + // GL_DST_COLOR + // GL_ONE_MINUS_DST_COLOR + // GL_DST_ALPHA + // GL_CONSTANT_COLOR + // GL_ONE_MINUS_CONSTANT_COLOR + // GL_CONSTANT_ALPHA + // GL_ONE_MINUS_CONSTANT_ALPHA default: ASSERT(0); } @@ -452,6 +665,7 @@ gen_blend(const struct pipe_blend_state *blend, case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* one = {1.0, 1.0, 1.0, 1.0} */ if (!one_reg_set) { + one_reg = spe_allocate_available_register(f); spe_load_float(f, one_reg, 1.0f); one_reg_set = true; } @@ -461,6 +675,14 @@ gen_blend(const struct pipe_blend_state *blend, spe_fm(f, term2A_reg, fbA_reg, tmp_reg); break; /* XXX more cases */ + // GL_ONE_MINUS_SRC_COLOR + // GL_DST_COLOR + // GL_ONE_MINUS_DST_COLOR + // GL_DST_ALPHA + // GL_CONSTANT_COLOR + // GL_ONE_MINUS_CONSTANT_COLOR + // GL_CONSTANT_ALPHA + // GL_ONE_MINUS_CONSTANT_ALPHA default: ASSERT(0); } @@ -479,7 +701,21 @@ gen_blend(const struct pipe_blend_state *blend, spe_fs(f, fragG_reg, term1G_reg, term2G_reg); spe_fs(f, fragB_reg, term1B_reg, term2B_reg); break; - /* XXX more cases */ + case PIPE_BLEND_REVERSE_SUBTRACT: + spe_fs(f, fragR_reg, term2R_reg, term1R_reg); + spe_fs(f, fragG_reg, term2G_reg, term1G_reg); + spe_fs(f, fragB_reg, term2B_reg, term1B_reg); + break; + case PIPE_BLEND_MIN: + FLOAT_VECTOR_MIN(f, fragR_reg, term1R_reg, term2R_reg) + FLOAT_VECTOR_MIN(f, fragG_reg, term1G_reg, term2G_reg) + FLOAT_VECTOR_MIN(f, fragB_reg, term1B_reg, term2B_reg) + break; + case PIPE_BLEND_MAX: + FLOAT_VECTOR_MAX(f, fragR_reg, term1R_reg, term2R_reg) + FLOAT_VECTOR_MAX(f, fragG_reg, term1G_reg, term2G_reg) + FLOAT_VECTOR_MAX(f, fragB_reg, term1B_reg, term2B_reg) + break; default: ASSERT(0); } @@ -494,7 +730,15 @@ gen_blend(const struct pipe_blend_state *blend, case PIPE_BLEND_SUBTRACT: spe_fs(f, fragA_reg, term1A_reg, term2A_reg); break; - /* XXX more cases */ + case PIPE_BLEND_REVERSE_SUBTRACT: + spe_fs(f, fragA_reg, term2A_reg, term1A_reg); + break; + case PIPE_BLEND_MIN: + FLOAT_VECTOR_MIN(f, fragA_reg, term1A_reg, term2A_reg) + break; + case PIPE_BLEND_MAX: + FLOAT_VECTOR_MAX(f, fragA_reg, term1A_reg, term2A_reg) + break; default: ASSERT(0); } @@ -514,8 +758,12 @@ gen_blend(const struct pipe_blend_state *blend, spe_release_register(f, fbB_reg); spe_release_register(f, fbA_reg); - spe_release_register(f, one_reg); spe_release_register(f, tmp_reg); + + /* Free any optional registers that actually got used */ + RELEASE_ONE_REG_IF_USED(f) + RELEASE_CONST_COLOR_IF_USED(f) + RELEASE_CONST_ALPHA_IF_USED(f) } @@ -629,6 +877,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) const struct pipe_depth_stencil_alpha_state *dsa = &cell->depth_stencil->base; const struct pipe_blend_state *blend = &cell->blend->base; + const struct pipe_blend_color *blend_color = &cell->blend_color; const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format; /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */ @@ -651,7 +900,6 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) int fbRGBA_reg; /**< framebuffer's RGBA colors for quad */ int fbZS_reg; /**< framebuffer's combined z/stencil values for quad */ - spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); spe_allocate_register(f, x_reg); spe_allocate_register(f, y_reg); spe_allocate_register(f, color_tile_reg); @@ -816,7 +1064,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) if (blend->blend_enable) { - gen_blend(blend, f, color_format, + gen_blend(blend, blend_color, f, color_format, fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg); } diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 2da3097983..8a389cd6aa 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -100,14 +100,19 @@ cell_emit_state(struct cell_context *cell) = cell_batch_alloc(cell, sizeof(*fops)); struct spe_function spe_code; + /* Prepare the buffer that will hold the generated code. */ + spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); + /* generate new code */ cell_gen_fragment_function(cell, &spe_code); + /* put the new code into the batch buffer */ fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS; memcpy(&fops->code, spe_code.store, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); fops->dsa = cell->depth_stencil->base; fops->blend = cell->blend->base; + /* free codegen buffer */ spe_release_func(&spe_code); } diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 78260c4259..da2cb08972 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -50,7 +50,31 @@ helpful headers: /opt/cell/sdk/usr/include/libmisc.h */ +/* Set to 0 to disable all extraneous debugging code */ +#define DEBUG 1 + +#if DEBUG boolean Debug = FALSE; +boolean force_fragment_ops_fallback = TRUE; + +/* These debug macros use the unusual construction ", ##__VA_ARGS__" + * which expands to the expected comma + args if variadic arguments + * are supplied, but swallows the comma if there are no variadic + * arguments (which avoids syntax errors that would otherwise occur). + */ +#define DEBUG_PRINTF(format,...) \ + if (Debug) \ + printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__) +#define D_PRINTF(flag, format,...) \ + if (spu.init.debug_flags & (flag)) \ + printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__) + +#else + +#define DEBUG_PRINTF(...) +#define D_PRINTF(...) + +#endif struct spu_global spu; @@ -133,9 +157,7 @@ really_clear_tiles(uint surfaceIndex) static void cmd_clear_surface(const struct cell_command_clear_surface *clear) { - if (Debug) - printf("SPU %u: CLEAR SURF %u to 0x%08x\n", spu.init.id, - clear->surface, clear->value); + DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value); if (clear->surface == 0) { spu.fb.color_clear_value = clear->value; @@ -203,17 +225,14 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear) #endif /* CLEAR_OPT */ - if (Debug) - printf("SPU %u: CLEAR SURF done\n", spu.init.id); + DEBUG_PRINTF("CLEAR SURF done\n"); } static void cmd_release_verts(const struct cell_command_release_verts *release) { - if (Debug) - printf("SPU %u: RELEASE VERTS %u\n", - spu.init.id, release->vertex_buf); + DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf); ASSERT(release->vertex_buf != ~0U); release_buffer(release->vertex_buf); } @@ -228,16 +247,30 @@ cmd_release_verts(const struct cell_command_release_verts *release) static void cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) { - if (Debug) - printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id); + DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n"); /* Copy SPU code from batch buffer to spu buffer */ memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); /* Copy state info (for fallback case only) */ memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); - /* Point function pointer at new code */ - spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code; + /* Parity twist! For now, always use the fallback code by default, + * only switching to codegen when specifically requested. This + * allows us to develop freely without risking taking down the + * branch. + * + * Later, the parity of this check will be reversed, so that + * codegen is *always* used, unless we specifically indicate that + * we don't want it. + * + * Eventually, the option will be removed completely, because in + * final code we'll always use codegen and won't even provide the + * raw state records that the fallback code requires. + */ + if (spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) { + spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code; + } + /* otherwise, the default fallback code remains in place */ spu.read_depth = spu.depth_stencil_alpha.depth.enabled; spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled; @@ -247,8 +280,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) static void cmd_state_fragment_program(const struct cell_command_fragment_program *fp) { - if (Debug) - printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id); + DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n"); /* Copy SPU code from batch buffer to spu buffer */ memcpy(spu.fragment_program_code, fp->code, SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4); @@ -262,9 +294,7 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp) static void cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) { - if (Debug) - printf("SPU %u: FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n", - spu.init.id, + DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n", cmd->width, cmd->height, cmd->color_start, @@ -309,9 +339,7 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) static void cmd_state_sampler(const struct cell_command_sampler *sampler) { - if (Debug) - printf("SPU %u: SAMPLER [%u]\n", - spu.init.id, sampler->unit); + DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit); spu.sampler[sampler->unit] = sampler->state; if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) @@ -328,11 +356,9 @@ cmd_state_texture(const struct cell_command_texture *texture) const uint width = texture->width; const uint height = texture->height; - if (Debug) { - printf("SPU %u: TEXTURE [%u] at %p size %u x %u\n", spu.init.id, + DEBUG_PRINTF("TEXTURE [%u] at %p size %u x %u\n", texture->unit, texture->start, texture->width, texture->height); - } spu.texture[unit].start = texture->start; spu.texture[unit].width = width; @@ -351,10 +377,7 @@ cmd_state_texture(const struct cell_command_texture *texture) static void cmd_state_vertex_info(const struct vertex_info *vinfo) { - if (Debug) { - printf("SPU %u: VERTEX_INFO num_attribs=%u\n", spu.init.id, - vinfo->num_attribs); - } + DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs); ASSERT(vinfo->num_attribs >= 1); ASSERT(vinfo->num_attribs <= 8); memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo)); @@ -393,8 +416,7 @@ cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code) static void cmd_finish(void) { - if (Debug) - printf("SPU %u: FINISH\n", spu.init.id); + DEBUG_PRINTF("FINISH\n"); really_clear_tiles(0); /* wait for all outstanding DMAs to finish */ mfc_write_tag_mask(~0); @@ -419,9 +441,8 @@ cmd_batch(uint opcode) const unsigned usize = size / sizeof(buffer[0]); uint pos; - if (Debug) - printf("SPU %u: BATCH buffer %u, len %u, from %p\n", - spu.init.id, buf, size, spu.init.buffers[buf]); + DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n", + buf, size, spu.init.buffers[buf]); ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH); @@ -440,8 +461,7 @@ cmd_batch(uint opcode) wait_on_mask(1 << TAG_BATCH_BUFFER); /* Tell PPU we're done copying the buffer to local store */ - if (Debug) - printf("SPU %u: release batch buf %u\n", spu.init.id, buf); + DEBUG_PRINTF("release batch buf %u\n", buf); release_buffer(buf); /* @@ -571,8 +591,7 @@ cmd_batch(uint opcode) } } - if (Debug) - printf("SPU %u: BATCH complete\n", spu.init.id); + DEBUG_PRINTF("BATCH complete\n"); } @@ -585,8 +604,7 @@ main_loop(void) struct cell_command cmd; int exitFlag = 0; - if (Debug) - printf("SPU %u: Enter main loop\n", spu.init.id); + DEBUG_PRINTF("Enter main loop\n"); ASSERT((sizeof(struct cell_command) & 0xf) == 0); ASSERT_ALIGN16(&cmd); @@ -595,14 +613,12 @@ main_loop(void) unsigned opcode; int tag = 0; - if (Debug) - printf("SPU %u: Wait for cmd...\n", spu.init.id); + DEBUG_PRINTF("Wait for cmd...\n"); /* read/wait from mailbox */ opcode = (unsigned int) spu_read_in_mbox(); - if (Debug) - printf("SPU %u: got cmd 0x%x\n", spu.init.id, opcode); + DEBUG_PRINTF("got cmd 0x%x\n", opcode); /* command payload */ mfc_get(&cmd, /* dest */ @@ -619,8 +635,7 @@ main_loop(void) switch (opcode & CELL_CMD_OPCODE_MASK) { case CELL_CMD_EXIT: - if (Debug) - printf("SPU %u: EXIT\n", spu.init.id); + DEBUG_PRINTF("EXIT\n"); exitFlag = 1; break; case CELL_CMD_VS_EXECUTE: @@ -632,13 +647,12 @@ main_loop(void) cmd_batch(opcode); break; default: - printf("Bad opcode!\n"); + printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK); } } - if (Debug) - printf("SPU %u: Exit main loop\n", spu.init.id); + DEBUG_PRINTF("Exit main loop\n"); spu_dcache_report(); } @@ -653,7 +667,8 @@ one_time_init(void) invalidate_tex_cache(); /* Install default/fallback fragment processing function. - * This will normally be overriden by a code-gen'd function. + * This will normally be overriden by a code-gen'd function + * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set. */ spu.fragment_ops = spu_fallback_fragment_ops; } @@ -685,8 +700,8 @@ main(main_param_t speid, main_param_t argp) one_time_init(); - if (Debug) - printf("SPU: main() speid=%lu\n", (unsigned long) speid); + DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid); + D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n"); mfc_get(&spu.init, /* dest */ (unsigned int) argp, /* src */ -- cgit v1.2.3 From 164fb1299e1614ce05ae539d832567469eedb402 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Fri, 26 Sep 2008 09:38:40 -0600 Subject: cell: checkpoint: support for function calls in SPU shaders Will be used for instructions like SIN/COS/POW/TEX/etc. The PPU needs to know the address of some functions in the SPU address space. Send that info to the PPU/main memory rather than patch up shaders on the SPU side. Not finished/tested yet... --- src/gallium/drivers/cell/common.h | 18 ++++- src/gallium/drivers/cell/ppu/cell_context.h | 1 + src/gallium/drivers/cell/ppu/cell_gen_fp.c | 81 ++++++++++++++++++++- src/gallium/drivers/cell/ppu/cell_spu.c | 8 +++ src/gallium/drivers/cell/spu/Makefile | 3 +- src/gallium/drivers/cell/spu/spu_funcs.c | 106 ++++++++++++++++++++++++++++ src/gallium/drivers/cell/spu/spu_funcs.h | 35 +++++++++ src/gallium/drivers/cell/spu/spu_main.c | 5 ++ 8 files changed, 254 insertions(+), 3 deletions(-) create mode 100644 src/gallium/drivers/cell/spu/spu_funcs.c create mode 100644 src/gallium/drivers/cell/spu/spu_funcs.h (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index f0ff96eb47..99329fd8e2 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -130,7 +130,7 @@ struct cell_command_fragment_ops #define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128 /** - * Command to send a fragment progra to SPUs. + * Command to send a fragment program to SPUs. */ struct cell_command_fragment_program { @@ -267,6 +267,20 @@ struct cell_command } ALIGN16_ATTRIB; +#define MAX_SPU_FUNCTIONS 12 +/** + * Used to tell the PPU about the address of particular functions in the + * SPU's address space. + */ +struct cell_spu_function_info +{ + uint num; + char names[MAX_SPU_FUNCTIONS][16]; + uint addrs[MAX_SPU_FUNCTIONS]; + char pad[12]; /**< Pad struct to multiple of 16 bytes (256 currently) */ +}; + + /** This is the object passed to spe_create_thread() */ struct cell_init_info { @@ -278,6 +292,8 @@ struct cell_init_info /** Buffers for command batches, vertex/index data */ ubyte *buffers[CELL_NUM_BUFFERS]; uint *buffer_status; /**< points at cell_context->buffer_status */ + + struct cell_spu_function_info *spu_functions; } ALIGN16_ATTRIB; diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h index 14914b9c6f..a9ad84bb18 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.h +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -149,6 +149,7 @@ struct cell_context /** Mapped constant buffers */ void *mapped_constants[PIPE_SHADER_TYPES]; + struct cell_spu_function_info spu_functions ALIGN16_ATTRIB; uint num_spus; diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c index 8972b5b1ea..fd12af19ce 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -37,7 +37,7 @@ * \author Brian Paul */ - +#include #include "pipe/p_defines.h" #include "pipe/p_state.h" #include "pipe/p_shader_tokens.h" @@ -64,6 +64,7 @@ */ struct codegen { + struct cell_context *cell; int inputs_reg; /**< 1st function parameter */ int outputs_reg; /**< 2nd function parameter */ int constants_reg; /**< 3rd function parameter */ @@ -1076,6 +1077,76 @@ emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst) } +#if 0 +static void +print_functions(struct cell_context *cell) +{ + struct cell_spu_function_info *funcs = &cell->spu_functions; + uint i; + for (i = 0; i < funcs->num; i++) { + printf("SPU func %u: %s at %u\n", + i, funcs->names[i], funcs->addrs[i]); + } +} +#endif + + +/** + * Emit code to call a SPU function. + * Used to implement instructions like SIN/COS/POW/TEX/etc. + */ +static boolean +emit_function_call(struct codegen *gen, + const struct tgsi_full_instruction *inst, + char *funcname, uint num_args) +{ + const struct cell_spu_function_info *funcs = &gen->cell->spu_functions; + char comment[100]; + uint addr; + int ch; + + assert(num_args <= 2); + + /* lookup function address */ + { + uint i; + addr = 0; + for (i = 0; i < funcs->num; i++) { + if (strcmp(funcs->names[i], funcname) == 0) { + addr = funcs->addrs[i]; + } + } + assert(addr && "spu function not found"); + } + + sprintf(comment, "CALL %s:", funcname); + spe_comment(gen->f, -4, comment); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + int s_regs[3]; + uint a; + for (a = 0; a < num_args; a++) { + s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]); + } + + /* XXX not done */ + (void) s_regs; + (void) d_reg; + + spe_bisl(gen->f, SPE_REG_RA, addr, 0, 0); /* XXX untested! */ + + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + + /** * Emit max. See emit_SGT for comments. */ @@ -1303,6 +1374,13 @@ emit_instruction(struct codegen *gen, case TGSI_OPCODE_END: return emit_END(gen); + case TGSI_OPCODE_COS: + return emit_function_call(gen, inst, "spu_cos", 1); + case TGSI_OPCODE_SIN: + return emit_function_call(gen, inst, "spu_sin", 1); + case TGSI_OPCODE_POW: + return emit_function_call(gen, inst, "spu_pow", 2); + case TGSI_OPCODE_IF: return emit_IF(gen, inst); case TGSI_OPCODE_ELSE: @@ -1431,6 +1509,7 @@ cell_gen_fragment_program(struct cell_context *cell, struct codegen gen; memset(&gen, 0, sizeof(gen)); + gen.cell = cell; gen.f = f; /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */ diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c index 9508227e29..df020c4146 100644 --- a/src/gallium/drivers/cell/ppu/cell_spu.c +++ b/src/gallium/drivers/cell/ppu/cell_spu.c @@ -36,6 +36,7 @@ #include "cell_spu.h" #include "pipe/p_format.h" #include "pipe/p_state.h" +#include "util/u_memory.h" #include "cell/common.h" @@ -131,6 +132,11 @@ cell_start_spus(struct cell_context *cell) ASSERT_ALIGN16(&cell_global.inits[0]); ASSERT_ALIGN16(&cell_global.inits[1]); + /* + * Initialize the global 'inits' structure for each SPU. + * A pointer to the init struct will be passed to each SPU. + * The SPUs will then each grab their init info with mfc_get(). + */ for (i = 0; i < cell->num_spus; i++) { cell_global.inits[i].id = i; cell_global.inits[i].num_spus = cell->num_spus; @@ -141,6 +147,8 @@ cell_start_spus(struct cell_context *cell) } cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0]; + cell_global.inits[i].spu_functions = &cell->spu_functions; + cell_global.spe_contexts[i] = spe_context_create(0, NULL); if (!cell_global.spe_contexts[i]) { fprintf(stderr, "spe_context_create() failed\n"); diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile index 1ae0dfb8c1..c2db85247e 100644 --- a/src/gallium/drivers/cell/spu/Makefile +++ b/src/gallium/drivers/cell/spu/Makefile @@ -16,8 +16,9 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o SOURCES = \ - spu_main.c \ + spu_funcs.c \ spu_dcache.c \ + spu_main.c \ spu_per_fragment_op.c \ spu_render.c \ spu_texture.c \ diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c new file mode 100644 index 0000000000..d174956518 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_funcs.c @@ -0,0 +1,106 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * SPU functions accessed by shaders. + * + * Authors: Brian Paul + */ + + +#include +#include +#include +#include + +#include "cell/common.h" +#include "spu_main.h" +#include "spu_funcs.h" + + +#define M_PI 3.1415926 + + +static vector float +spu_cos(vector float x) +{ + static const float scale = 1.0 / (2.0 * M_PI); + x = x * spu_splats(scale); /* normalize */ + return _cos8_v(x); +} + +static vector float +spu_sin(vector float x) +{ + static const float scale = 1.0 / (2.0 * M_PI); + x = x * spu_splats(scale); /* normalize */ + return _sin8_v(x); /* 8-bit accuracy enough?? */ +} + + +static void +add_func(struct cell_spu_function_info *spu_functions, + const char *name, void *addr) +{ + uint n = spu_functions->num; + ASSERT(strlen(name) < 16); + strcpy(spu_functions->names[n], name); + spu_functions->addrs[n] = (uint) addr; + spu_functions->num++; +} + + +/** + * Return info about the SPU's function to the PPU / main memory. + * The PPU needs to know the address of some SPU-side functions so + * that we can generate shader code with function calls. + */ +void +return_function_info(void) +{ + struct cell_spu_function_info funcs ALIGN16_ATTRIB; + int tag = TAG_MISC; + + ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */ + + funcs.num = 0; + add_func(&funcs, "spu_cos", &spu_cos); + add_func(&funcs, "spu_sin", &spu_sin); + + /* Send the function info back to the PPU / main memory */ + mfc_put((void *) &funcs, /* src in local store */ + (unsigned int) spu.init.spu_functions, /* dst in main memory */ + sizeof(funcs), /* bytes */ + tag, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << tag); +} + + + diff --git a/src/gallium/drivers/cell/spu/spu_funcs.h b/src/gallium/drivers/cell/spu/spu_funcs.h new file mode 100644 index 0000000000..3adb6ae99f --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_funcs.h @@ -0,0 +1,35 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef SPU_FUNCS_H +#define SPU_FUNCS_H + +extern void +return_function_info(void); + +#endif + diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index b4d30228f7..6ef65d5645 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -32,6 +32,7 @@ #include #include +#include "spu_funcs.h" #include "spu_main.h" #include "spu_render.h" #include "spu_per_fragment_op.h" @@ -721,6 +722,10 @@ main(main_param_t speid, main_param_t argp) 0 /* rid */); wait_on_mask( 1 << tag ); + if (spu.init.id == 0) { + return_function_info(); + } + #if 0 if (spu.init.id==0) spu_test_misc(spu.init.id); -- cgit v1.2.3 From afaa53040bd01ca86762e7d7b1a5a65810767921 Mon Sep 17 00:00:00 2001 From: Robert Ellison Date: Fri, 3 Oct 2008 18:00:43 -0600 Subject: CELL: changes to generate SPU code for stenciling This set of code changes are for stencil code generation support. Both one-sided and two-sided stenciling are supported. In addition to the raw code generation changes, these changes had to be made elsewhere in the system: - Added new "register set" feature to the SPE assembly generation. A "register set" is a way to allocate multiple registers and free them all at the same time, delegating register allocation management to the spe_function unit. It's quite useful in complex register allocation schemes (like stenciling). - Added and improved SPE macro calculations. These are operations between registers and unsigned integer immediates. In many cases, the calculation can be performed with a single instruction; the macros will generate the single instruction if possible, or generate a register load and register-to-register operation if not. These macro functions are: spe_load_uint() (which has new ways to load a value in a single instruction), spe_and_uint(), spe_xor_uint(), spe_compare_equal_uint(), and spe_compare_greater_uint(). - Added facing to fragment generation. While rendering, the rasterizer needs to be able to determine front- and back-facing fragments, in order to correctly apply two-sided stencil. That requires these changes: - Added front_winding field to the cell_command_render block, so that the state tracker could communicate to the rasterizer what it considered to be the front-facing direction. - Added fragment facing as an input to the fragment function. - Calculated facing is passed during emit_quad(). --- src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 246 +++++- src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 41 +- src/gallium/drivers/cell/common.h | 1 + src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 881 ++++++++++++++++++--- src/gallium/drivers/cell/ppu/cell_render.c | 1 + src/gallium/drivers/cell/ppu/cell_vbuf.c | 1 + src/gallium/drivers/cell/spu/spu_main.h | 3 +- src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 19 +- src/gallium/drivers/cell/spu/spu_per_fragment_op.h | 3 +- src/gallium/drivers/cell/spu/spu_render.c | 4 +- src/gallium/drivers/cell/spu/spu_tri.c | 35 +- src/gallium/drivers/cell/spu/spu_tri.h | 2 +- 12 files changed, 1091 insertions(+), 146 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c index 491141f190..8a87e9abb1 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c @@ -359,14 +359,21 @@ void _name (struct spe_function *p, int imm) \ */ void spe_init_func(struct spe_function *p, unsigned code_size) { + register unsigned int i; + p->store = align_malloc(code_size, 16); p->num_inst = 0; p->max_inst = code_size / SPE_INST_SIZE; + p->set_count = 0; + memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0])); + /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile. */ - p->regs[0] = ~7; - p->regs[1] = (1U << (80 - 64)) - 1; + p->regs[0] = p->regs[1] = p->regs[2] = 1; + for (i = 80; i <= 127; i++) { + p->regs[i] = 1; + } p->print = false; p->indent = 0; @@ -398,12 +405,8 @@ int spe_allocate_available_register(struct spe_function *p) { unsigned i; for (i = 0; i < SPE_NUM_REGS; i++) { - const uint64_t mask = (1ULL << (i % 64)); - const unsigned idx = i / 64; - - assert(idx < 2); - if ((p->regs[idx] & mask) != 0) { - p->regs[idx] &= ~mask; + if (p->regs[i] == 0) { + p->regs[i] = 1; return i; } } @@ -417,31 +420,68 @@ int spe_allocate_available_register(struct spe_function *p) */ int spe_allocate_register(struct spe_function *p, int reg) { - const unsigned idx = reg / 64; - const unsigned bit = reg % 64; - assert(reg < SPE_NUM_REGS); - assert((p->regs[idx] & (1ULL << bit)) != 0); - - p->regs[idx] &= ~(1ULL << bit); + assert(p->regs[reg] == 0); + p->regs[reg] = 1; return reg; } /** - * Mark the given SPE register as "unallocated". + * Mark the given SPE register as "unallocated". Note that this should + * only be used on registers allocated in the current register set; an + * assertion will fail if an attempt is made to deallocate a register + * allocated in an earlier register set. */ void spe_release_register(struct spe_function *p, int reg) { - const unsigned idx = reg / 64; - const unsigned bit = reg % 64; + assert(reg < SPE_NUM_REGS); + assert(p->regs[reg] == 1); - assert(idx < 2); + p->regs[reg] = 0; +} - assert(reg < SPE_NUM_REGS); - assert((p->regs[idx] & (1ULL << bit)) == 0); +/** + * Start a new set of registers. This can be called if + * it will be difficult later to determine exactly what + * registers were actually allocated during a code generation + * sequence, and you really just want to deallocate all of them. + */ +void spe_allocate_register_set(struct spe_function *p) +{ + register unsigned int i; + + /* Keep track of the set count. If it ever wraps around to 0, + * we're in trouble. + */ + p->set_count++; + assert(p->set_count > 0); + + /* Increment the allocation count of all registers currently + * allocated. Then any registers that are allocated in this set + * will be the only ones with a count of 1; they'll all be released + * when the register set is released. + */ + for (i = 0; i < SPE_NUM_REGS; i++) { + if (p->regs[i] > 0) p->regs[i]++; + } +} + +void spe_release_register_set(struct spe_function *p) +{ + unsigned int i; + + /* If the set count drops below zero, we're in trouble. */ + assert(p->set_count > 0); + p->set_count--; - p->regs[idx] |= (1ULL << bit); + /* Drop the allocation level of all registers. Any allocated + * during this register set will drop to 0 and then become + * available. + */ + for (i = 0; i < SPE_NUM_REGS; i++) { + if (p->regs[i] > 0) p->regs[i]--; + } } @@ -603,8 +643,10 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui) { /* If the whole value is in the lower 18 bits, use ila, which * doesn't sign-extend. Otherwise, if the two halfwords of - * the constant are identical, use ilh. Otherwise, we have - * to use ilhu followed by iohl. + * the constant are identical, use ilh. Otherwise, if every byte of + * the desired value is 0x00 or 0xff, we can use Form Select Mask for + * Bytes Immediate (fsmbi) to load the value in a single instruction. + * Otherwise, in the general case, we have to use ilhu followed by iohl. */ if ((ui & 0xfffc0000) == ui) { spe_ila(p, rT, ui); @@ -612,13 +654,171 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui) else if ((ui >> 16) == (ui & 0xffff)) { spe_ilh(p, rT, ui & 0xffff); } + else if ( + ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) && + ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) && + ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) && + ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000) + ) { + unsigned int mask = 0; + /* fsmbi duplicates each bit in the given mask eight times, + * using a 16-bit value to initialize a 16-byte quadword. + * Each 4-bit nybble of the mask corresponds to a full word + * of the result; look at the value and figure out the mask + * (replicated for each word in the quadword), and then + * form the "select mask" to get the value. + */ + if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111; + if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222; + if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444; + if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888; + spe_fsmbi(p, rT, mask); + } else { + /* The general case: this usually uses two instructions, but + * may use only one if the low-order 16 bits of each word are 0. + */ spe_ilhu(p, rT, ui >> 16); if (ui & 0xffff) spe_iohl(p, rT, ui & 0xffff); } } +/* This function is constructed identically to spe_sor_uint() below. + * Changes to one should be made in the other. + */ +void spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui) +{ + /* If we can, emit a single instruction, either And Byte Immediate + * (which uses the same constant across each byte), And Halfword Immediate + * (which sign-extends a 10-bit immediate to 16 bits and uses that + * across each halfword), or And Word Immediate (which sign-extends + * a 10-bit immediate to 32 bits). + * + * Otherwise, we'll need to use a temporary register. + */ + register unsigned int tmp; + + /* If the upper 23 bits are all 0s or all 1s, sign extension + * will work and we can use And Word Immediate + */ + tmp = ui & 0xfffffe00; + if (tmp == 0xfffffe00 || tmp == 0) { + spe_andi(p, rT, rA, ui & 0x000003ff); + return; + } + + /* If the ui field is symmetric along halfword boundaries and + * the upper 7 bits of each halfword are all 0s or 1s, we + * can use And Halfword Immediate + */ + tmp = ui & 0xfe00fe00; + if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) { + spe_andhi(p, rT, rA, ui & 0x000003ff); + return; + } + + /* If the ui field is symmetric in each byte, then we can use + * the And Byte Immediate instruction. + */ + tmp = ui & 0x000000ff; + if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) { + spe_andbi(p, rT, rA, tmp); + return; + } + + /* Otherwise, we'll have to use a temporary register. */ + unsigned int tmp_reg = spe_allocate_available_register(p); + spe_load_uint(p, tmp_reg, ui); + spe_and(p, rT, rA, tmp_reg); + spe_release_register(p, tmp_reg); +} + +/* This function is constructed identically to spe_and_uint() above. + * Changes to one should be made in the other. + */ +void spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui) +{ + /* If we can, emit a single instruction, either Exclusive Or Byte + * Immediate (which uses the same constant across each byte), Exclusive + * Or Halfword Immediate (which sign-extends a 10-bit immediate to + * 16 bits and uses that across each halfword), or Exclusive Or Word + * Immediate (which sign-extends a 10-bit immediate to 32 bits). + * + * Otherwise, we'll need to use a temporary register. + */ + register unsigned int tmp; + + /* If the upper 23 bits are all 0s or all 1s, sign extension + * will work and we can use Exclusive Or Word Immediate + */ + tmp = ui & 0xfffffe00; + if (tmp == 0xfffffe00 || tmp == 0) { + spe_xori(p, rT, rA, ui & 0x000003ff); + return; + } + + /* If the ui field is symmetric along halfword boundaries and + * the upper 7 bits of each halfword are all 0s or 1s, we + * can use Exclusive Or Halfword Immediate + */ + tmp = ui & 0xfe00fe00; + if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) { + spe_xorhi(p, rT, rA, ui & 0x000003ff); + return; + } + + /* If the ui field is symmetric in each byte, then we can use + * the Exclusive Or Byte Immediate instruction. + */ + tmp = ui & 0x000000ff; + if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) { + spe_xorbi(p, rT, rA, tmp); + return; + } + + /* Otherwise, we'll have to use a temporary register. */ + unsigned int tmp_reg = spe_allocate_available_register(p); + spe_load_uint(p, tmp_reg, ui); + spe_xor(p, rT, rA, tmp_reg); + spe_release_register(p, tmp_reg); +} + +void +spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui) +{ + /* If the comparison value is 9 bits or less, it fits inside a + * Compare Equal Word Immediate instruction. + */ + if ((ui & 0x000001ff) == ui) { + spe_ceqi(p, rT, rA, ui); + } + /* Otherwise, we're going to have to load a word first. */ + else { + unsigned int tmp_reg = spe_allocate_available_register(p); + spe_load_uint(p, tmp_reg, ui); + spe_ceq(p, rT, rA, tmp_reg); + spe_release_register(p, tmp_reg); + } +} + +void +spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui) +{ + /* If the comparison value is 10 bits or less, it fits inside a + * Compare Logical Greater Than Word Immediate instruction. + */ + if ((ui & 0x000003ff) == ui) { + spe_clgti(p, rT, rA, ui); + } + /* Otherwise, we're going to have to load a word first. */ + else { + unsigned int tmp_reg = spe_allocate_available_register(p); + spe_load_uint(p, tmp_reg, ui); + spe_clgt(p, rT, rA, tmp_reg); + spe_release_register(p, tmp_reg); + } +} void spe_splat(struct spe_function *p, unsigned rT, unsigned rA) diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h index 61c7edeb60..cd2e245409 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h @@ -53,17 +53,26 @@ struct spe_function uint num_inst; uint max_inst; - /** - * Mask of used / unused registers - * - * Each set bit corresponds to an available register. Each cleared bit - * corresponds to an allocated register. + /** + * The "set count" reflects the number of nested register sets + * are allowed. In the unlikely case that we exceed the set count, + * register allocation will start to be confused, which is critical + * enough that we check for it. + */ + unsigned char set_count; + + /** + * Flags for used and unused registers. Each byte corresponds to a + * register; a 0 in that byte means that the register is available. + * A value of 1 means that the register was allocated in the current + * register set. Any other value N means that the register was allocated + * N register sets ago. * * \sa * spe_allocate_register, spe_allocate_available_register, - * spe_release_register + * spe_allocate_register_set, spe_release_register_set, spe_release_register, */ - uint64_t regs[SPE_NUM_REGS / 64]; + unsigned char regs[SPE_NUM_REGS]; boolean print; /**< print/dump instructions as they're emitted? */ int indent; /**< number of spaces to indent */ @@ -77,6 +86,8 @@ extern unsigned spe_code_size(const struct spe_function *p); extern int spe_allocate_available_register(struct spe_function *p); extern int spe_allocate_register(struct spe_function *p, int reg); extern void spe_release_register(struct spe_function *p, int reg); +extern void spe_allocate_register_set(struct spe_function *p); +extern void spe_release_register_set(struct spe_function *p); extern void spe_print_code(struct spe_function *p, boolean enable); extern void spe_indent(struct spe_function *p, int spaces); @@ -307,6 +318,22 @@ spe_load_int(struct spe_function *p, unsigned rT, int i); extern void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui); +/** And immediate value into rT. */ +extern void +spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui); + +/** Xor immediate value into rT. */ +extern void +spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui); + +/** Compare equal with immediate value. */ +extern void +spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui); + +/** Compare greater with immediate value. */ +extern void +spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui); + /** Replicate word 0 of rA across rT. */ extern void spe_splat(struct spe_function *p, unsigned rT, unsigned rA); diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 99329fd8e2..c223bc1744 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -227,6 +227,7 @@ struct cell_command_render float xmin, ymin, xmax, ymax; /* XXX another dummy field */ uint min_index; boolean inline_verts; + uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */ }; diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 653afc235d..f920ae13b4 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -54,10 +54,12 @@ * \param ifragZ_reg register containing integer fragment Z values (in) * \param ifbZ_reg register containing integer frame buffer Z values (in/out) * \param zmask_reg register containing result of Z test/comparison (out) + * + * Returns true if the Z-buffer needs to be updated. */ -static void -gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa, - struct spe_function *f, +static boolean +gen_depth_test(struct spe_function *f, + const struct pipe_depth_stencil_alpha_state *dsa, int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg) { /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_ @@ -132,7 +134,10 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa, * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ; */ spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg); + return true; } + + return false; } @@ -238,22 +243,34 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, * it and have to allocate and load it again unnecessarily. */ static inline void -setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value) +setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r) { if (*is_already_set) return; *r = spe_allocate_available_register(f); - spe_load_float(f, *r, value); - *is_already_set = true; } static inline void -release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r) +release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r) { if (!*is_already_set) return; spe_release_register(f, r); *is_already_set = false; } +static inline void +setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value) +{ + if (*is_already_set) return; + setup_optional_register(f, is_already_set, r); + spe_load_float(f, *r, value); +} + +static inline void +release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r) +{ + release_optional_register(f, is_already_set, r); +} + /** * Generate SPE code to implement the given blend mode for a quad of pixels. * \param f SPE function to append instruction onto. @@ -1117,6 +1134,633 @@ gen_colormask(struct spe_function *f, spe_release_register(f, colormask_reg); } +/* This function is annoyingly similar to gen_depth_test(), above, except + * that instead of comparing two varying values (i.e. fragment and buffer), + * we're comparing a varying value with a static value. As such, we have + * access to the Compare Immediate instructions where we don't in + * gen_depth_test(), which is what makes us very different. + * + * The return value in the stencil_pass_reg is a bitmask of valid + * fragments that also passed the stencil test. The bitmask of valid + * fragments that failed would be found in (mask_reg & ~stencil_pass_reg). + */ +static void +gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state, + unsigned int mask_reg, unsigned int fbS_reg, + unsigned int stencil_pass_reg) +{ + /* Generate code that puts the set of passing fragments into the stencil_pass_reg + * register, taking into account whether each fragment was active to begin with. + */ + switch (state->func) { + case PIPE_FUNC_EQUAL: + /* stencil_pass = mask & (s == reference) */ + spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + /* stencil_fail = mask & ~stencil_pass */ + break; + + case PIPE_FUNC_NOTEQUAL: + /* stencil_pass = mask & ~(s == reference) */ + spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + break; + + case PIPE_FUNC_GREATER: + /* stencil_pass = mask & (s > reference) */ + spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + break; + + case PIPE_FUNC_LESS: { + /* stencil_pass = mask & (reference > s) */ + /* There's no convenient Compare Less Than Immediate instruction, so + * we'll have to do this one the harder way, by loading a register and + * comparing directly. Compare Logical Greater Than Word (clgt) + * treats its operands as unsigned - no sign extension. + */ + unsigned int tmp_reg = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value); + spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); + spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + break; + } + + case PIPE_FUNC_LEQUAL: + /* stencil_pass = mask & (s <= reference) = mask & ~(s > reference) */ + spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + break; + + case PIPE_FUNC_GEQUAL: { + /* stencil_pass = mask & (s >= reference) = mask & ~(reference > s) */ + /* As above, we have to do this by loading a register */ + unsigned int tmp_reg = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value); + spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); + spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + break; + } + + case PIPE_FUNC_NEVER: + /* stencil_pass = mask & 0 = 0 */ + spe_load_uint(f, stencil_pass_reg, 0); + spe_move(f, stencil_pass_reg, mask_reg); /* zmask = mask */ + break; + + case PIPE_FUNC_ALWAYS: + /* stencil_pass = mask & 1 = mask */ + spe_move(f, stencil_pass_reg, mask_reg); + break; + } + + /* The fragments that passed the stencil test are now in stencil_pass_reg. + * The fragments that failed would be (mask_reg & ~stencil_pass_reg). + */ +} + +/* This function generates code that calculates a set of new stencil values + * given the earlier values and the operation to apply. It does not + * apply any tests. It is intended to be called up to 3 times + * (for the stencil fail operation, for the stencil pass-z fail operation, + * and for the stencil pass-z pass operation) to collect up to three + * possible sets of values, and for the caller to combine them based + * on the result of the tests. + * + * stencil_max_value should be (2^n - 1) where n is the number of bits + * in the stencil buffer - in other words, it should be usable as a mask. + */ +static void +gen_stencil_values(struct spe_function *f, unsigned int stencil_op, + unsigned int stencil_ref_value, unsigned int stencil_max_value, + unsigned int fbS_reg, unsigned int newS_reg) +{ + /* The code below assumes that newS_reg and fbS_reg are not the same + * register; if they can be, the calculations below will have to use + * an additional temporary register. For now, mark the assumption + * with an assertion that will fail if they are the same. + */ + ASSERT(fbS_reg != newS_reg); + + /* The code also assumes the the stencil_max_value is of the form + * 2^n-1 and can therefore be used as a mask for the valid bits in + * addition to a maximum. Make sure this is the case as well. + * The clever math below exploits the fact that incrementing a + * binary number serves to flip all the bits of a number starting at + * the LSB and continuing to (and including) the first zero bit + * found. That means that a number and its increment will always + * have at least one bit in common (the high order bit, if nothing + * else) *unless* the number is zero, *or* the number is of a form + * consisting of some number of 1s in the low-order bits followed + * by nothing but 0s in the high-order bits. The latter case + * implies it's of the form 2^n-1. + */ + ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0); + + switch(stencil_op) { + case PIPE_STENCIL_OP_KEEP: + /* newS = S */ + spe_move(f, newS_reg, fbS_reg); + break; + + case PIPE_STENCIL_OP_ZERO: + /* newS = 0 */ + spe_zero(f, newS_reg); + break; + + case PIPE_STENCIL_OP_REPLACE: + /* newS = stencil reference value */ + spe_load_uint(f, newS_reg, stencil_ref_value); + break; + + case PIPE_STENCIL_OP_INCR: { + /* newS = (s == max ? max : s + 1) */ + unsigned int equals_reg = spe_allocate_available_register(f); + + spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value); + /* Add Word Immediate computes rT = rA + 10-bit signed immediate */ + spe_ai(f, newS_reg, fbS_reg, 1); + /* Select from the current value or the new value based on the equality test */ + spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg); + + spe_release_register(f, equals_reg); + break; + } + case PIPE_STENCIL_OP_DECR: { + /* newS = (s == 0 ? 0 : s - 1) */ + unsigned int equals_reg = spe_allocate_available_register(f); + + spe_compare_equal_uint(f, equals_reg, fbS_reg, 0); + /* Add Word Immediate with a (-1) value works */ + spe_ai(f, newS_reg, fbS_reg, -1); + /* Select from the current value or the new value based on the equality test */ + spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg); + + spe_release_register(f, equals_reg); + break; + } + case PIPE_STENCIL_OP_INCR_WRAP: + /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can + * do a normal add and mask off the correct bits + */ + spe_ai(f, newS_reg, fbS_reg, 1); + spe_and_uint(f, newS_reg, newS_reg, stencil_max_value); + break; + + case PIPE_STENCIL_OP_DECR_WRAP: + /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */ + spe_ai(f, newS_reg, fbS_reg, -1); + spe_and_uint(f, newS_reg, newS_reg, stencil_max_value); + break; + + case PIPE_STENCIL_OP_INVERT: + /* newS = ~s. We take advantage of the mask/max value to invert only + * the valid bits for the field so we don't have to do an extra "and". + */ + spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value); + break; + + default: + ASSERT(0); + } +} + + +/* This function generates code to get all the necessary possible + * stencil values. For each of the output registers (fail_reg, + * zfail_reg, and zpass_reg), it either allocates a new register + * and calculates a new set of values based on the stencil operation, + * or it reuses a register allocation and calculation done for an + * earlier (matching) operation, or it reuses the fbS_reg register + * (if the stencil operation is KEEP, which doesn't change the + * stencil buffer). + * + * Since this function allocates a variable number of registers, + * to avoid incurring complex logic to free them, they should + * be allocated after a spe_allocate_register_set() call + * and released by the corresponding spe_release_register_set() call. + */ +static void +gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa, + unsigned int fbS_reg, + unsigned int *fail_reg, unsigned int *zfail_reg, + unsigned int *zpass_reg, unsigned int *back_fail_reg, + unsigned int *back_zfail_reg, unsigned int *back_zpass_reg) +{ + unsigned zfail_op, back_zfail_op; + + /* Stenciling had better be enabled here */ + ASSERT(dsa->stencil[0].enabled); + + /* If the depth test is not enabled, it is treated as though it always + * passes. In particular, that means that the "zfail_op" (and the backfacing + * counterpart, if active) are not considered - a failing stencil test will + * trigger the "fail_op", and a passing stencil test will trigger the + * "zpass_op". + * + * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP, + * we keep them from being calculated. + */ + if (dsa->depth.enabled) { + zfail_op = dsa->stencil[0].zfail_op; + back_zfail_op = dsa->stencil[1].zfail_op; + } + else { + zfail_op = PIPE_STENCIL_OP_KEEP; + back_zfail_op = PIPE_STENCIL_OP_KEEP; + } + + /* One-sided or front-facing stencil */ + if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) { + *fail_reg = fbS_reg; + } + else { + *fail_reg = spe_allocate_available_register(f); + gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value, + 0xff, fbS_reg, *fail_reg); + } + + if (zfail_op == PIPE_STENCIL_OP_KEEP) { + *zfail_reg = fbS_reg; + } + else if (zfail_op == dsa->stencil[0].fail_op) { + *zfail_reg = *fail_reg; + } + else { + *zfail_reg = spe_allocate_available_register(f); + gen_stencil_values(f, dsa->stencil[0].zfail_op, dsa->stencil[0].ref_value, + 0xff, fbS_reg, *zfail_reg); + } + + if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) { + *zpass_reg = fbS_reg; + } + else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) { + *zpass_reg = *fail_reg; + } + else if (dsa->stencil[0].zpass_op == zfail_op) { + *zpass_reg = *zfail_reg; + } + else { + *zpass_reg = spe_allocate_available_register(f); + gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value, + 0xff, fbS_reg, *zpass_reg); + } + + /* If two-sided stencil is enabled, we have more work to do. */ + if (!dsa->stencil[1].enabled) { + /* This just flags that the registers need not be deallocated later */ + *back_fail_reg = fbS_reg; + *back_zfail_reg = fbS_reg; + *back_zpass_reg = fbS_reg; + } + else { + /* Same calculations as above, but for the back stencil */ + if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) { + *back_fail_reg = fbS_reg; + } + else if (dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) { + *back_fail_reg = *fail_reg; + } + else if (dsa->stencil[1].fail_op == zfail_op) { + *back_fail_reg = *zfail_reg; + } + else if (dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) { + *back_fail_reg = *zpass_reg; + } + else { + *back_fail_reg = spe_allocate_available_register(f); + gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value, + 0xff, fbS_reg, *back_fail_reg); + } + + if (back_zfail_op == PIPE_STENCIL_OP_KEEP) { + *back_zfail_reg = fbS_reg; + } + else if (back_zfail_op == dsa->stencil[0].fail_op) { + *back_zfail_reg = *fail_reg; + } + else if (back_zfail_op == zfail_op) { + *back_zfail_reg = *zfail_reg; + } + else if (back_zfail_op == dsa->stencil[0].zpass_op) { + *back_zfail_reg = *zpass_reg; + } + else if (back_zfail_op == dsa->stencil[1].fail_op) { + *back_zfail_reg = *back_fail_reg; + } + else { + *back_zfail_reg = spe_allocate_available_register(f); + gen_stencil_values(f, dsa->stencil[1].zfail_op, dsa->stencil[1].ref_value, + 0xff, fbS_reg, *back_zfail_reg); + } + + if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) { + *back_zpass_reg = fbS_reg; + } + else if (dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) { + *back_zpass_reg = *fail_reg; + } + else if (dsa->stencil[1].zpass_op == zfail_op) { + *back_zpass_reg = *zfail_reg; + } + else if (dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) { + *back_zpass_reg = *zpass_reg; + } + else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) { + *back_zpass_reg = *back_fail_reg; + } + else if (dsa->stencil[1].zpass_op == back_zfail_op) { + *back_zpass_reg = *back_zfail_reg; + } + else { + *back_zfail_reg = spe_allocate_available_register(f); + gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value, + 0xff, fbS_reg, *back_zpass_reg); + } + } /* End of calculations for back-facing stencil */ +} + +static boolean +gen_stencil_depth_test(struct spe_function *f, + const struct pipe_depth_stencil_alpha_state *dsa, + const int const facing_reg, + const int mask_reg, const int fragZ_reg, + const int fbZ_reg, const int fbS_reg) +{ + /* True if we've generated code that could require writeback to the + * depth and/or stencil buffers + */ + boolean modified_buffers = false; + + boolean need_to_calculate_stencil_values; + boolean need_to_writemask_stencil_values; + + /* Registers. We may or may not actually allocate these, depending + * on whether the state values indicate that we need them. + */ + unsigned int stencil_pass_reg, stencil_fail_reg; + unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values; + unsigned int stencil_writemask_reg; + unsigned int zmask_reg; + unsigned int newS_reg; + + /* Stenciling is quite complex: up to six different configurable stencil + * operations/calculations can be required (three each for front-facing + * and back-facing fragments). Many of those operations will likely + * be identical, so there's good reason to try to avoid calculating + * the same values more than once (which unfortunately makes the code less + * straightforward). + * + * To make register management easier, we start a new + * register set; we can release all the registers in the set at + * once, and avoid having to keep track of exactly which registers + * we allocate. We can still allocate and free registers as + * desired (if we know we no longer need a register), but we don't + * have to spend the complexity to track the more difficult variant + * register usage scenarios. + */ + spe_allocate_register_set(f); + + /* Calculate the writemask. If the writemask is trivial (either + * all 0s, meaning that we don't need to calculate any stencil values + * because they're not going to change the stencil anyway, or all 1s, + * meaning that we have to calculate the stencil values but do not + * need to mask them), we can avoid generating code. Don't forget + * that we need to consider backfacing stencil, if enabled. + */ + if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) { + /* Trivial: don't need to calculate stencil values, and don't need to + * write them back to the framebuffer. + */ + need_to_calculate_stencil_values = false; + need_to_writemask_stencil_values = false; + } + else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) { + /* Still trivial, but a little less so. We need to write the stencil + * values, but we don't need to mask them. + */ + need_to_calculate_stencil_values = true; + need_to_writemask_stencil_values = false; + } + else { + /* The general case: calculate, mask, and write */ + need_to_calculate_stencil_values = true; + need_to_writemask_stencil_values = true; + + /* While we're here, generate code that calculates what the + * writemask should be. If backface stenciling is enabled, + * and the backface writemask is not the same as the frontface + * writemask, we'll have to generate code that merges the + * two masks into a single effective mask based on fragment facing. + */ + stencil_writemask_reg = spe_allocate_available_register(f); + spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask); + if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) { + unsigned int back_write_mask_reg = spe_allocate_available_register(f); + spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask); + spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg); + spe_release_register(f, back_write_mask_reg); + } + } + + /* At least one-sided stenciling must be on. Generate code that + * runs the stencil test on the basic/front-facing stencil, leaving + * the mask of passing stencil bits in stencil_pass_reg. This mask will + * be used both to mask the set of active pixels, and also to + * determine how the stencil buffer changes. + * + * This test will *not* change the value in mask_reg (because we don't + * yet know whether to apply the two-sided stencil or one-sided stencil). + */ + stencil_pass_reg = spe_allocate_available_register(f); + gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg); + + /* If two-sided stenciling is on, generate code to run the stencil + * test on the backfacing stencil as well, and combine the two results + * into the one correct result based on facing. + */ + if (dsa->stencil[1].enabled) { + unsigned int temp_reg = spe_allocate_available_register(f); + gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg); + spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg); + spe_release_register(f, temp_reg); + } + + /* Generate code that, given the mask of valid fragments and the + * mask of valid fragments that passed the stencil test, computes + * the mask of valid fragments that failed the stencil test. We + * have to do this before we run a depth test (because the + * depth test should not be performed on fragments that failed the + * stencil test, and because the depth test will update the + * mask of valid fragments based on the results of the depth test). + */ + stencil_fail_reg = spe_allocate_available_register(f); + spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg); + /* Now remove the stenciled-out pixels from the valid fragment mask, + * so we can later use the valid fragment mask in the depth test. + */ + spe_and(f, mask_reg, mask_reg, stencil_pass_reg); + + /* We may not need to calculate stencil values, if the writemask is off */ + if (need_to_calculate_stencil_values) { + unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values; + unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values; + + /* Generate code that calculates exactly which stencil values we need, + * without calculating the same value twice (say, if two different + * stencil ops have the same value). This code will work for one-sided + * and two-sided stenciling (so that we take into account that operations + * may match between front and back stencils), and will also take into + * account whether the depth test is enabled (if the depth test is off, + * we don't need any of the zfail results, because the depth test always + * is considered to pass if it is disabled). Any register value that + * does not need to be calculated will come back with the same value + * that's in fbS_reg. + * + * This function will allocate a variant number of registers that + * will be released as part of the register set. + */ + gen_get_stencil_values(f, dsa, fbS_reg, + &front_stencil_fail_values, &front_stencil_pass_depth_fail_values, + &front_stencil_pass_depth_pass_values, &back_stencil_fail_values, + &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values); + + /* Tricky, tricky, tricky - the things we do to create optimal + * code... + * + * The various stencil values registers may overlap with each other + * and with fbS_reg arbitrarily (as any particular operation is + * only calculated once and stored in one register, no matter + * how many times it is used). So we can't change the values + * within those registers directly - if we change a value in a + * register that's being referenced by two different calculations, + * we've just unwittingly changed the second value as well... + * + * Avoid this by allocating new registers to hold the results + * (there may be 2, if the depth test is off, or 3, if it is on). + * These will be released as part of the register set. + */ + if (!dsa->stencil[1].enabled) { + /* The easy case: if two-sided stenciling is *not* enabled, we + * just use the front-sided values. + */ + stencil_fail_values = front_stencil_fail_values; + stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values; + stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values; + } + else { /* two-sided stencil enabled */ + /* Allocate new registers for the needed merged values */ + stencil_fail_values = spe_allocate_available_register(f); + spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg); + if (dsa->depth.enabled) { + stencil_pass_depth_fail_values = spe_allocate_available_register(f); + spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg); + } + else { + stencil_pass_depth_fail_values = fbS_reg; + } + stencil_pass_depth_pass_values = spe_allocate_available_register(f); + spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg); + } + } + + /* We now have all the stencil values we need. We also need + * the results of the depth test to figure out which + * stencil values will become the new stencil values. (Even if + * we aren't actually calculating stencil values, we need to apply + * the depth test if it's enabled.) + * + * The code generated by gen_depth_test() returns the results of the + * test in the given register, but also alters the mask_reg based + * on the results of the test. + */ + if (dsa->depth.enabled) { + zmask_reg = spe_allocate_available_register(f); + modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg); + } + + if (need_to_calculate_stencil_values) { + /* If we need to writemask the stencil values before going into + * the stencil buffer, we'll have to use a new register to + * hold the new values. If not, we can just keep using the + * current register. + */ + if (need_to_writemask_stencil_values) { + newS_reg = spe_allocate_available_register(f); + spe_move(f, newS_reg, fbS_reg); + modified_buffers = true; + } + else { + newS_reg = fbS_reg; + } + + /* Merge in the selected stencil fail values */ + if (stencil_fail_values != fbS_reg) { + spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg); + } + + /* Same for the stencil pass/depth fail values. If this calculation + * is not needed (say, if depth test is off), then the + * stencil_pass_depth_fail_values register will be equal to fbS_reg + * and we'll skip the calculation. + */ + if (stencil_pass_depth_fail_values != fbS_reg) { + /* We don't actually have a stencil pass/depth fail mask yet. + * Calculate it here from the stencil passing mask and the + * depth passing mask. Note that zmask_reg *must* have been + * set above if we're here. + */ + unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f); + spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg); + + spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask); + + spe_release_register(f, stencil_pass_depth_fail_mask); + } + + /* Same for the stencil pass/depth pass mask */ + if (stencil_pass_depth_pass_values != fbS_reg) { + unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f); + spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg); + + spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask); + spe_release_register(f, stencil_pass_depth_pass_mask); + } + + /* Almost done. If we need to writemask, do it now, leaving the + * results in the fbS_reg register passed in. If we don't need + * to writemask, then the results are *already* in the fbS_reg, + * so there's nothing more to do. + */ + + if (need_to_writemask_stencil_values) { + /* The Select Bytes command makes a fine writemask. Where + * the mask is 0, the first (original) values are retained, + * effectively masking out changes. Where the mask is 1, the + * second (new) values are retained, incorporating changes. + */ + spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg); + } + } /* done calculating stencil values */ + + /* The stencil and/or depth values have been applied, and the + * mask_reg, fbS_reg, and fbZ_reg values have been updated. + * We're all done, except that we've allocated a fair number + * of registers that we didn't bother tracking. Release all + * those registers as part of the register set, and go home. + */ + spe_release_register_set(f); + + /* Return true if we could have modified the stencil and/or + * depth buffers. + */ + return modified_buffers; +} + + /** * Generate SPE code to implement the fragment operations (alpha test, * depth test, stencil test, blending, colormask, and final @@ -1156,6 +1800,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) const int fragB_reg = 10; /* vector float */ const int fragA_reg = 11; /* vector float */ const int mask_reg = 12; /* vector uint */ + const int facing_reg = 13; /* uint */ /* offset of quad from start of tile * XXX assuming 4-byte pixels for color AND Z/stencil!!!! @@ -1183,6 +1828,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_allocate_register(f, fragB_reg); spe_allocate_register(f, fragA_reg); spe_allocate_register(f, mask_reg); + spe_allocate_register(f, facing_reg); quad_offset_reg = spe_allocate_available_register(f); fbRGBA_reg = spe_allocate_available_register(f); @@ -1195,6 +1841,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) ASSERT(TILE_SIZE == 32); + spe_comment(f, 0, "Computing tile location in memory"); spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */ spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */ spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */ @@ -1205,124 +1852,164 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_release_register(f, y2_reg); } - if (dsa->alpha.enabled) { gen_alpha_test(dsa, f, mask_reg, fragA_reg); } + /* If we need the stencil buffers (because one- or two-sided stencil is + * enabled) or the depth buffer (because the depth test is enabled), + * go grab them. Note that if either one- or two-sided stencil is + * enabled, dsa->stencil[0].enabled will be true. + */ if (dsa->depth.enabled || dsa->stencil[0].enabled) { const enum pipe_format zs_format = cell->framebuffer.zsbuf->format; boolean write_depth_stencil; - int fbZ_reg = spe_allocate_available_register(f); /* Z values */ - int fbS_reg = spe_allocate_available_register(f); /* Stencil values */ + /* We may or may not need to allocate a register for Z or stencil values */ + boolean fbS_reg_set = false, fbZ_reg_set = false; + unsigned int fbS_reg, fbZ_reg = 0; + + spe_comment(f, 0, "Loading Z/stencil tile"); /* fetch quad of depth/stencil values from tile at (x,y) */ /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ + /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */ spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); - if (dsa->depth.enabled) { - /* Extract Z bits from fbZS_reg into fbZ_reg */ - if (zs_format == PIPE_FORMAT_S8Z24_UNORM || - zs_format == PIPE_FORMAT_X8Z24_UNORM) { - int mask_reg = spe_allocate_available_register(f); - spe_fsmbi(f, mask_reg, 0x7777); /* mask[0,1,2,3] = 0x00ffffff */ - spe_and(f, fbZ_reg, fbZS_reg, mask_reg); /* fbZ = fbZS & mask */ - spe_release_register(f, mask_reg); - /* OK, fbZ_reg has four 24-bit Z values now */ - } - else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || - zs_format == PIPE_FORMAT_Z24X8_UNORM) { - spe_rotmi(f, fbZ_reg, fbZS_reg, -8); /* fbZ = fbZS >> 8 */ - /* OK, fbZ_reg has four 24-bit Z values now */ - } - else if (zs_format == PIPE_FORMAT_Z32_UNORM) { - spe_move(f, fbZ_reg, fbZS_reg); - /* OK, fbZ_reg has four 32-bit Z values now */ - } - else if (zs_format == PIPE_FORMAT_Z16_UNORM) { - spe_move(f, fbZ_reg, fbZS_reg); - /* OK, fbZ_reg has four 16-bit Z values now */ - } - else { - ASSERT(0); /* invalid format */ - } - - /* Convert fragZ values from float[4] to 16, 24 or 32-bit uint[4] */ - if (zs_format == PIPE_FORMAT_S8Z24_UNORM || - zs_format == PIPE_FORMAT_X8Z24_UNORM || - zs_format == PIPE_FORMAT_Z24S8_UNORM || - zs_format == PIPE_FORMAT_Z24X8_UNORM) { - /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - /* fragZ = fragZ >> 8 */ - spe_rotmi(f, fragZ_reg, fragZ_reg, -8); - } - else if (zs_format == PIPE_FORMAT_Z32_UNORM) { - /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - } - else if (zs_format == PIPE_FORMAT_Z16_UNORM) { - /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - /* fragZ = fragZ >> 16 */ - spe_rotmi(f, fragZ_reg, fragZ_reg, -16); - } - } - else { - /* no Z test, but set Z to zero so we don't OR-in garbage below */ - spe_load_uint(f, fbZ_reg, 0); /* XXX set to zero for now */ + /* From the Z/stencil buffer format, pull out the bits we need for + * Z and/or stencil. We'll also convert the incoming fragment Z + * value in fragZ_reg from a floating point value in [0.0..1.0] to + * an unsigned integer value with the appropriate resolution. + */ + switch(zs_format) { + + case PIPE_FORMAT_S8Z24_UNORM: /* fall through */ + case PIPE_FORMAT_X8Z24_UNORM: + if (dsa->depth.enabled) { + /* We need the Z part at least */ + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + /* four 24-bit Z values in the low-order bits */ + spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 24-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + } + if (dsa->stencil[0].enabled) { + setup_optional_register(f, &fbS_reg_set, &fbS_reg); + /* four 8-bit Z values in the high-order bits */ + spe_rotmi(f, fbS_reg, fbZS_reg, -24); + } + break; + + case PIPE_FORMAT_Z24S8_UNORM: /* fall through */ + case PIPE_FORMAT_Z24X8_UNORM: + if (dsa->depth.enabled) { + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + /* shift by 8 to get the upper 24-bit values */ + spe_rotmi(f, fbS_reg, fbZS_reg, -8); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 24-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + } + if (dsa->stencil[0].enabled) { + setup_optional_register(f, &fbS_reg_set, &fbS_reg); + /* 8-bit stencil in the low-order bits - mask them out */ + spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff); + } + break; + + case PIPE_FORMAT_Z32_UNORM: + if (dsa->depth.enabled) { + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + /* Copy over 4 32-bit values */ + spe_move(f, fbZ_reg, fbZS_reg); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 32-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + } + /* No stencil, so can't do anything there */ + break; + + case PIPE_FORMAT_Z16_UNORM: + if (dsa->depth.enabled) { + /* XXX Not sure this is correct, but it was here before, so we're + * going with it for now + */ + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + /* Copy over 4 32-bit values */ + spe_move(f, fbZ_reg, fbZS_reg); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 16-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -16); + } + /* No stencil */ + break; + + default: + ASSERT(0); /* invalid format */ } - + /* If stencil is enabled, use the stencil-specific code + * generator to generate both the stencil and depth (if needed) + * tests. Otherwise, if only depth is enabled, generate + * a quick depth test. The test generators themselves will + * report back whether the depth/stencil buffer has to be + * written back. + */ if (dsa->stencil[0].enabled) { - /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */ - if (zs_format == PIPE_FORMAT_S8Z24_UNORM || - zs_format == PIPE_FORMAT_X8Z24_UNORM) { - /* XXX extract with a shift */ - ASSERT(0); - } - else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || - zs_format == PIPE_FORMAT_Z24X8_UNORM) { - /* XXX extract with a mask */ - ASSERT(0); - } - } - else { - /* no stencil test, but set to zero so we don't OR-in garbage below */ - spe_load_uint(f, fbS_reg, 0); /* XXX set to zero for now */ - } + /* This will perform the stencil and depth tests, and update + * the mask_reg, fbZ_reg, and fbS_reg as required by the + * tests. + */ + ASSERT(fbS_reg_set); + ASSERT(fbZ_reg_set); + spe_comment(f, 0, "Perform stencil test"); - if (dsa->stencil[0].enabled) { - /* XXX this may involve depth testing too */ - // gen_stencil_test(dsa, f, ... ); - ASSERT(0); + write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg); } else if (dsa->depth.enabled) { int zmask_reg = spe_allocate_available_register(f); - gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg); + spe_comment(f, 0, "Perform depth test"); + write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg); spe_release_register(f, zmask_reg); } - - /* do we need to write Z and/or Stencil back into framebuffer? */ - write_depth_stencil = (dsa->depth.writemask | - dsa->stencil[0].write_mask | - dsa->stencil[1].write_mask); + else { + write_depth_stencil = false; + } if (write_depth_stencil) { /* Merge latest Z and Stencil values into fbZS_reg. * fbZ_reg has four Z vals in bits [23..0] or bits [15..0]. * fbS_reg has four 8-bit Z values in bits [7..0]. */ + spe_comment(f, 0, "Storing depth/stencil values"); if (zs_format == PIPE_FORMAT_S8Z24_UNORM || zs_format == PIPE_FORMAT_X8Z24_UNORM) { - spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ - spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ + if (fbS_reg_set) { + spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ + } + else { + spe_move(f, fbZS_reg, fbZ_reg); + } } else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || zs_format == PIPE_FORMAT_Z24X8_UNORM) { spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ - spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ + if (fbS_reg_set) { + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ + } } else if (zs_format == PIPE_FORMAT_Z32_UNORM) { spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ @@ -1341,11 +2028,10 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); } - spe_release_register(f, fbZ_reg); - spe_release_register(f, fbS_reg); + release_optional_register(f, &fbZ_reg_set, fbZ_reg); + release_optional_register(f, &fbS_reg_set, fbS_reg); } - /* Get framebuffer quad/colors. We'll need these for blending, * color masking, and to obey the quad/pixel mask. * Load: fbRGBA_reg = memory[color_tile + quad_offset] @@ -1354,8 +2040,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) */ spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg); - if (blend->blend_enable) { + spe_comment(f, 0, "Perform blending"); gen_blend(blend, blend_color, f, color_format, fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg); } @@ -1369,19 +2055,21 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) int rgba_reg = spe_allocate_available_register(f); /* Pack four float colors as four 32-bit int colors */ + spe_comment(f, 0, "Convert fragment colors to framebuffer colors"); gen_pack_colors(f, color_format, fragR_reg, fragG_reg, fragB_reg, fragA_reg, rgba_reg); if (blend->logicop_enable) { + spe_comment(f, 0, "Compute logic op"); gen_logicop(blend, f, rgba_reg, fbRGBA_reg); } if (blend->colormask != PIPE_MASK_RGBA) { + spe_comment(f, 0, "Compute color mask"); gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg); } - /* Mix fragment colors with framebuffer colors using the quad/pixel mask: * if (mask[i]) * rgba[i] = rgba[i]; @@ -1393,6 +2081,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) /* Store updated quad in tile: * memory[color_tile + quad_offset] = rgba_reg; */ + spe_comment(f, 0, "Store framebuffer colors"); spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg); spe_release_register(f, rgba_reg); diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c index dd25ae880e..79cb8df82f 100644 --- a/src/gallium/drivers/cell/ppu/cell_render.c +++ b/src/gallium/drivers/cell/ppu/cell_render.c @@ -152,6 +152,7 @@ cell_flush_prim_buffer(struct cell_context *cell) struct cell_command_render *render = &cell_global.command[i].render; render->prim_type = PIPE_PRIM_TRIANGLES; render->num_verts = cell->prim_buffer.num_verts; + render->front_winding = cell->rasterizer->front_winding; render->vertex_size = cell->vertex_info->size * 4; render->xmin = cell->prim_buffer.xmin; render->ymin = cell->prim_buffer.ymin; diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c index aa63435b93..578ddf62dc 100644 --- a/src/gallium/drivers/cell/ppu/cell_vbuf.c +++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c @@ -214,6 +214,7 @@ cell_vbuf_draw(struct vbuf_render *vbr, render->opcode = CELL_CMD_RENDER; render->prim_type = cvbr->prim; + render->front_winding = cell->rasterizer->front_winding; render->num_indexes = nr_indices; render->min_index = min_index; diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 29a305232e..1cd577c23c 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -73,7 +73,8 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y, vector float fragGreen, vector float fragBlue, vector float fragAlpha, - vector unsigned int mask); + vector unsigned int mask, + uint facing); /** Function for running fragment program */ typedef void (*spu_fragment_program_func)(vector float *inputs, diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c index f107764fb2..d252fa6dc1 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -57,7 +57,8 @@ spu_fallback_fragment_ops(uint x, uint y, vector float fragG, vector float fragB, vector float fragA, - vector unsigned int mask) + vector unsigned int mask, + uint facing) { vector float frag_aos[4]; unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */ @@ -433,23 +434,23 @@ spu_fallback_fragment_ops(uint x, uint y, /* Form bitmask depending on color buffer format and colormask bits */ switch (spu.fb.color_format) { case PIPE_FORMAT_A8R8G8B8_UNORM: - if (spu.blend.colormask & (1<<0)) + if (spu.blend.colormask & PIPE_MASK_R) cmask |= 0x00ff0000; /* red */ - if (spu.blend.colormask & (1<<1)) + if (spu.blend.colormask & PIPE_MASK_G) cmask |= 0x0000ff00; /* green */ - if (spu.blend.colormask & (1<<2)) + if (spu.blend.colormask & PIPE_MASK_B) cmask |= 0x000000ff; /* blue */ - if (spu.blend.colormask & (1<<3)) + if (spu.blend.colormask & PIPE_MASK_A) cmask |= 0xff000000; /* alpha */ break; case PIPE_FORMAT_B8G8R8A8_UNORM: - if (spu.blend.colormask & (1<<0)) + if (spu.blend.colormask & PIPE_MASK_R) cmask |= 0x0000ff00; /* red */ - if (spu.blend.colormask & (1<<1)) + if (spu.blend.colormask & PIPE_MASK_G) cmask |= 0x00ff0000; /* green */ - if (spu.blend.colormask & (1<<2)) + if (spu.blend.colormask & PIPE_MASK_B) cmask |= 0xff000000; /* blue */ - if (spu.blend.colormask & (1<<3)) + if (spu.blend.colormask & PIPE_MASK_A) cmask |= 0x000000ff; /* alpha */ break; default: diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h index f817abf046..a61689c83a 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h @@ -38,7 +38,8 @@ spu_fallback_fragment_ops(uint x, uint y, vector float fragGreen, vector float fragBlue, vector float fragAlpha, - vector unsigned int mask); + vector unsigned int mask, + uint facing); #endif /* SPU_PER_FRAGMENT_OP */ diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c index 305dc98881..82dbeb26b7 100644 --- a/src/gallium/drivers/cell/spu/spu_render.c +++ b/src/gallium/drivers/cell/spu/spu_render.c @@ -279,7 +279,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) v1 = (const float *) (vertices + indexes[j+1] * vertex_size); v2 = (const float *) (vertices + indexes[j+2] * vertex_size); - drawn += tri_draw(v0, v1, v2, tx, ty); + drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding); } //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3); @@ -297,5 +297,3 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) printf("SPU %u: RENDER done\n", spu.init.id); } - - diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index 0a8fb56a62..6039cd80b2 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -118,6 +118,8 @@ struct setup_stage { float oneoverarea; + uint facing; + uint tx, ty; int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy; @@ -274,7 +276,7 @@ eval_z(float x, float y) * overall. */ static INLINE void -emit_quad( int x, int y, mask_t mask ) +emit_quad( int x, int y, mask_t mask) { /* If any bits in mask are set... */ if (spu_extract(spu_orx(mask), 0)) { @@ -344,7 +346,8 @@ emit_quad( int x, int y, mask_t mask ) fragZ, soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3], - mask); + mask, + setup.facing); } } @@ -379,7 +382,8 @@ emit_quad( int x, int y, mask_t mask ) outputs[0*4+1], outputs[0*4+2], outputs[0*4+3], - mask); + mask, + setup.facing); } } } @@ -483,7 +487,7 @@ static void flush_spans( void ) */ for (x = block(minleft); x <= block(maxright); x += 2) { #if 1 - emit_quad( x, setup.span.y, calculate_mask( x ) ); + emit_quad( x, setup.span.y, calculate_mask( x )); #endif } @@ -902,13 +906,28 @@ static void subtriangle( struct edge *eleft, eright->sy += lines; } +static float +determinant( const float *v0, + const float *v1, + const float *v2 ) +{ + /* edge vectors e = v0 - v2, f = v1 - v2 */ + const float ex = v0[0] - v2[0]; + const float ey = v0[1] - v2[1]; + const float fx = v1[0] - v2[0]; + const float fy = v1[1] - v2[1]; + + /* det = cross(e,f).z */ + return ex * fy - ey * fx; +} + /** * Draw triangle into tile at (tx, ty) (tile coords) * The tile data should have already been fetched. */ boolean -tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty) +tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding) { setup.tx = tx; setup.ty = ty; @@ -919,6 +938,12 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty) setup.cliprect_maxx = (tx + 1) * TILE_SIZE; setup.cliprect_maxy = (ty + 1) * TILE_SIZE; + /* Before we sort vertices, determine the facing of the triangle, + * which will be needed for front/back-face stencil application + */ + float det = determinant(v0, v1, v2); + setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW); + if (!setup_sort_vertices((struct vertex_header *) v0, (struct vertex_header *) v1, (struct vertex_header *) v2)) { diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h index aa694dd7c9..abc3d35160 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.h +++ b/src/gallium/drivers/cell/spu/spu_tri.h @@ -31,7 +31,7 @@ extern boolean -tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty); +tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding); #endif /* SPU_TRI_H */ -- cgit v1.2.3 From 800c350d71132bbb5126bd89310df540332978f4 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Tue, 7 Oct 2008 16:14:27 -0600 Subject: cell: add support for fragment shader constant buffers --- src/gallium/drivers/cell/common.h | 1 + src/gallium/drivers/cell/ppu/cell_gen_fp.c | 10 +++++++++- src/gallium/drivers/cell/ppu/cell_state.h | 5 +++-- src/gallium/drivers/cell/ppu/cell_state_emit.c | 19 +++++++++++++++++++ src/gallium/drivers/cell/ppu/cell_state_shader.c | 5 ++++- src/gallium/drivers/cell/spu/spu_command.c | 22 ++++++++++++++++++++++ src/gallium/drivers/cell/spu/spu_main.h | 8 +++++--- 7 files changed, 63 insertions(+), 7 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index c223bc1744..d261c1a640 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -94,6 +94,7 @@ #define CELL_CMD_STATE_BIND_VS 18 #define CELL_CMD_STATE_FRAGMENT_PROGRAM 19 #define CELL_CMD_STATE_ATTRIB_FETCH 20 +#define CELL_CMD_STATE_FS_CONSTANTS 21 #define CELL_CMD_VS_EXECUTE 22 #define CELL_CMD_FLUSH_BUFFER_RANGE 23 diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c index 131a2356fe..3065869d04 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -215,7 +215,15 @@ get_src_reg(struct codegen *gen, reg = gen->imm_regs[src->SrcRegister.Index][swizzle]; break; case TGSI_FILE_CONSTANT: - /* xxx fall-through for now / fix */ + { + /* offset is measured in quadwords, not bytes */ + int offset = src->SrcRegister.Index * 4 + swizzle; + reg = get_itemp(gen); + reg_is_itemp = TRUE; + /* Load: reg = memory[(machine_reg) + offset] */ + spe_lqd(gen->f, reg, gen->constants_reg, offset); + } + break; default: assert(0); } diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h index a7771a55a3..b193170f9c 100644 --- a/src/gallium/drivers/cell/ppu/cell_state.h +++ b/src/gallium/drivers/cell/ppu/cell_state.h @@ -44,8 +44,9 @@ #define CELL_NEW_TEXTURE 0x800 #define CELL_NEW_VERTEX 0x1000 #define CELL_NEW_VS 0x2000 -#define CELL_NEW_CONSTANTS 0x4000 -#define CELL_NEW_VERTEX_INFO 0x8000 +#define CELL_NEW_VS_CONSTANTS 0x4000 +#define CELL_NEW_FS_CONSTANTS 0x8000 +#define CELL_NEW_VERTEX_INFO 0x10000 extern void diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index a36fd3a601..cbfa393cfb 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -25,6 +25,7 @@ * **************************************************************************/ +#include "pipe/p_inlines.h" #include "util/u_memory.h" #include "cell_context.h" #include "cell_gen_fragment.h" @@ -162,6 +163,24 @@ cell_emit_state(struct cell_context *cell) } } + if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) { + const uint shader = PIPE_SHADER_FRAGMENT; + const uint num_const = cell->constants[shader].size / sizeof(float); + uint i, j; + float *buf = cell_batch_alloc(cell, 16 + num_const * sizeof(float)); + uint64_t *ibuf = (uint64_t *) buf; + const float *constants = pipe_buffer_map(cell->pipe.screen, + cell->constants[shader].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS; + ibuf[1] = num_const; + j = 4; + for (i = 0; i < num_const; i++) { + buf[j++] = constants[i]; + } + pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer); + } + if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_DEPTH_STENCIL | CELL_NEW_BLEND)) { diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c index 3a0d066da2..54a17eaf2b 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_shader.c +++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c @@ -197,7 +197,10 @@ cell_set_constant_buffer(struct pipe_context *pipe, buf->buffer); cell->constants[shader].size = buf->size; - cell->dirty |= CELL_NEW_CONSTANTS; + if (shader == PIPE_SHADER_VERTEX) + cell->dirty |= CELL_NEW_VS_CONSTANTS; + else if (shader == PIPE_SHADER_FRAGMENT) + cell->dirty |= CELL_NEW_FS_CONSTANTS; } diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index ec9da5d887..91a4c137e7 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -231,6 +231,25 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp) } +static uint +cmd_state_fs_constants(const uint64_t *buffer, uint pos) +{ + const uint num_const = buffer[pos + 1]; + const float *constants = (const float *) &buffer[pos + 2]; + uint i; + + DEBUG_PRINTF("CMD_STATE_FS_CONSTANTS (%u)\n", num_const); + + /* Expand each float to float[4] for SOA execution */ + for (i = 0; i < num_const; i++) { + spu.constants[i] = spu_splats(constants[i]); + } + + /* return new buffer pos (in 8-byte words) */ + return pos + 2 + num_const / 2; +} + + static void cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) { @@ -456,6 +475,9 @@ cmd_batch(uint opcode) pos += sizeof(*fp) / 8; } break; + case CELL_CMD_STATE_FS_CONSTANTS: + pos = cmd_state_fs_constants(buffer, pos); + break; case CELL_CMD_STATE_SAMPLER: { struct cell_command_sampler *sampler diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 1cd577c23c..82c9c69a3a 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -41,6 +41,9 @@ #define MAX_HEIGHT 1024 +#define CELL_MAX_CONSTANTS 32 /**< number of float[4] constants */ + + /** * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels. * The data may be addressed through several different types. @@ -157,9 +160,8 @@ struct spu_global /** Current texture sampler function */ spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS]; - /** Fragment program constants (XXX preliminary/used) */ -#define MAX_CONSTANTS 32 - vector float constants[MAX_CONSTANTS]; + /** Fragment program constants */ + vector float constants[4 * CELL_MAX_CONSTANTS]; } ALIGN16_ATTRIB; -- cgit v1.2.3 From feb5a26bb1e39099abd1caf4a405776ea0124315 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 8 Oct 2008 20:33:24 -0600 Subject: cell: increase SPU_MAX_FRAGMENT_PROGRAM_INSTS --- src/gallium/drivers/cell/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index d261c1a640..5dc756023f 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -128,7 +128,7 @@ struct cell_command_fragment_ops /** Max instructions for fragment programs */ -#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128 +#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 512 /** * Command to send a fragment program to SPUs. -- cgit v1.2.3 From 978799beb2a9c51550abb1f37bb6f63d06bc4717 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 13 Oct 2008 16:43:11 -0600 Subject: cell: initial work for mipmap texture filtering --- src/gallium/drivers/cell/common.h | 6 +- src/gallium/drivers/cell/ppu/cell_screen.c | 4 +- src/gallium/drivers/cell/ppu/cell_state_emit.c | 18 ++-- src/gallium/drivers/cell/ppu/cell_texture.c | 48 ++++++---- src/gallium/drivers/cell/ppu/cell_texture.h | 6 +- src/gallium/drivers/cell/spu/spu_command.c | 37 +++++--- src/gallium/drivers/cell/spu/spu_funcs.c | 1 + src/gallium/drivers/cell/spu/spu_main.h | 7 +- src/gallium/drivers/cell/spu/spu_texture.c | 120 ++++++++++++++++++------- src/gallium/drivers/cell/spu/spu_texture.h | 6 ++ 10 files changed, 176 insertions(+), 77 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 5dc756023f..e4de9a551d 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -67,6 +67,7 @@ #define CELL_MAX_SPUS 6 #define CELL_MAX_SAMPLERS 4 +#define CELL_MAX_TEXTURE_LEVELS 12 /* 2k x 2k */ #define TILE_SIZE 32 @@ -251,8 +252,9 @@ struct cell_command_texture { uint64_t opcode; /**< CELL_CMD_STATE_TEXTURE */ uint unit; - void *start; /**< Address in main memory */ - ushort width, height; + void *start[CELL_MAX_TEXTURE_LEVELS]; /**< Address in main memory */ + ushort width[CELL_MAX_TEXTURE_LEVELS]; + ushort height[CELL_MAX_TEXTURE_LEVELS]; }; diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c index 47ba6fa290..d223557950 100644 --- a/src/gallium/drivers/cell/ppu/cell_screen.c +++ b/src/gallium/drivers/cell/ppu/cell_screen.c @@ -76,11 +76,11 @@ cell_get_param(struct pipe_screen *screen, int param) case PIPE_CAP_TEXTURE_SHADOW_MAP: return 10; case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: - return 12; /* max 2Kx2K */ + return CELL_MAX_TEXTURE_LEVELS; case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: return 8; /* max 128x128x128 */ case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: - return 12; /* max 2Kx2K */ + return CELL_MAX_TEXTURE_LEVELS; default: return 10; } diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index cbfa393cfb..7090b4c99f 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -211,14 +211,20 @@ cell_emit_state(struct cell_context *cell) texture->opcode = CELL_CMD_STATE_TEXTURE; texture->unit = i; if (cell->texture[i]) { - texture->start = cell->texture[i]->tiled_data; - texture->width = cell->texture[i]->base.width[0]; - texture->height = cell->texture[i]->base.height[0]; + uint level; + for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) { + texture->start[level] = cell->texture[i]->tiled_data[level]; + texture->width[level] = cell->texture[i]->base.width[level]; + texture->height[level] = cell->texture[i]->base.height[level]; + } } else { - texture->start = NULL; - texture->width = 1; - texture->height = 1; + uint level; + for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) { + texture->start[level] = NULL; + texture->width[level] = 1; + texture->height[level] = 1; + } } } } diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c index b6590dfb86..f5f81ac3cc 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.c +++ b/src/gallium/drivers/cell/ppu/cell_texture.c @@ -66,6 +66,8 @@ cell_texture_layout(struct cell_texture * spt) unsigned size; unsigned w_tile, h_tile; + assert(level < CELL_MAX_TEXTURE_LEVELS); + /* width, height, rounded up to tile size */ w_tile = align(width, TILE_SIZE); h_tile = align(height, TILE_SIZE); @@ -249,33 +251,41 @@ cell_tile_texture(struct cell_context *cell, struct cell_texture *texture) { struct pipe_screen *screen = cell->pipe.screen; - uint face = 0, level = 0, zslice = 0; - struct pipe_surface *surf; - const uint w = texture->base.width[0], h = texture->base.height[0]; + uint face = 0, level, zslice = 0; const uint *src; - /* temporary restrictions: */ - assert(w >= TILE_SIZE); - assert(h >= TILE_SIZE); - assert(w % TILE_SIZE == 0); - assert(h % TILE_SIZE == 0); + for (level = 0; level <= texture->base.last_level; level++) { + if (!texture->tiled_data[level]) { + struct pipe_surface *surf; - surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice, - PIPE_BUFFER_USAGE_CPU_WRITE); - ASSERT(surf); + const uint w = texture->base.width[level], h = texture->base.height[level]; - src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE); + if (w < 32 || h < 32) + continue; + /* temporary restrictions: */ + assert(w >= TILE_SIZE); + assert(h >= TILE_SIZE); + assert(w % TILE_SIZE == 0); + assert(h % TILE_SIZE == 0); - if (texture->tiled_data) { - align_free(texture->tiled_data); - } - texture->tiled_data = align_malloc(w * h * 4, 16); + surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice, + PIPE_BUFFER_USAGE_CPU_WRITE); + ASSERT(surf); + + src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE); - tile_copy_data(w, h, TILE_SIZE, texture->tiled_data, src); + if (texture->tiled_data[level]) { + align_free(texture->tiled_data[level]); + } + texture->tiled_data[level] = align_malloc(w * h * 4, 16); - pipe_surface_unmap(surf); + tile_copy_data(w, h, TILE_SIZE, texture->tiled_data[level], src); - pipe_surface_reference(&surf, NULL); + pipe_surface_unmap(surf); + + pipe_surface_reference(&surf, NULL); + } + } } diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h index 6d37e95ebc..6d35736984 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.h +++ b/src/gallium/drivers/cell/ppu/cell_texture.h @@ -40,15 +40,15 @@ struct cell_texture { struct pipe_texture base; - unsigned long level_offset[PIPE_MAX_TEXTURE_LEVELS]; - unsigned long stride[PIPE_MAX_TEXTURE_LEVELS]; + unsigned long level_offset[CELL_MAX_TEXTURE_LEVELS]; + unsigned long stride[CELL_MAX_TEXTURE_LEVELS]; /* The data is held here: */ struct pipe_buffer *buffer; unsigned long buffer_size; - void *tiled_data; /* XXX this may be temporary */ /*ALIGN16*/ + void *tiled_data[CELL_MAX_TEXTURE_LEVELS]; /* XXX this may be temporary */ /*ALIGN16*/ }; diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index 64890f6dbd..089af22415 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -301,6 +301,12 @@ cmd_state_sampler(const struct cell_command_sampler *sampler) DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit); spu.sampler[sampler->unit] = sampler->state; +#if 0 + if (spu.sampler[sampler->unit].min_mip_filter != PIPE_TEX_MIPFILTER_NONE) { + spu.sample_texture4[sampler->unit] = sample_texture4_lod; + } + else +#endif if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) { spu.sample_texture4[sampler->unit] = sample_texture4_bilinear; } @@ -314,24 +320,29 @@ static void cmd_state_texture(const struct cell_command_texture *texture) { const uint unit = texture->unit; - const uint width = texture->width; - const uint height = texture->height; + uint i; - DEBUG_PRINTF("TEXTURE [%u] at %p size %u x %u\n", - texture->unit, texture->start, - texture->width, texture->height); + DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit); - spu.texture[unit].start = texture->start; - spu.texture[unit].width = width; - spu.texture[unit].height = height; + for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { + uint width = texture->width[i]; + uint height = texture->height[i]; - spu.texture[unit].width4 = spu_splats((float) width); - spu.texture[unit].height4 = spu_splats((float) height); + DEBUG_PRINTF(" LEVEL %u: at %p size[0] %u x %u\n", i, + texture->start[i], texture->width[i], texture->height[i]); - spu.texture[unit].tiles_per_row = width / TILE_SIZE; + spu.texture[unit].level[i].start = texture->start[i]; + spu.texture[unit].level[i].width = width; + spu.texture[unit].level[i].height = height; - spu.texture[unit].tex_size_x_mask = spu_splats(width - 1); - spu.texture[unit].tex_size_y_mask = spu_splats(height - 1); + spu.texture[unit].level[i].tiles_per_row = width / TILE_SIZE; + + spu.texture[unit].level[i].width4 = spu_splats((float) width); + spu.texture[unit].level[i].height4 = spu_splats((float) height); + + spu.texture[unit].level[i].tex_size_x_mask = spu_splats(width - 1); + spu.texture[unit].level[i].tex_size_y_mask = spu_splats(height - 1); + } } diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c index 4c90b701ee..f2946010bd 100644 --- a/src/gallium/drivers/cell/spu/spu_funcs.c +++ b/src/gallium/drivers/cell/spu/spu_funcs.c @@ -100,6 +100,7 @@ spu_log2(vector float x) return spu_mul(v, k); } + static struct vec_4x4 spu_txp(vector float s, vector float t, vector float r, vector float q, unsigned unit) diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index e3960dbe8b..9515543efe 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -107,7 +107,7 @@ struct spu_framebuffer } ALIGN16_ATTRIB; -struct spu_texture +struct spu_texture_level { void *start; ushort width, height; @@ -118,6 +118,11 @@ struct spu_texture vector unsigned int tex_size_y_mask; /**< splat(height-1) */ } ALIGN16_ATTRIB; +struct spu_texture +{ + struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS]; +} ALIGN16_ATTRIB; + /** * All SPU global/context state will be in a singleton object of this type: diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c index 96ef88822a..96c09e3ccb 100644 --- a/src/gallium/drivers/cell/spu/spu_texture.c +++ b/src/gallium/drivers/cell/spu/spu_texture.c @@ -27,6 +27,7 @@ #include +#include #include "pipe/p_compiler.h" #include "spu_main.h" @@ -42,11 +43,12 @@ void invalidate_tex_cache(void) { + uint lvl = 0; uint unit = 0; - uint bytes = 4 * spu.texture[unit].width - * spu.texture[unit].height; + uint bytes = 4 * spu.texture[unit].level[lvl].width + * spu.texture[unit].level[lvl].height; - spu_dcache_mark_dirty((unsigned) spu.texture[unit].start, bytes); + spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes); } @@ -64,15 +66,17 @@ invalidate_tex_cache(void) * a time. */ static void -get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels) +get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y, + vec_uint4 *texels) { - const unsigned texture_ea = (uintptr_t) spu.texture[unit].start; + const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; + const unsigned texture_ea = (uintptr_t) tlevel->start; vec_uint4 tile_x = spu_rlmask(x, -5); /* tile_x = x / 32 */ vec_uint4 tile_y = spu_rlmask(y, -5); /* tile_y = y / 32 */ const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */ const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */ - const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row); + const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row); const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t)); qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x); @@ -104,17 +108,18 @@ sample_texture4_nearest(vector float s, vector float t, vector float r, vector float q, uint unit, vector float colors[4]) { - vector float ss = spu_mul(s, spu.texture[unit].width4); - vector float tt = spu_mul(t, spu.texture[unit].height4); + const uint lvl = 0; + vector float ss = spu_mul(s, spu.texture[unit].level[lvl].width4); + vector float tt = spu_mul(t, spu.texture[unit].level[lvl].height4); vector unsigned int is = spu_convtu(ss, 0); vector unsigned int it = spu_convtu(tt, 0); vec_uint4 texels[4]; /* PIPE_TEX_WRAP_REPEAT */ - is = spu_and(is, spu.texture[unit].tex_size_x_mask); - it = spu_and(it, spu.texture[unit].tex_size_y_mask); + is = spu_and(is, spu.texture[unit].level[lvl].tex_size_x_mask); + it = spu_and(it, spu.texture[unit].level[lvl].tex_size_y_mask); - get_four_texels(unit, is, it, texels); + get_four_texels(unit, lvl, is, it, texels); /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */ spu_unpack_A8R8G8B8_transpose4(texels, colors); @@ -130,8 +135,9 @@ sample_texture4_bilinear(vector float s, vector float t, vector float r, vector float q, uint unit, vector float colors[4]) { - vector float ss = spu_madd(s, spu.texture[unit].width4, spu_splats(-0.5f)); - vector float tt = spu_madd(t, spu.texture[unit].height4, spu_splats(-0.5f)); + const uint lvl = 0; + vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4, spu_splats(-0.5f)); + vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, spu_splats(-0.5f)); vector unsigned int is0 = spu_convtu(ss, 0); vector unsigned int it0 = spu_convtu(tt, 0); @@ -141,17 +147,17 @@ sample_texture4_bilinear(vector float s, vector float t, vector unsigned int it1 = spu_add(it0, 1); /* PIPE_TEX_WRAP_REPEAT */ - is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask); - it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask); - is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask); - it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask); + is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask); + it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask); + is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask); + it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask); /* get packed int texels */ vector unsigned int texels[16]; - get_four_texels(unit, is0, it0, texels + 0); /* upper-left */ - get_four_texels(unit, is1, it0, texels + 4); /* upper-right */ - get_four_texels(unit, is0, it1, texels + 8); /* lower-left */ - get_four_texels(unit, is1, it1, texels + 12); /* lower-right */ + get_four_texels(unit, lvl, is0, it0, texels + 0); /* upper-left */ + get_four_texels(unit, lvl, is1, it0, texels + 4); /* upper-right */ + get_four_texels(unit, lvl, is0, it1, texels + 8); /* lower-left */ + get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */ /* XXX possibly rework following code to compute the weighted sample * colors with integer arithmetic for fewer int->float conversions. @@ -270,10 +276,11 @@ sample_texture4_bilinear_2(vector float s, vector float t, vector float r, vector float q, uint unit, vector float colors[4]) { + const uint lvl = 0; static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f}; /* Scale texcoords by size of texture, and add half pixel bias */ - vector float ss = spu_madd(s, spu.texture[unit].width4, half); - vector float tt = spu_madd(t, spu.texture[unit].height4, half); + vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4, half); + vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, half); /* convert float coords to fixed-pt coords with 8 fraction bits */ vector unsigned int is = spu_convtu(ss, 8); @@ -294,17 +301,17 @@ sample_texture4_bilinear_2(vector float s, vector float t, vector unsigned int it1 = spu_add(it0, 1); /* PIPE_TEX_WRAP_REPEAT */ - is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask); - it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask); - is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask); - it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask); + is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask); + it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask); + is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask); + it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask); /* get packed int texels */ vector unsigned int texels[16]; - get_four_texels(unit, is0, it0, texels + 0); /* upper-left */ - get_four_texels(unit, is1, it0, texels + 4); /* upper-right */ - get_four_texels(unit, is0, it1, texels + 8); /* lower-left */ - get_four_texels(unit, is1, it1, texels + 12); /* lower-right */ + get_four_texels(unit, lvl, is0, it0, texels + 0); /* upper-left */ + get_four_texels(unit, lvl, is1, it0, texels + 4); /* upper-right */ + get_four_texels(unit, lvl, is0, it1, texels + 8); /* lower-left */ + get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */ /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */ { @@ -363,3 +370,54 @@ sample_texture4_bilinear_2(vector float s, vector float t, cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); colors[3] = spu_convtf(cSum, 24); } + + + +/** + * Compute level of detail factor from texcoords. + */ +static float +compute_lambda(uint unit, vector float s, vector float t) +{ + uint lvl = 0; + float width = spu.texture[unit].level[lvl].width; + float height = spu.texture[unit].level[lvl].width; + float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0)); + float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0)); + float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0)); + float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0)); + float x = dsdx * dsdx + dtdx * dtdx; + float y = dsdy * dsdy + dtdy * dtdy; + float rho = x > y ? x : y; + rho = sqrtf(rho); + float lambda = logf(rho) * 1.442695f; + return lambda; +} + + + +/** + * Texture sampling with level of detail selection. + */ +void +sample_texture4_lod(vector float s, vector float t, + vector float r, vector float q, + uint unit, vector float colors[4]) +{ + float lambda = compute_lambda(unit, s, t); + + if (lambda < spu.sampler[unit].min_lod) + lambda = spu.sampler[unit].min_lod; + else if (lambda > spu.sampler[unit].max_lod) + lambda = spu.sampler[unit].max_lod; + + /* hack for now */ + int level = (int) lambda; + if (level > 3) + level = 3; + + /* + sample_texture4_bilinear_2(s, t, r, q, unit, level, colors); + */ +} + diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h index 38a17deda2..4802f7c47c 100644 --- a/src/gallium/drivers/cell/spu/spu_texture.h +++ b/src/gallium/drivers/cell/spu/spu_texture.h @@ -53,4 +53,10 @@ sample_texture4_bilinear_2(vector float s, vector float t, uint unit, vector float colors[4]); +extern void +sample_texture4_lod(vector float s, vector float t, + vector float r, vector float q, + uint unit, vector float colors[4]); + + #endif /* SPU_TEXTURE_H */ -- cgit v1.2.3 From 8f7c6b55ae962e30f32cfec9a14a652d3b5b5943 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Tue, 14 Oct 2008 17:11:29 -0600 Subject: cell: support for cubemaps Though, progs/demos/cubemap.c doesn't quite work right... --- src/gallium/drivers/cell/common.h | 1 + src/gallium/drivers/cell/ppu/cell_state_emit.c | 2 + src/gallium/drivers/cell/ppu/cell_texture.c | 37 ++++-- src/gallium/drivers/cell/spu/spu_command.c | 17 ++- src/gallium/drivers/cell/spu/spu_funcs.c | 2 +- src/gallium/drivers/cell/spu/spu_main.h | 4 +- src/gallium/drivers/cell/spu/spu_texture.c | 171 ++++++++++++++++++++++--- src/gallium/drivers/cell/spu/spu_texture.h | 21 ++- 8 files changed, 214 insertions(+), 41 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index e4de9a551d..c1e78f4db3 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -251,6 +251,7 @@ struct cell_command_sampler struct cell_command_texture { uint64_t opcode; /**< CELL_CMD_STATE_TEXTURE */ + uint target; /**< PIPE_TEXTURE_x */ uint unit; void *start[CELL_MAX_TEXTURE_LEVELS]; /**< Address in main memory */ ushort width[CELL_MAX_TEXTURE_LEVELS]; diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index cae546b700..d4a867ffcf 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -217,6 +217,7 @@ cell_emit_state(struct cell_context *cell) texture->width[level] = cell->texture[i]->base.width[level]; texture->height[level] = cell->texture[i]->base.height[level]; } + texture->target = cell->texture[i]->base.target; } else { uint level; @@ -225,6 +226,7 @@ cell_emit_state(struct cell_context *cell) texture->width[level] = 0; texture->height[level] = 0; } + texture->target = 0; } } } diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c index 4fd66bdea0..4c92ef154f 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.c +++ b/src/gallium/drivers/cell/ppu/cell_texture.c @@ -137,6 +137,7 @@ cell_texture_release(struct pipe_screen *screen, */ if (--(*pt)->refcount <= 0) { struct cell_texture *ct = cell_texture(*pt); + uint i; /* DBG("%s deleting %p\n", __FUNCTION__, (void *) ct); @@ -144,6 +145,12 @@ cell_texture_release(struct pipe_screen *screen, pipe_buffer_reference(screen, &ct->buffer, NULL); + for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { + if (ct->tiled_data[i]) { + FREE(ct->tiled_data[i]); + } + } + FREE(ct); } *pt = NULL; @@ -204,27 +211,33 @@ static void cell_twiddle_texture(struct pipe_screen *screen, struct pipe_surface *surface) { - struct cell_texture *texture = cell_texture(surface->texture); + struct cell_texture *ct = cell_texture(surface->texture); const uint level = surface->level; - const uint texWidth = texture->base.width[level]; - const uint texHeight = texture->base.height[level]; + const uint texWidth = ct->base.width[level]; + const uint texHeight = ct->base.height[level]; const uint bufWidth = align(texWidth, TILE_SIZE); const uint bufHeight = align(texHeight, TILE_SIZE); const void *map = pipe_buffer_map(screen, surface->buffer, PIPE_BUFFER_USAGE_CPU_READ); const uint *src = (const uint *) ((const ubyte *) map + surface->offset); - switch (texture->base.format) { + switch (ct->base.format) { case PIPE_FORMAT_A8R8G8B8_UNORM: - /* free old tiled data */ - if (texture->tiled_data[level]) { - align_free(texture->tiled_data[level]); + { + int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1; + int offset = bufWidth * bufHeight * 4 * surface->face; + uint *dst; + + if (!ct->tiled_data[level]) { + ct->tiled_data[level] = + align_malloc(bufWidth * bufHeight * 4 * numFaces, 16); + } + + dst = (uint *) ((ubyte *) ct->tiled_data[level] + offset); + + twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst, + surface->stride, src); } - /* alloc new tiled data */ - texture->tiled_data[level] = align_malloc(bufWidth * bufHeight * 4, 16); - twiddle_image_uint(texWidth, texHeight, TILE_SIZE, - texture->tiled_data[level], - surface->stride, src); break; default: printf("Cell: twiddle unsupported texture format\n"); diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index b1efe97e76..c951fa6f31 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -301,7 +301,8 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) */ static void update_tex_masks(struct spu_texture *texture, - const struct pipe_sampler_state *sampler) + const struct pipe_sampler_state *sampler, + uint unit) { uint i; @@ -328,6 +329,11 @@ update_tex_masks(struct spu_texture *texture, texture->level[i].scale_t = spu_splats(1.0f); } } + + /* XXX temporary hack */ + if (texture->target == PIPE_TEXTURE_CUBE) { + spu.sample_texture4[unit] = sample_texture4_cube; + } } @@ -378,7 +384,7 @@ cmd_state_sampler(const struct cell_command_sampler *sampler) ASSERT(0); } - update_tex_masks(&spu.texture[unit], &spu.sampler[unit]); + update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit); } @@ -393,6 +399,7 @@ cmd_state_texture(const struct cell_command_texture *texture) DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit); spu.texture[unit].max_level = 0; + spu.texture[unit].target = texture->target; for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { uint width = texture->width[i]; @@ -408,6 +415,10 @@ cmd_state_texture(const struct cell_command_texture *texture) spu.texture[unit].level[i].tiles_per_row = (width + TILE_SIZE - 1) / TILE_SIZE; + spu.texture[unit].level[i].bytes_per_image = + 4 * ((width + TILE_SIZE - 1) & ~(TILE_SIZE-1)) + * ((height + TILE_SIZE - 1) & ~(TILE_SIZE-1)); + spu.texture[unit].level[i].max_s = spu_splats((int) width - 1); spu.texture[unit].level[i].max_t = spu_splats((int) height - 1); @@ -415,7 +426,7 @@ cmd_state_texture(const struct cell_command_texture *texture) spu.texture[unit].max_level = i; } - update_tex_masks(&spu.texture[unit], &spu.sampler[unit]); + update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit); //Debug=0; } diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c index 66b82f673d..5c3ee305d4 100644 --- a/src/gallium/drivers/cell/spu/spu_funcs.c +++ b/src/gallium/drivers/cell/spu/spu_funcs.c @@ -106,7 +106,7 @@ spu_txp(vector float s, vector float t, vector float r, vector float q, unsigned unit) { struct vec_4x4 colors; - spu.sample_texture4[unit](s, t, r, q, unit, 0, colors.v); + spu.sample_texture4[unit](s, t, r, q, unit, 0, 0, colors.v); return colors; } diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 45c6f4ced1..8781041bff 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -68,7 +68,7 @@ typedef void (*spu_sample_texture4_func)(vector float s, vector float t, vector float r, vector float q, - uint unit, uint level, + uint unit, uint level, uint face, vector float colors[4]); @@ -113,6 +113,7 @@ struct spu_texture_level void *start; ushort width, height; ushort tiles_per_row; + uint bytes_per_image; /** texcoord scale factors */ vector float scale_s, scale_t; /** texcoord masks (if REPEAT then size-1, else ~0) */ @@ -126,6 +127,7 @@ struct spu_texture { struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS]; uint max_level; + uint target; /**< PIPE_TEXTURE_x */ } ALIGN16_ATTRIB; diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c index b21c43a467..2570f02c73 100644 --- a/src/gallium/drivers/cell/spu/spu_texture.c +++ b/src/gallium/drivers/cell/spu/spu_texture.c @@ -48,6 +48,9 @@ invalidate_tex_cache(void) uint bytes = 4 * spu.texture[unit].level[lvl].width * spu.texture[unit].level[lvl].height; + if (spu.texture[unit].target == PIPE_TEXTURE_CUBE) + bytes *= 6; + spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes); } } @@ -67,11 +70,11 @@ invalidate_tex_cache(void) * a time. */ static void -get_four_texels(uint unit, uint level, vec_int4 x, vec_int4 y, +get_four_texels(uint unit, uint level, uint face, vec_int4 x, vec_int4 y, vec_uint4 *texels) { const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; - const unsigned texture_ea = (uintptr_t) tlevel->start; + unsigned texture_ea = (uintptr_t) tlevel->start; const vec_int4 tile_x = spu_rlmask(x, -5); /* tile_x = x / 32 */ const vec_int4 tile_y = spu_rlmask(y, -5); /* tile_y = y / 32 */ const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */ @@ -88,6 +91,8 @@ get_four_texels(uint unit, uint level, vec_int4 x, vec_int4 y, vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset); + texture_ea = texture_ea + face * tlevel->bytes_per_image; + spu_dcache_fetch_unaligned((qword *) & texels[0], texture_ea + spu_extract(offset, 0), 4); spu_dcache_fetch_unaligned((qword *) & texels[1], @@ -121,7 +126,8 @@ spu_clamp(vector signed int vec, vector signed int max) void sample_texture4_nearest(vector float s, vector float t, vector float r, vector float q, - uint unit, uint level, vector float colors[4]) + uint unit, uint level, uint face, + vector float colors[4]) { const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; vector float ss = spu_mul(s, tlevel->scale_s); @@ -138,7 +144,7 @@ sample_texture4_nearest(vector float s, vector float t, is = spu_clamp(is, tlevel->max_s); it = spu_clamp(it, tlevel->max_t); - get_four_texels(unit, level, is, it, texels); + get_four_texels(unit, level, face, is, it, texels); /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */ spu_unpack_A8R8G8B8_transpose4(texels, colors); @@ -152,11 +158,14 @@ sample_texture4_nearest(vector float s, vector float t, void sample_texture4_bilinear(vector float s, vector float t, vector float r, vector float q, - uint unit, uint level, vector float colors[4]) + uint unit, uint level, uint face, + vector float colors[4]) { const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; - vector float ss = spu_madd(s, tlevel->scale_s, spu_splats(-0.5f)); - vector float tt = spu_madd(t, tlevel->scale_t, spu_splats(-0.5f)); + static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f}; + + vector float ss = spu_madd(s, tlevel->scale_s, half); + vector float tt = spu_madd(t, tlevel->scale_t, half); vector signed int is0 = spu_convts(ss, 0); vector signed int it0 = spu_convts(tt, 0); @@ -179,10 +188,10 @@ sample_texture4_bilinear(vector float s, vector float t, /* get packed int texels */ vector unsigned int texels[16]; - get_four_texels(unit, level, is0, it0, texels + 0); /* upper-left */ - get_four_texels(unit, level, is1, it0, texels + 4); /* upper-right */ - get_four_texels(unit, level, is0, it1, texels + 8); /* lower-left */ - get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */ + get_four_texels(unit, level, face, is0, it0, texels + 0); /* upper-left */ + get_four_texels(unit, level, face, is1, it0, texels + 4); /* upper-right */ + get_four_texels(unit, level, face, is0, it1, texels + 8); /* lower-left */ + get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */ /* XXX possibly rework following code to compute the weighted sample * colors with integer arithmetic for fewer int->float conversions. @@ -299,10 +308,12 @@ transpose(vector unsigned int *mOut0, void sample_texture4_bilinear_2(vector float s, vector float t, vector float r, vector float q, - uint unit, uint level, vector float colors[4]) + uint unit, uint level, uint face, + vector float colors[4]) { const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f}; + /* Scale texcoords by size of texture, and add half pixel bias */ vector float ss = spu_madd(s, tlevel->scale_s, half); vector float tt = spu_madd(t, tlevel->scale_t, half); @@ -339,10 +350,10 @@ sample_texture4_bilinear_2(vector float s, vector float t, /* get packed int texels */ vector unsigned int texels[16]; - get_four_texels(unit, level, is0, it0, texels + 0); /* upper-left */ - get_four_texels(unit, level, is1, it0, texels + 4); /* upper-right */ - get_four_texels(unit, level, is0, it1, texels + 8); /* lower-left */ - get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */ + get_four_texels(unit, level, face, is0, it0, texels + 0); /* upper-left */ + get_four_texels(unit, level, face, is1, it0, texels + 4); /* upper-right */ + get_four_texels(unit, level, face, is0, it1, texels + 8); /* lower-left */ + get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */ /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */ { @@ -433,7 +444,8 @@ compute_lambda(uint unit, vector float s, vector float t) void sample_texture4_lod(vector float s, vector float t, vector float r, vector float q, - uint unit, uint level_ignored, vector float colors[4]) + uint unit, uint level_ignored, uint face, + vector float colors[4]) { /* * Note that we're computing a lambda/lod here that's used for all @@ -452,15 +464,136 @@ sample_texture4_lod(vector float s, vector float t, if (lambda <= 0.0f) { /* magnify */ - spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, colors); + spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, 0, colors); } else { /* minify */ int level = (int) (lambda + 0.5f); if (level > (int) spu.texture[unit].max_level) level = spu.texture[unit].max_level; - spu.min_sample_texture4[unit](s, t, r, q, unit, level, colors); + spu.min_sample_texture4[unit](s, t, r, q, unit, level, 0, colors); /* XXX to do: mipmap level interpolation */ } } + +/** XXX need a SIMD version of this */ +static unsigned +choose_cube_face(float rx, float ry, float rz, float *newS, float *newT) +{ + /* + major axis + direction target sc tc ma + ---------- ------------------------------- --- --- --- + +rx TEXTURE_CUBE_MAP_POSITIVE_X_EXT -rz -ry rx + -rx TEXTURE_CUBE_MAP_NEGATIVE_X_EXT +rz -ry rx + +ry TEXTURE_CUBE_MAP_POSITIVE_Y_EXT +rx +rz ry + -ry TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT +rx -rz ry + +rz TEXTURE_CUBE_MAP_POSITIVE_Z_EXT +rx -ry rz + -rz TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT -rx -ry rz + */ + const float arx = fabsf(rx); + const float ary = fabsf(ry); + const float arz = fabsf(rz); + unsigned face; + float sc, tc, ma; + + if (arx > ary && arx > arz) { + if (rx >= 0.0F) { + face = PIPE_TEX_FACE_POS_X; + sc = -rz; + tc = -ry; + ma = arx; + } + else { + face = PIPE_TEX_FACE_NEG_X; + sc = rz; + tc = -ry; + ma = arx; + } + } + else if (ary > arx && ary > arz) { + if (ry >= 0.0F) { + face = PIPE_TEX_FACE_POS_Y; + sc = rx; + tc = rz; + ma = ary; + } + else { + face = PIPE_TEX_FACE_NEG_Y; + sc = rx; + tc = -rz; + ma = ary; + } + } + else { + if (rz > 0.0F) { + face = PIPE_TEX_FACE_POS_Z; + sc = rx; + tc = -ry; + ma = arz; + } + else { + face = PIPE_TEX_FACE_NEG_Z; + sc = -rx; + tc = -ry; + ma = arz; + } + } + + *newS = (sc / ma + 1.0F) * 0.5F; + *newT = (tc / ma + 1.0F) * 0.5F; + + return face; +} + + + +void +sample_texture4_cube(vector float s, vector float t, + vector float r, vector float q, + uint unit, uint level, int face_ignored, + vector float colors[4]) +{ + static const vector float zero = {0.0f, 0.0f, 0.0f, 0.0f}; + uint p, faces[4]; + float newS[4], newT[4]; + + /* Compute cube face referenced by the four sets of texcoords. + * XXX we should SIMD-ize this. + */ + for (p = 0; p < 4; p++) { + float rx = spu_extract(s, p); + float ry = spu_extract(t, p); + float rz = spu_extract(r, p); + faces[p] = choose_cube_face(rx, ry, rz, &newS[p], &newT[p]); + } + + if (faces[0] == faces[1] && + faces[0] == faces[2] && + faces[0] == faces[3]) { + /* GOOD! All four texcoords refer to the same cube face */ + s = (vector float) {newS[0], newS[1], newS[2], newS[3]}; + t = (vector float) {newT[0], newT[1], newT[2], newT[3]}; + sample_texture4_nearest(s, t, zero, zero, unit, level, faces[0], colors); + } + else { + /* BAD! The four texcoords refer to different faces */ + for (p = 0; p < 4; p++) { + vector float c[4]; + + sample_texture4_nearest(spu_splats(newS[p]), spu_splats(newT[p]), + zero, zero, unit, level, faces[p], c); + + float red = spu_extract(c[0], p); + float green = spu_extract(c[1], p); + float blue = spu_extract(c[2], p); + float alpha = spu_extract(c[3], p); + + colors[0] = spu_insert(red, colors[0], p); + colors[1] = spu_insert(green, colors[1], p); + colors[2] = spu_insert(blue, colors[2], p); + colors[3] = spu_insert(alpha, colors[3], p); + } + } +} diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h index ec06a50b4a..08b891a4a8 100644 --- a/src/gallium/drivers/cell/spu/spu_texture.h +++ b/src/gallium/drivers/cell/spu/spu_texture.h @@ -39,24 +39,35 @@ invalidate_tex_cache(void); extern void sample_texture4_nearest(vector float s, vector float t, vector float r, vector float q, - uint unit, uint level, vector float colors[4]); + uint unit, uint level, uint face, + vector float colors[4]); extern void sample_texture4_bilinear(vector float s, vector float t, vector float r, vector float q, - uint unit, uint level, vector float colors[4]); + uint unit, uint level, uint face, + vector float colors[4]); extern void sample_texture4_bilinear_2(vector float s, vector float t, - vector float r, vector float q, - uint unit, uint level, vector float colors[4]); + vector float r, vector float q, + uint unit, uint level, uint face, + vector float colors[4]); extern void sample_texture4_lod(vector float s, vector float t, vector float r, vector float q, - uint unit, uint level, vector float colors[4]); + uint unit, uint level, uint face, + vector float colors[4]); + + +extern void +sample_texture4_cube(vector float s, vector float t, + vector float r, vector float q, + uint unit, uint level_ignored, int face_ignored, + vector float colors[4]); #endif /* SPU_TEXTURE_H */ -- cgit v1.2.3 From 41ccdde767e7aba6e8e6a9a035eacd6338c03a95 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Tue, 14 Oct 2008 17:22:40 -0600 Subject: cell: initial bits for 3D texture support --- src/gallium/drivers/cell/common.h | 1 + src/gallium/drivers/cell/ppu/cell_state_emit.c | 2 ++ src/gallium/drivers/cell/spu/spu_command.c | 13 +++++++++++-- src/gallium/drivers/cell/spu/spu_main.h | 8 ++++---- src/gallium/drivers/cell/spu/spu_texture.c | 2 ++ 5 files changed, 20 insertions(+), 6 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index c1e78f4db3..b0169b8e32 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -256,6 +256,7 @@ struct cell_command_texture void *start[CELL_MAX_TEXTURE_LEVELS]; /**< Address in main memory */ ushort width[CELL_MAX_TEXTURE_LEVELS]; ushort height[CELL_MAX_TEXTURE_LEVELS]; + ushort depth[CELL_MAX_TEXTURE_LEVELS]; }; diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index d4a867ffcf..bb694aa107 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -216,6 +216,7 @@ cell_emit_state(struct cell_context *cell) texture->start[level] = cell->texture[i]->tiled_data[level]; texture->width[level] = cell->texture[i]->base.width[level]; texture->height[level] = cell->texture[i]->base.height[level]; + texture->depth[level] = cell->texture[i]->base.depth[level]; } texture->target = cell->texture[i]->base.target; } @@ -225,6 +226,7 @@ cell_emit_state(struct cell_context *cell) texture->start[level] = NULL; texture->width[level] = 0; texture->height[level] = 0; + texture->depth[level] = 0; } texture->target = 0; } diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index c951fa6f31..c28677ebf8 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -59,6 +59,14 @@ static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS] +static INLINE int +align(int value, int alignment) +{ + return (value + alignment - 1) & ~(alignment - 1); +} + + + /** * Tell the PPU that this SPU has finished copying a buffer to * local store and that it may be reused by the PPU. @@ -404,6 +412,7 @@ cmd_state_texture(const struct cell_command_texture *texture) for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { uint width = texture->width[i]; uint height = texture->height[i]; + uint depth = texture->depth[i]; DEBUG_PRINTF(" LEVEL %u: at %p size[0] %u x %u\n", i, texture->start[i], texture->width[i], texture->height[i]); @@ -411,13 +420,13 @@ cmd_state_texture(const struct cell_command_texture *texture) spu.texture[unit].level[i].start = texture->start[i]; spu.texture[unit].level[i].width = width; spu.texture[unit].level[i].height = height; + spu.texture[unit].level[i].depth = depth; spu.texture[unit].level[i].tiles_per_row = (width + TILE_SIZE - 1) / TILE_SIZE; spu.texture[unit].level[i].bytes_per_image = - 4 * ((width + TILE_SIZE - 1) & ~(TILE_SIZE-1)) - * ((height + TILE_SIZE - 1) & ~(TILE_SIZE-1)); + 4 * align(width, TILE_SIZE) * align(height, TILE_SIZE) * depth; spu.texture[unit].level[i].max_s = spu_splats((int) width - 1); spu.texture[unit].level[i].max_t = spu_splats((int) height - 1); diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 8781041bff..eff43b870c 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -111,15 +111,15 @@ struct spu_framebuffer struct spu_texture_level { void *start; - ushort width, height; + ushort width, height, depth; ushort tiles_per_row; uint bytes_per_image; /** texcoord scale factors */ - vector float scale_s, scale_t; + vector float scale_s, scale_t, scale_r; /** texcoord masks (if REPEAT then size-1, else ~0) */ - vector signed int mask_s, mask_t; + vector signed int mask_s, mask_t, mask_r; /** texcoord clamp limits */ - vector signed int max_s, max_t; + vector signed int max_s, max_t, max_r; } ALIGN16_ATTRIB; diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c index 9e25094d13..42eb06a362 100644 --- a/src/gallium/drivers/cell/spu/spu_texture.c +++ b/src/gallium/drivers/cell/spu/spu_texture.c @@ -50,6 +50,8 @@ invalidate_tex_cache(void) if (spu.texture[unit].target == PIPE_TEXTURE_CUBE) bytes *= 6; + else if (spu.texture[unit].target == PIPE_TEXTURE_3D) + bytes *= spu.texture[unit].level[lvl].depth; spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes); } -- cgit v1.2.3 From 53951531ae7bfd64afae1ae55aac7f6ebd3fe4f5 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 15 Oct 2008 12:35:51 -0600 Subject: cell: propogate blend color to SPUs for the fallback fragment ops code --- src/gallium/drivers/cell/common.h | 4 ++ src/gallium/drivers/cell/ppu/cell_context.h | 1 + src/gallium/drivers/cell/ppu/cell_state_emit.c | 1 + src/gallium/drivers/cell/spu/spu_command.c | 1 + src/gallium/drivers/cell/spu/spu_main.h | 1 + src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 75 +++++++++++++++++++--- 6 files changed, 74 insertions(+), 9 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index b0169b8e32..3b5a25e165 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -118,12 +118,16 @@ /** * Command to specify per-fragment operations state and generated code. + * Note that the dsa, blend, blend_color fields are really only needed + * for the fallback/C per-pixel code. They're not used when we generate + * dynamic SPU fragment code (which is the normal case). */ struct cell_command_fragment_ops { uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_OPS */ struct pipe_depth_stencil_alpha_state dsa; struct pipe_blend_state blend; + struct pipe_blend_color blend_color; unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS]; }; diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h index 80a9b3d7e1..1fcf03c2b8 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.h +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -74,6 +74,7 @@ struct cell_fragment_shader_state struct cell_fragment_ops_key { struct pipe_blend_state blend; + struct pipe_blend_color blend_color; struct pipe_depth_stencil_alpha_state dsa; enum pipe_format color_format; enum pipe_format zs_format; diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index bb694aa107..d2427584ba 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -52,6 +52,7 @@ lookup_fragment_ops(struct cell_context *cell) */ memset(&key, 0, sizeof(key)); key.blend = *cell->blend; + key.blend_color = cell->blend_color; key.dsa = *cell->depth_stencil; if (cell->framebuffer.cbufs[0]) diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index a07b312111..b521c3aecf 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -195,6 +195,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) /* Copy state info (for fallback case only) */ memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); + memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color)); /* Parity twist! For now, always use the fallback code by default, * only switching to codegen when specifically requested. This diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index eff43b870c..ca72baea8b 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -145,6 +145,7 @@ struct spu_global struct spu_framebuffer fb; struct pipe_depth_stencil_alpha_state depth_stencil_alpha; struct pipe_blend_state blend; + struct pipe_blend_color blend_color; struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS]; struct spu_texture texture[PIPE_MAX_SAMPLERS]; struct vertex_info vertex_info; diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c index 9404704abf..f8ffc70492 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -260,7 +260,7 @@ spu_fallback_fragment_ops(uint x, uint y, } /* - * Compute Src RGB terms + * Compute Src RGB terms (fragment color * factor) */ switch (spu.blend.rgb_src_factor) { case PIPE_BLENDFACTOR_ONE: @@ -283,13 +283,33 @@ spu_fallback_fragment_ops(uint x, uint y, term1g = spu_mul(fragG, fragA); term1b = spu_mul(fragB, fragA); break; + case PIPE_BLENDFACTOR_DST_COLOR: + term1r = spu_mul(fragR, fbRGBA[0]); + term1g = spu_mul(fragG, fbRGBA[1]); + term1b = spu_mul(fragB, fbRGBA[1]); + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + term1r = spu_mul(fragR, fbRGBA[3]); + term1g = spu_mul(fragG, fbRGBA[3]); + term1b = spu_mul(fragB, fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[0])); + term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[1])); + term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[2])); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[3])); + term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[3])); + term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[3])); + break; /* XXX more cases */ default: ASSERT(0); } /* - * Compute Src Alpha term + * Compute Src Alpha term (fragment alpha * factor) */ switch (spu.blend.alpha_src_factor) { case PIPE_BLENDFACTOR_ONE: @@ -301,13 +321,23 @@ spu_fallback_fragment_ops(uint x, uint y, case PIPE_BLENDFACTOR_SRC_ALPHA: term1a = spu_mul(fragA, fragA); break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_DST_ALPHA: + term1a = spu_mul(fragA, fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_CONST_ALPHA: + term1a = spu_mul(fragR, spu_splats(spu.blend_color.color[3])); + break; /* XXX more cases */ default: ASSERT(0); } /* - * Compute Dest RGB terms + * Compute Dest RGB terms (framebuffer color * factor) */ switch (spu.blend.rgb_dst_factor) { case PIPE_BLENDFACTOR_ONE: @@ -337,17 +367,37 @@ spu_fallback_fragment_ops(uint x, uint y, term2g = spu_mul(fbRGBA[1], tmp); term2b = spu_mul(fbRGBA[2], tmp); break; - /* XXX more cases */ + case PIPE_BLENDFACTOR_DST_COLOR: + term2r = spu_mul(fbRGBA[0], fbRGBA[0]); + term2g = spu_mul(fbRGBA[1], fbRGBA[1]); + term2b = spu_mul(fbRGBA[2], fbRGBA[2]); + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + term2r = spu_mul(fbRGBA[0], fbRGBA[3]); + term2g = spu_mul(fbRGBA[1], fbRGBA[3]); + term2b = spu_mul(fbRGBA[2], fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[0])); + term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[1])); + term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[2])); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[3])); + term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[3])); + term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[3])); + break; + /* XXX more cases */ default: ASSERT(0); } /* - * Compute Dest Alpha term + * Compute Dest Alpha term (framebuffer alpha * factor) */ switch (spu.blend.alpha_dst_factor) { case PIPE_BLENDFACTOR_ONE: - term2a = fragA; + term2a = fbRGBA[3]; break; case PIPE_BLENDFACTOR_SRC_COLOR: term2a = spu_splats(0.0f); @@ -360,6 +410,16 @@ spu_fallback_fragment_ops(uint x, uint y, tmp = spu_sub(one, fragA); term2a = spu_mul(fbRGBA[3], tmp); break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_DST_ALPHA: + term2a = spu_mul(fbRGBA[3], fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_CONST_ALPHA: + term2a = spu_mul(fbRGBA[3], spu_splats(spu.blend_color.color[3])); + break; /* XXX more cases */ default: ASSERT(0); @@ -394,9 +454,7 @@ spu_fallback_fragment_ops(uint x, uint y, fragG = spu_max(term1g, term2g); fragB = spu_max(term1b, term2b); break; - /* XXX more cases */ default: - printf("unsup 0x%x\n", spu.blend.rgb_func); ASSERT(0); } @@ -419,7 +477,6 @@ spu_fallback_fragment_ops(uint x, uint y, case PIPE_BLEND_MAX: fragA = spu_max(term1a, term2a); break; - /* XXX more cases */ default: ASSERT(0); } -- cgit v1.2.3 From ddeec1ed10d6c12403fe8d30c072ea68f044db99 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 15 Oct 2008 13:55:18 -0600 Subject: cell: simplify spu debug code --- src/gallium/drivers/cell/common.h | 1 + src/gallium/drivers/cell/ppu/cell_context.c | 1 + src/gallium/drivers/cell/spu/spu_command.c | 47 +++++++++++++---------------- src/gallium/drivers/cell/spu/spu_debug.h | 9 ------ src/gallium/drivers/cell/spu/spu_main.c | 9 +----- src/gallium/drivers/cell/spu/spu_main.h | 15 +++++++-- src/gallium/drivers/cell/spu/spu_render.c | 7 +++-- 7 files changed, 41 insertions(+), 48 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 3b5a25e165..8ae78265f2 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -111,6 +111,7 @@ #define CELL_DEBUG_SYNC (1 << 2) #define CELL_DEBUG_FRAGMENT_OPS (1 << 3) #define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4) +#define CELL_DEBUG_CMD (1 << 5) /** Max instructions for doing per-fragment operations */ #define SPU_MAX_FRAGMENT_OPS_INSTS 64 diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c index b66aa9c9d9..f8d5eef3ac 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.c +++ b/src/gallium/drivers/cell/ppu/cell_context.c @@ -93,6 +93,7 @@ static const struct debug_named_value cell_debug_flags[] = { {"sync", CELL_DEBUG_SYNC}, /**< SPUs do synchronous DMA */ {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/ {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/ + {"cmd", CELL_DEBUG_CMD}, /**< SPUs dump command buffer info */ {NULL, 0} }; diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index b521c3aecf..ebbed3d1dc 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -44,7 +44,6 @@ #include "spu_tile.h" #include "spu_vertex_shader.h" #include "spu_dcache.h" -#include "spu_debug.h" #include "cell/common.h" @@ -97,7 +96,7 @@ release_buffer(uint buffer) static void cmd_clear_surface(const struct cell_command_clear_surface *clear) { - DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value); + D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value); if (clear->surface == 0) { spu.fb.color_clear_value = clear->value; @@ -165,14 +164,14 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear) #endif /* CLEAR_OPT */ - DEBUG_PRINTF("CLEAR SURF done\n"); + D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF done\n"); } static void cmd_release_verts(const struct cell_command_release_verts *release) { - DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf); + D_PRINTF(CELL_DEBUG_CMD, "RELEASE VERTS %u\n", release->vertex_buf); ASSERT(release->vertex_buf != ~0U); release_buffer(release->vertex_buf); } @@ -189,7 +188,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) { static int warned = 0; - DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n"); + D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n"); /* Copy SPU code from batch buffer to spu buffer */ memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); /* Copy state info (for fallback case only) */ @@ -229,7 +228,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) static void cmd_state_fragment_program(const struct cell_command_fragment_program *fp) { - DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n"); + D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_PROGRAM\n"); /* Copy SPU code from batch buffer to spu buffer */ memcpy(spu.fragment_program_code, fp->code, SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4); @@ -247,11 +246,11 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos) const float *constants = (const float *) &buffer[pos + 2]; uint i; - DEBUG_PRINTF("CMD_STATE_FS_CONSTANTS (%u)\n", num_const); + D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const); /* Expand each float to float[4] for SOA execution */ for (i = 0; i < num_const; i++) { - DEBUG_PRINTF(" const[%u] = %f\n", i, constants[i]); + D_PRINTF(CELL_DEBUG_CMD, " const[%u] = %f\n", i, constants[i]); spu.constants[i] = spu_splats(constants[i]); } @@ -263,7 +262,7 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos) static void cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) { - DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n", + D_PRINTF(CELL_DEBUG_CMD, "FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n", cmd->width, cmd->height, cmd->color_start, @@ -352,7 +351,7 @@ cmd_state_sampler(const struct cell_command_sampler *sampler) { uint unit = sampler->unit; - DEBUG_PRINTF("SAMPLER [%u]\n", unit); + D_PRINTF(CELL_DEBUG_CMD, "SAMPLER [%u]\n", unit); spu.sampler[unit] = sampler->state; @@ -404,9 +403,7 @@ cmd_state_texture(const struct cell_command_texture *texture) const uint unit = texture->unit; uint i; - //if (spu.init.id==0) Debug=1; - - DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit); + D_PRINTF(CELL_DEBUG_CMD, "TEXTURE [%u]\n", texture->unit); spu.texture[unit].max_level = 0; spu.texture[unit].target = texture->target; @@ -416,7 +413,7 @@ cmd_state_texture(const struct cell_command_texture *texture) uint height = texture->height[i]; uint depth = texture->depth[i]; - DEBUG_PRINTF(" LEVEL %u: at %p size[0] %u x %u\n", i, + D_PRINTF(CELL_DEBUG_CMD, " LEVEL %u: at %p size[0] %u x %u\n", i, texture->start[i], texture->width[i], texture->height[i]); spu.texture[unit].level[i].start = texture->start[i]; @@ -438,15 +435,13 @@ cmd_state_texture(const struct cell_command_texture *texture) } update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit); - - //Debug=0; } static void cmd_state_vertex_info(const struct vertex_info *vinfo) { - DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs); + D_PRINTF(CELL_DEBUG_CMD, "VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs); ASSERT(vinfo->num_attribs >= 1); ASSERT(vinfo->num_attribs <= 8); memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo)); @@ -485,7 +480,7 @@ cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code) static void cmd_finish(void) { - DEBUG_PRINTF("FINISH\n"); + D_PRINTF(CELL_DEBUG_CMD, "FINISH\n"); really_clear_tiles(0); /* wait for all outstanding DMAs to finish */ mfc_write_tag_mask(~0); @@ -510,7 +505,7 @@ cmd_batch(uint opcode) const unsigned usize = size / sizeof(buffer[0]); uint pos; - DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n", + D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n", buf, size, spu.init.buffers[buf]); ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH); @@ -530,7 +525,7 @@ cmd_batch(uint opcode) wait_on_mask(1 << TAG_BATCH_BUFFER); /* Tell PPU we're done copying the buffer to local store */ - DEBUG_PRINTF("release batch buf %u\n", buf); + D_PRINTF(CELL_DEBUG_CMD, "release batch buf %u\n", buf); release_buffer(buf); /* @@ -663,7 +658,7 @@ cmd_batch(uint opcode) } } - DEBUG_PRINTF("BATCH complete\n"); + D_PRINTF(CELL_DEBUG_CMD, "BATCH complete\n"); } @@ -677,7 +672,7 @@ command_loop(void) struct cell_command cmd; int exitFlag = 0; - DEBUG_PRINTF("Enter command loop\n"); + D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n"); ASSERT((sizeof(struct cell_command) & 0xf) == 0); ASSERT_ALIGN16(&cmd); @@ -686,12 +681,12 @@ command_loop(void) unsigned opcode; int tag = 0; - DEBUG_PRINTF("Wait for cmd...\n"); + D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n"); /* read/wait from mailbox */ opcode = (unsigned int) spu_read_in_mbox(); - DEBUG_PRINTF("got cmd 0x%x\n", opcode); + D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode); /* command payload */ mfc_get(&cmd, /* dest */ @@ -708,7 +703,7 @@ command_loop(void) switch (opcode & CELL_CMD_OPCODE_MASK) { case CELL_CMD_EXIT: - DEBUG_PRINTF("EXIT\n"); + D_PRINTF(CELL_DEBUG_CMD, "EXIT\n"); exitFlag = 1; break; case CELL_CMD_VS_EXECUTE: @@ -725,7 +720,7 @@ command_loop(void) } - DEBUG_PRINTF("Exit command loop\n"); + D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n"); spu_dcache_report(); } diff --git a/src/gallium/drivers/cell/spu/spu_debug.h b/src/gallium/drivers/cell/spu/spu_debug.h index eeec052655..25653dcdcd 100644 --- a/src/gallium/drivers/cell/spu/spu_debug.h +++ b/src/gallium/drivers/cell/spu/spu_debug.h @@ -30,28 +30,19 @@ #define SPU_DEBUG_H -/* Set to 0 to disable all extraneous debugging code */ -#define DEBUG 1 - #if DEBUG -extern boolean Debug; -extern boolean force_fragment_ops_fallback; /* These debug macros use the unusual construction ", ##__VA_ARGS__" * which expands to the expected comma + args if variadic arguments * are supplied, but swallows the comma if there are no variadic * arguments (which avoids syntax errors that would otherwise occur). */ -#define DEBUG_PRINTF(format,...) \ - if (Debug) \ - printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__) #define D_PRINTF(flag, format,...) \ if (spu.init.debug_flags & (flag)) \ printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__) #else -#define DEBUG_PRINTF(...) #define D_PRINTF(...) #endif diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 4becd0f92a..c8bb251905 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -40,7 +40,6 @@ #include "spu_per_fragment_op.h" #include "spu_texture.h" //#include "spu_test.h" -#include "spu_debug.h" #include "cell/common.h" @@ -53,12 +52,6 @@ helpful headers: struct spu_global spu; -#if DEBUG -boolean Debug = FALSE; -boolean force_fragment_ops_fallback = TRUE; -#endif - - static void one_time_init(void) { @@ -102,7 +95,7 @@ main(main_param_t speid, main_param_t argp) one_time_init(); - DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid); + D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid); D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n"); /* get initialization data */ diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index ca72baea8b..569b9e45d4 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -36,6 +36,19 @@ #include "pipe/p_state.h" +#if DEBUG +/* These debug macros use the unusual construction ", ##__VA_ARGS__" + * which expands to the expected comma + args if variadic arguments + * are supplied, but swallows the comma if there are no variadic + * arguments (which avoids syntax errors that would otherwise occur). + */ +#define D_PRINTF(flag, format,...) \ + if (spu.init.debug_flags & (flag)) \ + printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__) +#else +#define D_PRINTF(...) +#endif + #define MAX_WIDTH 1024 #define MAX_HEIGHT 1024 @@ -187,8 +200,6 @@ struct spu_global extern struct spu_global spu; -extern boolean Debug; - diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c index 82dbeb26b7..cfff19b6c0 100644 --- a/src/gallium/drivers/cell/spu/spu_render.c +++ b/src/gallium/drivers/cell/spu/spu_render.c @@ -177,7 +177,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) uint i, j; - if (Debug) { +#if 0 printf("SPU %u: RENDER prim %u, num_vert=%u num_ind=%u " "inline_vert=%u\n", spu.init.id, @@ -190,7 +190,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) printf(" bound: %g, %g .. %g, %g\n", render->xmin, render->ymin, render->xmax, render->ymax); */ - } +#endif ASSERT(sizeof(*render) % 4 == 0); ASSERT(total_vertex_bytes % 16 == 0); @@ -293,7 +293,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) spu.ztile_status[ty][tx] = spu.cur_ztile_status; } - if (Debug) +#if 0 printf("SPU %u: RENDER done\n", spu.init.id); +#endif } -- cgit v1.2.3 From 79e96b3a77f7d5c7136b380abcc675c7242d0ffe Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 15 Oct 2008 13:58:58 -0600 Subject: cell: move some CELL_MAX constants --- src/gallium/drivers/cell/common.h | 6 +++++- src/gallium/drivers/cell/spu/spu_main.h | 11 ++--------- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 8ae78265f2..d716a26175 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -68,6 +68,9 @@ #define CELL_MAX_SAMPLERS 4 #define CELL_MAX_TEXTURE_LEVELS 12 /* 2k x 2k */ +#define CELL_MAX_CONSTANTS 32 /**< number of float[4] constants */ +#define CELL_MAX_WIDTH 1024 /**< max framebuffer width */ +#define CELL_MAX_HEIGHT 1024 /**< max framebuffer width */ #define TILE_SIZE 32 @@ -99,13 +102,14 @@ #define CELL_CMD_VS_EXECUTE 22 #define CELL_CMD_FLUSH_BUFFER_RANGE 23 - +/** Command/batch buffers */ #define CELL_NUM_BUFFERS 4 #define CELL_BUFFER_SIZE (4*1024) /**< 16KB would be the max */ #define CELL_BUFFER_STATUS_FREE 10 #define CELL_BUFFER_STATUS_USED 20 +/** Debug flags */ #define CELL_DEBUG_CHECKER (1 << 0) #define CELL_DEBUG_ASM (1 << 1) #define CELL_DEBUG_SYNC (1 << 2) diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 569b9e45d4..f87495b72d 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -50,13 +50,6 @@ #endif -#define MAX_WIDTH 1024 -#define MAX_HEIGHT 1024 - - -#define CELL_MAX_CONSTANTS 32 /**< number of float[4] constants */ - - /** * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels. * The data may be addressed through several different types. @@ -175,8 +168,8 @@ struct spu_global ubyte cur_ctile_status, cur_ztile_status; /** Status of all tiles in framebuffer */ - ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; - ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; + ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; + ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; /** Current fragment ops machine code, at 8-byte boundary */ uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB; -- cgit v1.2.3 From 0eb0b0a816764a323af7a8d2b5cb6792f886ce04 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 15 Oct 2008 14:12:55 -0600 Subject: cell: remove some old, pre-batchbuffer stuff --- src/gallium/drivers/cell/common.h | 14 -------------- src/gallium/drivers/cell/ppu/cell_spu.c | 5 +---- src/gallium/drivers/cell/ppu/cell_spu.h | 3 +-- src/gallium/drivers/cell/spu/spu_command.c | 19 ------------------- 4 files changed, 2 insertions(+), 39 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index d716a26175..600f1b37a2 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -269,19 +269,6 @@ struct cell_command_texture }; -/** XXX unions don't seem to work */ -/* XXX this should go away; all commands should be placed in batch buffers */ -struct cell_command -{ -#if 0 - struct cell_command_framebuffer fb; - struct cell_command_clear_surface clear; - struct cell_command_render render; -#endif - struct cell_command_vs vs; -} ALIGN16_ATTRIB; - - #define MAX_SPU_FUNCTIONS 12 /** * Used to tell the PPU about the address of particular functions in the @@ -302,7 +289,6 @@ struct cell_init_info unsigned id; unsigned num_spus; unsigned debug_flags; /**< mask of CELL_DEBUG_x flags */ - struct cell_command *cmd; /** Buffers for command batches, vertex/index data */ ubyte *buffers[CELL_NUM_BUFFERS]; diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c index df020c4146..90745da3d2 100644 --- a/src/gallium/drivers/cell/ppu/cell_spu.c +++ b/src/gallium/drivers/cell/ppu/cell_spu.c @@ -126,9 +126,6 @@ cell_start_spus(struct cell_context *cell) assert(cell->num_spus <= MAX_SPUS); - ASSERT_ALIGN16(&cell_global.command[0]); - ASSERT_ALIGN16(&cell_global.command[1]); - ASSERT_ALIGN16(&cell_global.inits[0]); ASSERT_ALIGN16(&cell_global.inits[1]); @@ -141,7 +138,7 @@ cell_start_spus(struct cell_context *cell) cell_global.inits[i].id = i; cell_global.inits[i].num_spus = cell->num_spus; cell_global.inits[i].debug_flags = cell->debug_flags; - cell_global.inits[i].cmd = &cell_global.command[i]; + for (j = 0; j < CELL_NUM_BUFFERS; j++) { cell_global.inits[i].buffers[j] = cell->buffer[j]; } diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h index 137f26612e..3443331b01 100644 --- a/src/gallium/drivers/cell/ppu/cell_spu.h +++ b/src/gallium/drivers/cell/ppu/cell_spu.h @@ -50,10 +50,9 @@ struct cell_global_info pthread_t spe_threads[MAX_SPUS]; /** - * Data sent to SPUs + * Data sent to SPUs at start-up */ struct cell_init_info inits[MAX_SPUS]; - struct cell_command command[MAX_SPUS]; }; diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index ebbed3d1dc..4febd5385b 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -669,38 +669,19 @@ cmd_batch(uint opcode) void command_loop(void) { - struct cell_command cmd; int exitFlag = 0; D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n"); - ASSERT((sizeof(struct cell_command) & 0xf) == 0); - ASSERT_ALIGN16(&cmd); - while (!exitFlag) { unsigned opcode; - int tag = 0; D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n"); /* read/wait from mailbox */ opcode = (unsigned int) spu_read_in_mbox(); - D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode); - /* command payload */ - mfc_get(&cmd, /* dest */ - (unsigned int) spu.init.cmd, /* src */ - sizeof(struct cell_command), /* bytes */ - tag, - 0, /* tid */ - 0 /* rid */); - wait_on_mask( 1 << tag ); - - /* - * NOTE: most commands should be contained in a batch buffer - */ - switch (opcode & CELL_CMD_OPCODE_MASK) { case CELL_CMD_EXIT: D_PRINTF(CELL_DEBUG_CMD, "EXIT\n"); -- cgit v1.2.3 From 67f615681c569264eab1bc901473c86cfc54e480 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 15 Oct 2008 14:18:51 -0600 Subject: cell: use CELL_MAX_SPUS consistently. --- src/gallium/drivers/cell/common.h | 2 +- src/gallium/drivers/cell/ppu/cell_spu.c | 2 +- src/gallium/drivers/cell/ppu/cell_spu.h | 8 +++----- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 600f1b37a2..1f6f2d494b 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -64,7 +64,7 @@ #define ROUNDUP16(k) (((k) + 0xf) & ~0xf) -#define CELL_MAX_SPUS 6 +#define CELL_MAX_SPUS 8 #define CELL_MAX_SAMPLERS 4 #define CELL_MAX_TEXTURE_LEVELS 12 /* 2k x 2k */ diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c index 90745da3d2..a6e268b362 100644 --- a/src/gallium/drivers/cell/ppu/cell_spu.c +++ b/src/gallium/drivers/cell/ppu/cell_spu.c @@ -124,7 +124,7 @@ cell_start_spus(struct cell_context *cell) one_time_init = TRUE; - assert(cell->num_spus <= MAX_SPUS); + assert(cell->num_spus <= CELL_MAX_SPUS); ASSERT_ALIGN16(&cell_global.inits[0]); ASSERT_ALIGN16(&cell_global.inits[1]); diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h index 3443331b01..2e965c6301 100644 --- a/src/gallium/drivers/cell/ppu/cell_spu.h +++ b/src/gallium/drivers/cell/ppu/cell_spu.h @@ -36,8 +36,6 @@ #include "cell_context.h" -#define MAX_SPUS 8 - /** * Global vars, for now anyway. */ @@ -46,13 +44,13 @@ struct cell_global_info /** * SPU/SPE handles, etc */ - spe_context_ptr_t spe_contexts[MAX_SPUS]; - pthread_t spe_threads[MAX_SPUS]; + spe_context_ptr_t spe_contexts[CELL_MAX_SPUS]; + pthread_t spe_threads[CELL_MAX_SPUS]; /** * Data sent to SPUs at start-up */ - struct cell_init_info inits[MAX_SPUS]; + struct cell_init_info inits[CELL_MAX_SPUS]; }; -- cgit v1.2.3 From ec7d6c656178babdf143faa242f7a3df9d0bc22c Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 15 Oct 2008 14:39:16 -0600 Subject: cell: send rasterizer state to SPUs in proper way, remove front_winding hack --- src/gallium/drivers/cell/common.h | 18 ++++++++++++++---- src/gallium/drivers/cell/ppu/cell_state_emit.c | 7 +++++++ src/gallium/drivers/cell/ppu/cell_vbuf.c | 1 - src/gallium/drivers/cell/spu/spu_command.c | 8 ++++++++ src/gallium/drivers/cell/spu/spu_main.h | 1 + src/gallium/drivers/cell/spu/spu_render.c | 2 +- src/gallium/drivers/cell/spu/spu_tri.c | 4 ++-- src/gallium/drivers/cell/spu/spu_tri.h | 2 +- 8 files changed, 34 insertions(+), 9 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 1f6f2d494b..0ff2c491fb 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -99,8 +99,9 @@ #define CELL_CMD_STATE_FRAGMENT_PROGRAM 19 #define CELL_CMD_STATE_ATTRIB_FETCH 20 #define CELL_CMD_STATE_FS_CONSTANTS 21 -#define CELL_CMD_VS_EXECUTE 22 -#define CELL_CMD_FLUSH_BUFFER_RANGE 23 +#define CELL_CMD_STATE_RASTERIZER 22 +#define CELL_CMD_VS_EXECUTE 23 +#define CELL_CMD_FLUSH_BUFFER_RANGE 24 /** Command/batch buffers */ #define CELL_NUM_BUFFERS 4 @@ -156,13 +157,23 @@ struct cell_command_fragment_program */ struct cell_command_framebuffer { - uint64_t opcode; /**< CELL_CMD_FRAMEBUFFER */ + uint64_t opcode; /**< CELL_CMD_STATE_FRAMEBUFFER */ int width, height; void *color_start, *depth_start; enum pipe_format color_format, depth_format; }; +/** + * Tell SPUs about rasterizer state. + */ +struct cell_command_rasterizer +{ + uint64_t opcode; /**< CELL_CMD_STATE_RASTERIZER */ + struct pipe_rasterizer_state rasterizer; +}; + + /** * Clear framebuffer to the given value/color. */ @@ -238,7 +249,6 @@ struct cell_command_render float xmin, ymin, xmax, ymax; /* XXX another dummy field */ uint min_index; boolean inline_verts; - uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */ }; diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index d2427584ba..e6387382f2 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -147,6 +147,13 @@ cell_emit_state(struct cell_context *cell) #endif } + if (cell->dirty & (CELL_NEW_RASTERIZER)) { + struct cell_command_rasterizer *rast = + cell_batch_alloc(cell, sizeof(*rast)); + rast->opcode = CELL_CMD_STATE_RASTERIZER; + rast->rasterizer = *cell->rasterizer; + } + if (cell->dirty & (CELL_NEW_FS)) { /* Send new fragment program to SPUs */ struct cell_command_fragment_program *fp diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c index 578ddf62dc..aa63435b93 100644 --- a/src/gallium/drivers/cell/ppu/cell_vbuf.c +++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c @@ -214,7 +214,6 @@ cell_vbuf_draw(struct vbuf_render *vbr, render->opcode = CELL_CMD_RENDER; render->prim_type = cvbr->prim; - render->front_winding = cell->rasterizer->front_winding; render->num_indexes = nr_indices; render->min_index = min_index; diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index 4febd5385b..d2c282a022 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -583,6 +583,14 @@ cmd_batch(uint opcode) case CELL_CMD_STATE_FS_CONSTANTS: pos = cmd_state_fs_constants(buffer, pos); break; + case CELL_CMD_STATE_RASTERIZER: + { + struct cell_command_rasterizer *rast = + (struct cell_command_rasterizer *) &buffer[pos]; + spu.rasterizer = rast->rasterizer; + pos += sizeof(*rast) / 8; + } + break; case CELL_CMD_STATE_SAMPLER: { struct cell_command_sampler *sampler diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index f87495b72d..4099e52699 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -153,6 +153,7 @@ struct spu_global struct pipe_blend_state blend; struct pipe_blend_color blend_color; struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS]; + struct pipe_rasterizer_state rasterizer; struct spu_texture texture[PIPE_MAX_SAMPLERS]; struct vertex_info vertex_info; diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c index cfff19b6c0..75a7f75abc 100644 --- a/src/gallium/drivers/cell/spu/spu_render.c +++ b/src/gallium/drivers/cell/spu/spu_render.c @@ -279,7 +279,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) v1 = (const float *) (vertices + indexes[j+1] * vertex_size); v2 = (const float *) (vertices + indexes[j+2] * vertex_size); - drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding); + drawn += tri_draw(v0, v1, v2, tx, ty); } //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3); diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index 2417db8960..1519b8cd7e 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -775,7 +775,7 @@ determinant(const float *v0, const float *v1, const float *v2) */ boolean tri_draw(const float *v0, const float *v1, const float *v2, - uint tx, uint ty, uint front_winding) + uint tx, uint ty) { setup.tx = tx; setup.ty = ty; @@ -790,7 +790,7 @@ tri_draw(const float *v0, const float *v1, const float *v2, * which will be needed for front/back-face stencil application */ float det = determinant(v0, v1, v2); - setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW); + setup.facing = (det > 0.0) ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW); if (!setup_sort_vertices((struct vertex_header *) v0, (struct vertex_header *) v1, diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h index abc3d35160..aa694dd7c9 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.h +++ b/src/gallium/drivers/cell/spu/spu_tri.h @@ -31,7 +31,7 @@ extern boolean -tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding); +tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty); #endif /* SPU_TRI_H */ -- cgit v1.2.3 From 0116ee1d1c341726b6ed23c2dddc4515e8a34385 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 15 Oct 2008 20:46:43 -0600 Subject: cell: start some performance measurements Use the spu_write_decrementer() and spu_read_decrementer() functions to measure time. Convert to milliseconds according to the system timebase value. --- src/gallium/drivers/cell/common.h | 1 + src/gallium/drivers/cell/ppu/cell_spu.c | 31 ++++++++++++++++++++++++++++++ src/gallium/drivers/cell/spu/spu_command.c | 15 +++++++++++++++ src/gallium/drivers/cell/spu/spu_render.c | 9 ++++++++- 4 files changed, 55 insertions(+), 1 deletion(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 0ff2c491fb..469d56cda8 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -299,6 +299,7 @@ struct cell_init_info unsigned id; unsigned num_spus; unsigned debug_flags; /**< mask of CELL_DEBUG_x flags */ + float inv_timebase; /**< 1.0/timebase, for perf measurement */ /** Buffers for command batches, vertex/index data */ ubyte *buffers[CELL_NUM_BUFFERS]; diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c index a6e268b362..28e5e6d706 100644 --- a/src/gallium/drivers/cell/ppu/cell_spu.c +++ b/src/gallium/drivers/cell/ppu/cell_spu.c @@ -52,6 +52,35 @@ helpful headers: struct cell_global_info cell_global; +/** + * Scan /proc/cpuinfo to determine the timebase for the system. + * This is used by the SPUs to convert 'decrementer' ticks to seconds. + * There may be a better way to get this value... + */ +static unsigned +get_timebase(void) +{ + FILE *f = fopen("/proc/cpuinfo", "r"); + unsigned timebase; + + assert(f); + while (!feof(f)) { + char line[80]; + fgets(line, sizeof(line), f); + if (strncmp(line, "timebase", 8) == 0) { + char *colon = strchr(line, ':'); + if (colon) { + timebase = atoi(colon + 2); + break; + } + } + } + fclose(f); + + return timebase; +} + + /** * Write a 1-word message to the given SPE mailbox. */ @@ -115,6 +144,7 @@ cell_start_spus(struct cell_context *cell) { static boolean one_time_init = FALSE; uint i, j; + uint timebase = get_timebase(); if (one_time_init) { fprintf(stderr, "PPU: Multiple rendering contexts not yet supported " @@ -138,6 +168,7 @@ cell_start_spus(struct cell_context *cell) cell_global.inits[i].id = i; cell_global.inits[i].num_spus = cell->num_spus; cell_global.inits[i].debug_flags = cell->debug_flags; + cell_global.inits[i].inv_timebase = 1000.0f / timebase; for (j = 0; j < CELL_NUM_BUFFERS; j++) { cell_global.inits[i].buffers[j] = cell->buffer[j]; diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index d2c282a022..57d265fef7 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -670,6 +670,8 @@ cmd_batch(uint opcode) } +#define PERF 0 + /** * Main loop for SPEs: Get a command, execute it, repeat. @@ -678,6 +680,7 @@ void command_loop(void) { int exitFlag = 0; + uint t0, t1; D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n"); @@ -686,10 +689,16 @@ command_loop(void) D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n"); + if (PERF) + spu_write_decrementer(~0); + /* read/wait from mailbox */ opcode = (unsigned int) spu_read_in_mbox(); D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode); + if (PERF) + t0 = spu_read_decrementer(); + switch (opcode & CELL_CMD_OPCODE_MASK) { case CELL_CMD_EXIT: D_PRINTF(CELL_DEBUG_CMD, "EXIT\n"); @@ -707,6 +716,12 @@ command_loop(void) printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK); } + if (PERF) { + t1 = spu_read_decrementer(); + printf("wait mbox time: %gms batch time: %gms\n", + (~0u - t0) * spu.init.inv_timebase, + (t0 - t1) * spu.init.inv_timebase); + } } D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n"); diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c index 802455bf79..5515bb55c9 100644 --- a/src/gallium/drivers/cell/spu/spu_render.c +++ b/src/gallium/drivers/cell/spu/spu_render.c @@ -175,6 +175,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) const ubyte *vertices; const ushort *indexes; uint i, j; + uint num_tiles; D_PRINTF(CELL_DEBUG_CMD, "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n", @@ -242,6 +243,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */ + num_tiles = 0; + /** ** loop over tiles, rendering tris **/ @@ -255,6 +258,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) if (!my_tile(tx, ty)) continue; + num_tiles++; + spu.cur_ctile_status = spu.ctile_status[ty][tx]; spu.cur_ztile_status = spu.ztile_status[ty][tx]; @@ -284,5 +289,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) spu.ztile_status[ty][tx] = spu.cur_ztile_status; } - D_PRINTF(CELL_DEBUG_CMD, "RENDER done\n"); + D_PRINTF(CELL_DEBUG_CMD, + "RENDER done (%u tiles hit)\n", + num_tiles); } -- cgit v1.2.3 From 9fa8671c73fa44a95e2ea7fed6047bddb042796f Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 16 Oct 2008 20:25:28 -0600 Subject: cell: add new debug flag (cache) to report texture cache stats on exit --- src/gallium/drivers/cell/common.h | 1 + src/gallium/drivers/cell/ppu/cell_context.c | 1 + src/gallium/drivers/cell/spu/spu_command.c | 3 ++- src/gallium/drivers/cell/spu/spu_dcache.c | 4 +++- 4 files changed, 7 insertions(+), 2 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 469d56cda8..9ca4e9d67e 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -117,6 +117,7 @@ #define CELL_DEBUG_FRAGMENT_OPS (1 << 3) #define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4) #define CELL_DEBUG_CMD (1 << 5) +#define CELL_DEBUG_CACHE (1 << 6) /** Max instructions for doing per-fragment operations */ #define SPU_MAX_FRAGMENT_OPS_INSTS 64 diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c index 4dad490ce1..7a2d93ecb4 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.c +++ b/src/gallium/drivers/cell/ppu/cell_context.c @@ -94,6 +94,7 @@ static const struct debug_named_value cell_debug_flags[] = { {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/ {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/ {"cmd", CELL_DEBUG_CMD}, /**< SPUs dump command buffer info */ + {"cache", CELL_DEBUG_CACHE}, /**< report texture cache stats on exit */ {NULL, 0} }; diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index ff4a52d79a..9c853c0961 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -720,5 +720,6 @@ command_loop(void) D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n"); - spu_dcache_report(); + if (spu.init.debug_flags & CELL_DEBUG_CACHE) + spu_dcache_report(); } diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c index 167404cdc5..a6d67634fd 100644 --- a/src/gallium/drivers/cell/spu/spu_dcache.c +++ b/src/gallium/drivers/cell/spu/spu_dcache.c @@ -36,7 +36,9 @@ #define CACHE_SET_TAGID(set) (((set) & 0x03) + TAG_DCACHE0) #define CACHE_LOG2NNWAY 2 #define CACHE_LOG2NSETS 6 -/*#define CACHE_STATS 1*/ +#ifdef DEBUG +#define CACHE_STATS 1 +#endif #include /* Yes folks, this is ugly. -- cgit v1.2.3 From 70dd4379d2cd54f229c3940312537912470218d3 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 22 Oct 2008 10:34:13 -0600 Subject: cell: implement fencing for texture buffers If we delete a texture, we need to keep the underlying tiled data buffer around until any rendering that references it has completed. Keep a list of buffers referenced by a rendering batch. Unref/free them when the associated batch's fence is executed/signalled. --- src/gallium/drivers/cell/common.h | 25 ++++ src/gallium/drivers/cell/ppu/Makefile | 1 + src/gallium/drivers/cell/ppu/cell_batch.c | 32 +++++ src/gallium/drivers/cell/ppu/cell_context.c | 6 + src/gallium/drivers/cell/ppu/cell_context.h | 21 ++++ src/gallium/drivers/cell/ppu/cell_fence.c | 158 +++++++++++++++++++++++++ src/gallium/drivers/cell/ppu/cell_fence.h | 57 +++++++++ src/gallium/drivers/cell/ppu/cell_state_emit.c | 2 +- src/gallium/drivers/cell/ppu/cell_texture.c | 33 ++++-- src/gallium/drivers/cell/ppu/cell_texture.h | 5 +- src/gallium/drivers/cell/ppu/cell_vbuf.c | 6 + src/gallium/drivers/cell/spu/spu_command.c | 38 +++++- src/gallium/drivers/cell/spu/spu_main.h | 2 +- 13 files changed, 367 insertions(+), 19 deletions(-) create mode 100644 src/gallium/drivers/cell/ppu/cell_fence.c create mode 100644 src/gallium/drivers/cell/ppu/cell_fence.h (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 9ca4e9d67e..23fb0b0831 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -102,6 +102,8 @@ #define CELL_CMD_STATE_RASTERIZER 22 #define CELL_CMD_VS_EXECUTE 23 #define CELL_CMD_FLUSH_BUFFER_RANGE 24 +#define CELL_CMD_FENCE 25 + /** Command/batch buffers */ #define CELL_NUM_BUFFERS 4 @@ -123,6 +125,29 @@ #define SPU_MAX_FRAGMENT_OPS_INSTS 64 + +#define CELL_FENCE_IDLE 0 +#define CELL_FENCE_EMITTED 1 +#define CELL_FENCE_SIGNALLED 2 + +struct cell_fence +{ + /** There's a 16-byte status qword per SPU */ + volatile uint status[CELL_MAX_SPUS][4]; +}; + + +/** + * Fence command sent to SPUs. In response, the SPUs will write + * CELL_FENCE_STATUS_SIGNALLED back to the fence status word in main memory. + */ +struct cell_command_fence +{ + uint64_t opcode; /**< CELL_CMD_FENCE */ + struct cell_fence *fence; +}; + + /** * Command to specify per-fragment operations state and generated code. * Note that the dsa, blend, blend_color fields are really only needed diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile index b28f4c5c31..9358a47284 100644 --- a/src/gallium/drivers/cell/ppu/Makefile +++ b/src/gallium/drivers/cell/ppu/Makefile @@ -24,6 +24,7 @@ SOURCES = \ cell_clear.c \ cell_context.c \ cell_draw_arrays.c \ + cell_fence.c \ cell_flush.c \ cell_gen_fragment.c \ cell_gen_fp.c \ diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c index 01254aed60..448b723d85 100644 --- a/src/gallium/drivers/cell/ppu/cell_batch.c +++ b/src/gallium/drivers/cell/ppu/cell_batch.c @@ -28,6 +28,7 @@ #include "cell_context.h" #include "cell_batch.h" +#include "cell_fence.h" #include "cell_spu.h" @@ -63,6 +64,10 @@ cell_get_empty_buffer(struct cell_context *cell) printf("PPU: ALLOC BUFFER %u, %u tries\n", buf, tries); */ prev_buffer = buf; + + /* release tex buffer associated w/ prev use of this batch buf */ + cell_free_fenced_buffers(cell, &cell->fenced_buffers[buf]); + return buf; } } @@ -84,6 +89,26 @@ cell_get_empty_buffer(struct cell_context *cell) } +/** + * Append a fence command to the current batch buffer. + * Note that we're sure there's always room for this because of the + * adjusted size check in cell_batch_free_space(). + */ +static void +emit_fence(struct cell_context *cell) +{ + const uint batch = cell->cur_batch; + const uint size = cell->buffer_size[batch]; + struct cell_command_fence *fence_cmd; + + ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE); + + fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size); + fence_cmd->opcode = CELL_CMD_FENCE; + fence_cmd->fence = &cell->fenced_buffers[batch].fence; +} + + /** * Flush the current batch buffer to the SPUs. * An empty buffer will be found and set as the new current batch buffer @@ -102,6 +127,12 @@ cell_batch_flush(struct cell_context *cell) if (size == 0) return; + /* Before we use this batch buffer, make sure any fenced texture buffers + * are released. + */ + if (cell->fenced_buffers[batch].head) + emit_fence(cell); + flushing = TRUE; assert(batch < CELL_NUM_BUFFERS); @@ -142,6 +173,7 @@ uint cell_batch_free_space(const struct cell_context *cell) { uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch]; + free -= sizeof(struct cell_command_fence); return free; } diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c index 7a2d93ecb4..22d552d8e3 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.c +++ b/src/gallium/drivers/cell/ppu/cell_context.c @@ -47,6 +47,7 @@ #include "cell_clear.h" #include "cell_context.h" #include "cell_draw_arrays.h" +#include "cell_fence.h" #include "cell_flush.h" #include "cell_state.h" #include "cell_surface.h" @@ -104,6 +105,7 @@ cell_create_context(struct pipe_screen *screen, struct cell_winsys *cws) { struct cell_context *cell; + uint i; /* some fields need to be 16-byte aligned, so align the whole object */ cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16); @@ -151,6 +153,10 @@ cell_create_context(struct pipe_screen *screen, cell_debug_flags, 0 ); + for (i = 0; i < CELL_NUM_BUFFERS; i++) + cell_fence_init(&cell->fenced_buffers[i].fence); + + /* * SPU stuff */ diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h index ad1f4829a4..4491ae8cdf 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.h +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -81,6 +81,19 @@ struct cell_fragment_ops_key }; +struct cell_buffer_node; + +/** + * Fenced buffer list. List of buffers which can be unreferenced after + * the fence has been executed/signalled. + */ +struct cell_buffer_list +{ + struct cell_fence fence; + struct cell_buffer_node *head; +}; + + /** * Per-context state, subclass of pipe_context. */ @@ -154,6 +167,14 @@ struct cell_context uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB; + /** Associated with each command/batch buffer is a list of pipe_buffers + * that are fenced. When the last command in a buffer is executed, the + * fence will be signalled, indicating that any pipe_buffers preceeding + * that fence can be unreferenced (and probably freed). + */ + struct cell_buffer_list fenced_buffers[CELL_NUM_BUFFERS]; + + struct spe_function attrib_fetch; unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS]; diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c new file mode 100644 index 0000000000..ffb3bea12b --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_fence.c @@ -0,0 +1,158 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_fence.h" +#include "cell_texture.h" + + +void +cell_fence_init(struct cell_fence *fence) +{ + uint i; + for (i = 0; i < CELL_MAX_SPUS; i++) { + fence->status[i][0] = CELL_FENCE_IDLE; + } +} + + +boolean +cell_fence_signalled(const struct cell_context *cell, + const struct cell_fence *fence) +{ + uint i; + for (i = 0; i < cell->num_spus; i++) { + //ASSERT(fence->status[i][0] != CELL_FENCE_IDLE); + if (fence->status[i][0] == CELL_FENCE_EMITTED) + return FALSE; + } + return TRUE; +} + + +void +cell_fence_finish(const struct cell_context *cell, + const struct cell_fence *fence) +{ + while (!cell_fence_signalled(cell, fence)) { + usleep(10); + } +} + + + + +struct cell_buffer_node +{ + struct pipe_buffer *buffer; + struct cell_buffer_node *next; +}; + + +static void +cell_add_buffer_to_list(struct cell_context *cell, + struct cell_buffer_list *list, + struct pipe_buffer *buffer) +{ + struct pipe_screen *ps = cell->pipe.screen; + struct cell_buffer_node *node = CALLOC_STRUCT(cell_buffer_node); + /* create new list node which references the buffer, insert at head */ + if (node) { + pipe_buffer_reference(ps, &node->buffer, buffer); + node->next = list->head; + list->head = node; + } +} + + +/** + * Wait for completion of the given fence, then unreference any buffers + * on the list. + * This typically unrefs/frees texture buffers after any rendering which uses + * them has completed. + */ +void +cell_free_fenced_buffers(struct cell_context *cell, + struct cell_buffer_list *list) +{ + if (list->head) { + struct pipe_screen *ps = cell->pipe.screen; + struct cell_buffer_node *node; + + cell_fence_finish(cell, &list->fence); + + /* traverse the list, unreferencing buffers, freeing nodes */ + node = list->head; + while (node) { + struct cell_buffer_node *next = node->next; + assert(node->buffer); + pipe_buffer_unmap(ps, node->buffer); +#if 0 + printf("Unref buffer %p\n", node->buffer); + if (node->buffer->refcount == 1) + printf(" Delete!\n"); +#endif + pipe_buffer_reference(ps, &node->buffer, NULL); + FREE(node); + node = next; + } + list->head = NULL; + } +} + + +/** + * This should be called for each render command. + * Any texture buffers that are current bound will be added to a fenced + * list to be freed later when the fence is executed/signalled. + */ +void +cell_add_fenced_textures(struct cell_context *cell) +{ + struct cell_buffer_list *list = &cell->fenced_buffers[cell->cur_batch]; + uint i; + + for (i = 0; i < cell->num_textures; i++) { + struct cell_texture *ct = cell->texture[i]; + if (ct) { + uint level; + for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) { + if (ct->tiled_buffer[level]) { +#if 0 + printf("Adding texture %p buffer %p to list\n", + ct, ct->tiled_buffer[level]); +#endif + cell_add_buffer_to_list(cell, list, ct->tiled_buffer[level]); + } + } + } + } +} diff --git a/src/gallium/drivers/cell/ppu/cell_fence.h b/src/gallium/drivers/cell/ppu/cell_fence.h new file mode 100644 index 0000000000..536b4ba411 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_fence.h @@ -0,0 +1,57 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_FENCE_H +#define CELL_FENCE_H + + +extern void +cell_fence_init(struct cell_fence *fence); + + +extern boolean +cell_fence_signalled(const struct cell_context *cell, + const struct cell_fence *fence); + + +extern void +cell_fence_finish(const struct cell_context *cell, + const struct cell_fence *fence); + + + +extern void +cell_free_fenced_buffers(struct cell_context *cell, + struct cell_buffer_list *list); + + +extern void +cell_add_fenced_textures(struct cell_context *cell); + + +#endif /* CELL_FENCE_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index effcd2a1e1..dd2d7f7d1e 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -225,7 +225,7 @@ cell_emit_state(struct cell_context *cell) if (cell->texture[i]) { uint level; for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) { - texture->start[level] = cell->texture[i]->tiled_data[level]; + texture->start[level] = cell->texture[i]->tiled_mapped[level]; texture->width[level] = cell->texture[i]->base.width[level]; texture->height[level] = cell->texture[i]->base.height[level]; texture->depth[level] = cell->texture[i]->base.depth[level]; diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c index 9c6741f1bc..9ac2f3bbb9 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.c +++ b/src/gallium/drivers/cell/ppu/cell_texture.c @@ -136,6 +136,9 @@ cell_texture_release(struct pipe_screen *screen, __FUNCTION__, (void *) *pt, (*pt)->refcount - 1); */ if (--(*pt)->refcount <= 0) { + /* Delete this texture now. + * But note that the underlying pipe_buffer may linger... + */ struct cell_texture *ct = cell_texture(*pt); uint i; @@ -146,14 +149,12 @@ cell_texture_release(struct pipe_screen *screen, pipe_buffer_reference(screen, &ct->buffer, NULL); for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { - if (ct->tiled_data[i]) { - /* XXX need to use a fenced buffer for tiled data so that - * it's properly freed after rendering has completed. - * Disabling this free() allows glDrawPixels to work for now. - */ -#if 0 - align_free(ct->tiled_data[i]); -#endif + /* Unreference the tiled image buffer. + * It may not actually be deleted until a fence is hit. + */ + if (ct->tiled_buffer[i]) { + ct->tiled_mapped[i] = NULL; + winsys_buffer_reference(screen->winsys, &ct->tiled_buffer[i], NULL); } } @@ -234,12 +235,18 @@ cell_twiddle_texture(struct pipe_screen *screen, int offset = bufWidth * bufHeight * 4 * surface->face; uint *dst; - if (!ct->tiled_data[level]) { - ct->tiled_data[level] = - align_malloc(bufWidth * bufHeight * 4 * numFaces, 16); + if (!ct->tiled_buffer[level]) { + /* allocate buffer for tiled data now */ + struct pipe_winsys *ws = screen->winsys; + uint bytes = bufWidth * bufHeight * 4 * numFaces; + ct->tiled_buffer[level] = ws->buffer_create(ws, 16, + PIPE_BUFFER_USAGE_PIXEL, + bytes); + /* and map it */ + ct->tiled_mapped[level] = ws->buffer_map(ws, ct->tiled_buffer[level], + PIPE_BUFFER_USAGE_GPU_READ); } - - dst = (uint *) ((ubyte *) ct->tiled_data[level] + offset); + dst = (uint *) ((ubyte *) ct->tiled_mapped[level] + offset); twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst, surface->stride, src); diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h index a0757091b0..2f5fe0dd1b 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.h +++ b/src/gallium/drivers/cell/ppu/cell_texture.h @@ -48,7 +48,10 @@ struct cell_texture struct pipe_buffer *buffer; unsigned long buffer_size; - void *tiled_data[CELL_MAX_TEXTURE_LEVELS]; /* XXX this may be temporary */ /*ALIGN16*/ + /** Texture data in tiled layout is held here */ + struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS]; + /** Mapped, tiled texture data */ + void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS]; }; diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c index aa63435b93..65ba51b6bb 100644 --- a/src/gallium/drivers/cell/ppu/cell_vbuf.c +++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c @@ -38,6 +38,7 @@ #include "cell_batch.h" #include "cell_context.h" +#include "cell_fence.h" #include "cell_flush.h" #include "cell_spu.h" #include "cell_vbuf.h" @@ -108,6 +109,11 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, __FUNCTION__, cvbr->vertex_buf, vertices_used); */ + /* Make sure texture buffers aren't released until we're done rendering + * with them. + */ + cell_add_fenced_textures(cell); + /* Tell SPUs they can release the vert buf */ if (cvbr->vertex_buf != ~0U) { struct cell_command_release_verts *release diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index 9c853c0961..a6ed29ea63 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -76,9 +76,10 @@ static void release_buffer(uint buffer) { /* Evidently, using less than a 16-byte status doesn't work reliably */ - static const uint status[4] ALIGN16_ATTRIB - = {CELL_BUFFER_STATUS_FREE, 0, 0, 0}; - + static const vector unsigned int status = {CELL_BUFFER_STATUS_FREE, + CELL_BUFFER_STATUS_FREE, + CELL_BUFFER_STATUS_FREE, + CELL_BUFFER_STATUS_FREE}; const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer); uint *dst = spu.init.buffer_status + index; @@ -93,6 +94,29 @@ release_buffer(uint buffer) } +/** + * Write CELL_FENCE_SIGNALLED back to the fence status qword in main memory. + * There's a qword of status per SPU. + */ +static void +cmd_fence(struct cell_command_fence *fence_cmd) +{ + static const vector unsigned int status = {CELL_FENCE_SIGNALLED, + CELL_FENCE_SIGNALLED, + CELL_FENCE_SIGNALLED, + CELL_FENCE_SIGNALLED}; + uint *dst = (uint *) fence_cmd->fence; + dst += 4 * spu.init.id; /* main store/memory address, not local store */ + + mfc_put((void *) &status, /* src in local memory */ + (unsigned int) dst, /* dst in main memory */ + sizeof(status), /* size */ + TAG_FENCE, /* tag */ + 0, /* tid */ + 0 /* rid */); +} + + static void cmd_clear_surface(const struct cell_command_clear_surface *clear) { @@ -637,6 +661,14 @@ cmd_batch(uint opcode) cmd_finish(); pos += 1; break; + case CELL_CMD_FENCE: + { + struct cell_command_fence *fence_cmd = + (struct cell_command_fence *) &buffer[pos]; + cmd_fence(fence_cmd); + pos += sizeof(*fence_cmd) / 8; + } + break; case CELL_CMD_RELEASE_VERTS: { struct cell_command_release_verts *release diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 95ef4c9244..668af10be2 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -210,7 +210,7 @@ extern struct spu_global spu; #define TAG_DCACHE1 21 #define TAG_DCACHE2 22 #define TAG_DCACHE3 23 - +#define TAG_FENCE 24 static INLINE void -- cgit v1.2.3 From b493fdd7e333b9a94176a603009643326a538690 Mon Sep 17 00:00:00 2001 From: Robert Ellison Date: Fri, 7 Nov 2008 11:29:07 -0700 Subject: CELL: fix several stencil problems This small set of changes repairs several different stenciling problems; now redbook/stencil also runs correctly (and maybe others - I haven't checked everything yet). - The number of instructions that had been allocated for fragment ops used to be 64 (in cell/common.h). With complicated stencil use, we managed to get up to 93, which caused a segfault before we noticed we'd overran our memory buffer. It's now been bumped to 128, which should be enough for even complicated stencil and fragment op usage. - The status of cell surfaces never changed beyond the initial PIPE_SURFACE_STATUS_UNDEFINED. When a user called glClear() to clear just the Z buffer (but not the stencil buffer), this caused the check_clear_depth_with_quad() function to return false (because the surface status was believed to be undefined), and so the device was instructed to clear the whole buffer (including the stencil buffer), instead of correctly using a quad to clear just the depth, leaving the stencil alone. This has been fixed similarly to the way the i915 driver handles the surface status: during cell_clear_surface(), the status is set to PIPE_SURFACE_STATUS_DEFINED. Then a partial buffer clear is handled with a quad, as expected. Note that we are *not* using PIPE_SURFACE_STATUS_CLEAR (also similar to the i915); technically, we should be setting the surface status to CLEAR on a clear, and to DEFINED when we actually draw something (say on cell_vbuf_draw()), but it's difficult to figure out exactly which surfaces are affected by a cell_vbuf_draw(), so for now we're doing the easy thing. - The fragment ops handling was very clever about only pulling out the parts of the Z/stencil buffer that it needed for calculations; but this failed when only part of the buffer was written, because the part that was never pulled out was inadvertently cleared. Now all the data from the combined Z/stencil buffer is pulled out, just so the proper values can be recombined later and written back to the buffer correctly. As a bonus, the fragment op code generation is simplified. --- src/gallium/drivers/cell/common.h | 2 +- src/gallium/drivers/cell/ppu/cell_clear.c | 13 ++ src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 153 ++++++++++------------- 3 files changed, 80 insertions(+), 88 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 23fb0b0831..87488ea2d7 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -122,7 +122,7 @@ #define CELL_DEBUG_CACHE (1 << 6) /** Max instructions for doing per-fragment operations */ -#define SPU_MAX_FRAGMENT_OPS_INSTS 64 +#define SPU_MAX_FRAGMENT_OPS_INSTS 128 diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c index c9c0c721bb..037635e466 100644 --- a/src/gallium/drivers/cell/ppu/cell_clear.c +++ b/src/gallium/drivers/cell/ppu/cell_clear.c @@ -106,4 +106,17 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps, clr->surface = surfIndex; clr->value = clearValue; } + + /* Technically, the surface's contents are now known and cleared, + * so we could set the status to PIPE_SURFACE_STATUS_CLEAR. But + * it turns out it's quite painful to recognize when any particular + * surface goes from PIPE_SURFACE_STATUS_CLEAR to + * PIPE_SURFACE_STATUS_DEFINED (i.e. with known contents), because + * the drawing commands could be operating on numerous draw buffers, + * which we'd have to iterate through to set all their stati... + * For now, we cheat a bit and set the surface's status to DEFINED + * right here. Later we should revisit this and set the status to + * CLEAR here, and find a better place to set the status to DEFINED. + */ + ps->status = PIPE_SURFACE_STATUS_DEFINED; } diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 6e2a5d2980..d9c3ff3f4d 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -1997,81 +1997,79 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) * Z and/or stencil. We'll also convert the incoming fragment Z * value in fragZ_reg from a floating point value in [0.0..1.0] to * an unsigned integer value with the appropriate resolution. + * Note that even if depth or stencil is *not* enabled, if it's + * present in the buffer, we pull it out and put it back later; + * otherwise, we can inadvertently destroy the contents of + * buffers we're not supposed to touch (e.g., if the user is + * clearing the depth buffer but not the stencil buffer, a + * quad of constant depth is drawn over the surface; the stencil + * buffer must be maintained). */ switch(zs_format) { case PIPE_FORMAT_S8Z24_UNORM: /* fall through */ case PIPE_FORMAT_X8Z24_UNORM: - if (dsa->depth.enabled) { - /* We need the Z part at least */ - setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); - /* four 24-bit Z values in the low-order bits */ - spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 24-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - spe_rotmi(f, fragZ_reg, fragZ_reg, -8); - } - if (dsa->stencil[0].enabled) { - setup_optional_register(f, &fbS_reg_set, &fbS_reg); - /* four 8-bit Z values in the high-order bits */ - spe_rotmi(f, fbS_reg, fbZS_reg, -24); - } - break; + /* Pull out both Z and stencil */ + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + setup_optional_register(f, &fbS_reg_set, &fbS_reg); + + /* four 24-bit Z values in the low-order bits */ + spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 24-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + + /* four 8-bit stencil values in the high-order bits */ + spe_rotmi(f, fbS_reg, fbZS_reg, -24); + break; case PIPE_FORMAT_Z24S8_UNORM: /* fall through */ case PIPE_FORMAT_Z24X8_UNORM: - if (dsa->depth.enabled) { - setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); - /* shift by 8 to get the upper 24-bit values */ - spe_rotmi(f, fbS_reg, fbZS_reg, -8); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 24-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - spe_rotmi(f, fragZ_reg, fragZ_reg, -8); - } - if (dsa->stencil[0].enabled) { - setup_optional_register(f, &fbS_reg_set, &fbS_reg); - /* 8-bit stencil in the low-order bits - mask them out */ - spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff); - } - break; + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + setup_optional_register(f, &fbS_reg_set, &fbS_reg); + + /* shift by 8 to get the upper 24-bit values */ + spe_rotmi(f, fbS_reg, fbZS_reg, -8); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 24-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + + /* 8-bit stencil in the low-order bits - mask them out */ + spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff); + break; case PIPE_FORMAT_Z32_UNORM: - if (dsa->depth.enabled) { - setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); - /* Copy over 4 32-bit values */ - spe_move(f, fbZ_reg, fbZS_reg); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 32-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - } + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + /* Copy over 4 32-bit values */ + spe_move(f, fbZ_reg, fbZS_reg); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 32-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); /* No stencil, so can't do anything there */ - break; + break; case PIPE_FORMAT_Z16_UNORM: - if (dsa->depth.enabled) { - /* XXX Not sure this is correct, but it was here before, so we're - * going with it for now - */ - setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); - /* Copy over 4 32-bit values */ - spe_move(f, fbZ_reg, fbZS_reg); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 16-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - spe_rotmi(f, fragZ_reg, fragZ_reg, -16); - } + /* XXX Not sure this is correct, but it was here before, so we're + * going with it for now + */ + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + /* Copy over 4 32-bit values */ + spe_move(f, fbZ_reg, fbZS_reg); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 16-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -16); /* No stencil */ - break; default: ASSERT(0); /* invalid format */ @@ -2118,39 +2116,19 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_comment(f, 0, "Store quad's depth/stencil values in tile"); if (zs_format == PIPE_FORMAT_S8Z24_UNORM || zs_format == PIPE_FORMAT_X8Z24_UNORM) { - if (fbS_reg_set && fbZ_reg_set) { - spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ - spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ - } - else if (fbS_reg_set) { - spe_shli(f, fbZS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ - } - else { - spe_move(f, fbZS_reg, fbZ_reg); - } + spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ } else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || zs_format == PIPE_FORMAT_Z24X8_UNORM) { - if (fbS_reg_set && fbZ_reg_set) { - spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ - spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ - } - else if (fbS_reg_set) { - spe_move(f, fbZS_reg, fbS_reg); - } - else { - spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ - } + spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ } else if (zs_format == PIPE_FORMAT_Z32_UNORM) { - if (fbZ_reg_set) { - spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ - } + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ } else if (zs_format == PIPE_FORMAT_Z16_UNORM) { - if (fbZ_reg_set) { - spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ - } + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ } else if (zs_format == PIPE_FORMAT_S8_UNORM) { ASSERT(0); /* XXX to do */ @@ -2163,6 +2141,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); } + /* Don't need these any more */ release_optional_register(f, &fbZ_reg_set, fbZ_reg); release_optional_register(f, &fbS_reg_set, fbS_reg); } -- cgit v1.2.3 From 90027f85786406133a5180998a75fb612b6a221e Mon Sep 17 00:00:00 2001 From: Robert Ellison Date: Tue, 11 Nov 2008 13:57:10 -0700 Subject: CELL: two-sided stencil fixes With these changes, the tests/stencil_twoside test now works. - Eliminate blending from the stencil_twoside test, as it produces an unneeded dependency on having blending working - The spe_splat() function will now work if the register being splatted and the destination register are the same - Separate fragment code generated for front-facing and back-facing fragments. Often these are the same; if two-sided stenciling is on, they can be different. This is easier and faster than generating code that does both tests and merges the results. - Fixed a cut/paste bug where if the back Z-pass stencil operation were different from all the other operations, the back Z-fail results were incorrect. --- progs/tests/stencil_twoside.c | 2 - src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 7 +- src/gallium/drivers/cell/common.h | 6 +- src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 239 ++++++--------------- src/gallium/drivers/cell/ppu/cell_gen_fragment.h | 2 +- src/gallium/drivers/cell/ppu/cell_state_emit.c | 19 +- src/gallium/drivers/cell/spu/spu_command.c | 6 +- src/gallium/drivers/cell/spu/spu_main.c | 6 +- src/gallium/drivers/cell/spu/spu_main.h | 10 +- src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 3 +- src/gallium/drivers/cell/spu/spu_per_fragment_op.h | 3 +- src/gallium/drivers/cell/spu/spu_tri.c | 20 +- 12 files changed, 115 insertions(+), 208 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/progs/tests/stencil_twoside.c b/progs/tests/stencil_twoside.c index be9d9a776a..8826c46fc2 100644 --- a/progs/tests/stencil_twoside.c +++ b/progs/tests/stencil_twoside.c @@ -115,7 +115,6 @@ static void Display( void ) glVertex2f(-1, 1); glEnd(); - if (use20syntax) { stencil_func_separate(GL_FRONT, GL_ALWAYS, 0, ~0); stencil_func_separate(GL_BACK, GL_ALWAYS, 0, ~0); @@ -279,7 +278,6 @@ static void Init( void ) stencil_op_separate = glutGetProcAddress( "glStencilOpSeparate" ); printf("\nAll 5 squares should be the same color.\n"); - glEnable( GL_BLEND ); } diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c index f8568f690b..1bd9f1c8dd 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c @@ -958,9 +958,12 @@ spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsig void spe_splat(struct spe_function *p, unsigned rT, unsigned rA) { + /* Use a temporary, just in case rT == rA */ + unsigned int tmp_reg = spe_allocate_available_register(p); /* Duplicate bytes 0, 1, 2, and 3 across the whole register */ - spe_ila(p, rT, 0x00010203); - spe_shufb(p, rT, rA, rA, rT); + spe_ila(p, tmp_reg, 0x00010203); + spe_shufb(p, rT, rA, rA, tmp_reg); + spe_release_register(p, tmp_reg); } diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 87488ea2d7..a670ed3c6e 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -130,6 +130,9 @@ #define CELL_FENCE_EMITTED 1 #define CELL_FENCE_SIGNALLED 2 +#define CELL_FACING_FRONT 0 +#define CELL_FACING_BACK 1 + struct cell_fence { /** There's a 16-byte status qword per SPU */ @@ -160,7 +163,8 @@ struct cell_command_fragment_ops struct pipe_depth_stencil_alpha_state dsa; struct pipe_blend_state blend; struct pipe_blend_color blend_color; - unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS]; + unsigned code_front[SPU_MAX_FRAGMENT_OPS_INSTS]; + unsigned code_back[SPU_MAX_FRAGMENT_OPS_INSTS]; }; diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index d9c3ff3f4d..6e425eafaa 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -1412,144 +1412,72 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op, * and released by the corresponding spe_release_register_set() call. */ static void -gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa, +gen_get_stencil_values(struct spe_function *f, const struct pipe_stencil_state *stencil, + const unsigned int depth_enabled, unsigned int fbS_reg, unsigned int *fail_reg, unsigned int *zfail_reg, - unsigned int *zpass_reg, unsigned int *back_fail_reg, - unsigned int *back_zfail_reg, unsigned int *back_zpass_reg) + unsigned int *zpass_reg) { - unsigned zfail_op, back_zfail_op; + unsigned zfail_op; /* Stenciling had better be enabled here */ - ASSERT(dsa->stencil[0].enabled); + ASSERT(stencil->enabled); /* If the depth test is not enabled, it is treated as though it always - * passes. In particular, that means that the "zfail_op" (and the backfacing - * counterpart, if active) are not considered - a failing stencil test will - * trigger the "fail_op", and a passing stencil test will trigger the - * "zpass_op". + * passes, which means that the zfail_op is not considered - a + * failing stencil test triggers the fail_op, and a passing one + * triggers the zpass_op * - * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP, - * we keep them from being calculated. + * As an optimization, override calculation of the zfail_op values + * if they aren't going to be used. By setting the value of + * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed + * to match the incoming stencil values, and no calculation will + * be done. */ - if (dsa->depth.enabled) { - zfail_op = dsa->stencil[0].zfail_op; - back_zfail_op = dsa->stencil[1].zfail_op; + if (depth_enabled) { + zfail_op = stencil->zfail_op; } else { zfail_op = PIPE_STENCIL_OP_KEEP; - back_zfail_op = PIPE_STENCIL_OP_KEEP; } /* One-sided or front-facing stencil */ - if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) { + if (stencil->fail_op == PIPE_STENCIL_OP_KEEP) { *fail_reg = fbS_reg; } else { *fail_reg = spe_allocate_available_register(f); - gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value, + gen_stencil_values(f, stencil->fail_op, stencil->ref_value, 0xff, fbS_reg, *fail_reg); } + /* Check the possibly overridden value, not the structure value */ if (zfail_op == PIPE_STENCIL_OP_KEEP) { *zfail_reg = fbS_reg; } - else if (zfail_op == dsa->stencil[0].fail_op) { + else if (zfail_op == stencil->fail_op) { *zfail_reg = *fail_reg; } else { *zfail_reg = spe_allocate_available_register(f); - gen_stencil_values(f, dsa->stencil[0].zfail_op, dsa->stencil[0].ref_value, + gen_stencil_values(f, stencil->zfail_op, stencil->ref_value, 0xff, fbS_reg, *zfail_reg); } - if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) { + if (stencil->zpass_op == PIPE_STENCIL_OP_KEEP) { *zpass_reg = fbS_reg; } - else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) { + else if (stencil->zpass_op == stencil->fail_op) { *zpass_reg = *fail_reg; } - else if (dsa->stencil[0].zpass_op == zfail_op) { + else if (stencil->zpass_op == zfail_op) { *zpass_reg = *zfail_reg; } else { *zpass_reg = spe_allocate_available_register(f); - gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value, + gen_stencil_values(f, stencil->zpass_op, stencil->ref_value, 0xff, fbS_reg, *zpass_reg); } - - /* If two-sided stencil is enabled, we have more work to do. */ - if (!dsa->stencil[1].enabled) { - /* This just flags that the registers need not be deallocated later */ - *back_fail_reg = fbS_reg; - *back_zfail_reg = fbS_reg; - *back_zpass_reg = fbS_reg; - } - else { - /* Same calculations as above, but for the back stencil */ - if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) { - *back_fail_reg = fbS_reg; - } - else if (dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) { - *back_fail_reg = *fail_reg; - } - else if (dsa->stencil[1].fail_op == zfail_op) { - *back_fail_reg = *zfail_reg; - } - else if (dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) { - *back_fail_reg = *zpass_reg; - } - else { - *back_fail_reg = spe_allocate_available_register(f); - gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value, - 0xff, fbS_reg, *back_fail_reg); - } - - if (back_zfail_op == PIPE_STENCIL_OP_KEEP) { - *back_zfail_reg = fbS_reg; - } - else if (back_zfail_op == dsa->stencil[0].fail_op) { - *back_zfail_reg = *fail_reg; - } - else if (back_zfail_op == zfail_op) { - *back_zfail_reg = *zfail_reg; - } - else if (back_zfail_op == dsa->stencil[0].zpass_op) { - *back_zfail_reg = *zpass_reg; - } - else if (back_zfail_op == dsa->stencil[1].fail_op) { - *back_zfail_reg = *back_fail_reg; - } - else { - *back_zfail_reg = spe_allocate_available_register(f); - gen_stencil_values(f, dsa->stencil[1].zfail_op, dsa->stencil[1].ref_value, - 0xff, fbS_reg, *back_zfail_reg); - } - - if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) { - *back_zpass_reg = fbS_reg; - } - else if (dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) { - *back_zpass_reg = *fail_reg; - } - else if (dsa->stencil[1].zpass_op == zfail_op) { - *back_zpass_reg = *zfail_reg; - } - else if (dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) { - *back_zpass_reg = *zpass_reg; - } - else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) { - *back_zpass_reg = *back_fail_reg; - } - else if (dsa->stencil[1].zpass_op == back_zfail_op) { - *back_zpass_reg = *back_zfail_reg; - } - else { - *back_zfail_reg = spe_allocate_available_register(f); - gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value, - 0xff, fbS_reg, *back_zpass_reg); - } - } /* End of calculations for back-facing stencil */ } /* Note that fbZ_reg may *not* be set on entry, if in fact @@ -1559,7 +1487,7 @@ gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_a static boolean gen_stencil_depth_test(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa, - const int const facing_reg, + const uint facing, const int mask_reg, const int fragZ_reg, const int fbZ_reg, const int fbS_reg) { @@ -1571,6 +1499,8 @@ gen_stencil_depth_test(struct spe_function *f, boolean need_to_calculate_stencil_values; boolean need_to_writemask_stencil_values; + struct pipe_stencil_state *stencil; + /* Registers. We may or may not actually allocate these, depending * on whether the state values indicate that we need them. */ @@ -1598,6 +1528,20 @@ gen_stencil_depth_test(struct spe_function *f, spe_comment(f, 0, "Allocating stencil register set"); spe_allocate_register_set(f); + /* The facing we're given is the fragment facing; it doesn't + * exactly match the stencil facing. If stencil is enabled, + * but two-sided stencil is *not* enabled, we use the same + * stencil settings for both front- and back-facing fragments. + * We only use the "back-facing" stencil for backfacing fragments + * if two-sided stenciling is enabled. + */ + if (facing == CELL_FACING_BACK && dsa->stencil[1].enabled) { + stencil = &dsa->stencil[1]; + } + else { + stencil = &dsa->stencil[0]; + } + /* Calculate the writemask. If the writemask is trivial (either * all 0s, meaning that we don't need to calculate any stencil values * because they're not going to change the stencil anyway, or all 1s, @@ -1608,24 +1552,20 @@ gen_stencil_depth_test(struct spe_function *f, * Note that if the backface stencil is *not* enabled, the backface * stencil will have the same values as the frontface stencil. */ - if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP && - dsa->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP && - dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP && - dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP && - dsa->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP && - dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) { - /* No changes to any stencil values */ + if (stencil->fail_op == PIPE_STENCIL_OP_KEEP && + stencil->zfail_op == PIPE_STENCIL_OP_KEEP && + stencil->zpass_op == PIPE_STENCIL_OP_KEEP) { need_to_calculate_stencil_values = false; need_to_writemask_stencil_values = false; } - else if (dsa->stencil[0].write_mask == 0x0 && dsa->stencil[1].write_mask == 0x0) { + else if (stencil->write_mask == 0x0) { /* All changes are writemasked out, so no need to calculate * what those changes might be, and no need to write anything back. */ need_to_calculate_stencil_values = false; need_to_writemask_stencil_values = false; } - else if (dsa->stencil[0].write_mask == 0xff && dsa->stencil[1].write_mask == 0xff) { + else if (stencil->write_mask == 0xff) { /* Still trivial, but a little less so. We need to write the stencil * values, but we don't need to mask them. */ @@ -1645,14 +1585,7 @@ gen_stencil_depth_test(struct spe_function *f, */ spe_comment(f, 0, "Computing stencil writemask"); stencil_writemask_reg = spe_allocate_available_register(f); - spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask); - if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) { - unsigned int back_write_mask_reg = spe_allocate_available_register(f); - spe_comment(f, 0, "Resolving two-sided stencil writemask"); - spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask); - spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg); - spe_release_register(f, back_write_mask_reg); - } + spe_load_uint(f, stencil_writemask_reg, dsa->stencil[facing].write_mask); } /* At least one-sided stenciling must be on. Generate code that @@ -1666,19 +1599,7 @@ gen_stencil_depth_test(struct spe_function *f, */ spe_comment(f, 0, "Running basic stencil test"); stencil_pass_reg = spe_allocate_available_register(f); - gen_stencil_test(f, &dsa->stencil[0], 0xff, mask_reg, fbS_reg, stencil_pass_reg); - - /* If two-sided stenciling is on, generate code to run the stencil - * test on the backfacing stencil as well, and combine the two results - * into the one correct result based on facing. - */ - if (dsa->stencil[1].enabled) { - unsigned int temp_reg = spe_allocate_available_register(f); - spe_comment(f, 0, "Running backface stencil test"); - gen_stencil_test(f, &dsa->stencil[1], 0xff, mask_reg, fbS_reg, temp_reg); - spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg); - spe_release_register(f, temp_reg); - } + gen_stencil_test(f, stencil, 0xff, mask_reg, fbS_reg, stencil_pass_reg); /* Generate code that, given the mask of valid fragments and the * mask of valid fragments that passed the stencil test, computes @@ -1698,9 +1619,6 @@ gen_stencil_depth_test(struct spe_function *f, /* We may not need to calculate stencil values, if the writemask is off */ if (need_to_calculate_stencil_values) { - unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values; - unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values; - /* Generate code that calculates exactly which stencil values we need, * without calculating the same value twice (say, if two different * stencil ops have the same value). This code will work for one-sided @@ -1715,51 +1633,11 @@ gen_stencil_depth_test(struct spe_function *f, * This function will allocate a variant number of registers that * will be released as part of the register set. */ - spe_comment(f, 0, "Computing stencil values"); - gen_get_stencil_values(f, dsa, fbS_reg, - &front_stencil_fail_values, &front_stencil_pass_depth_fail_values, - &front_stencil_pass_depth_pass_values, &back_stencil_fail_values, - &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values); - - /* Tricky, tricky, tricky - the things we do to create optimal - * code... - * - * The various stencil values registers may overlap with each other - * and with fbS_reg arbitrarily (as any particular operation is - * only calculated once and stored in one register, no matter - * how many times it is used). So we can't change the values - * within those registers directly - if we change a value in a - * register that's being referenced by two different calculations, - * we've just unwittingly changed the second value as well... - * - * Avoid this by allocating new registers to hold the results - * (there may be 2, if the depth test is off, or 3, if it is on). - * These will be released as part of the register set. - */ - if (!dsa->stencil[1].enabled) { - /* The easy case: if two-sided stenciling is *not* enabled, we - * just use the front-sided values. - */ - stencil_fail_values = front_stencil_fail_values; - stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values; - stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values; - } - else { /* two-sided stencil enabled */ - spe_comment(f, 0, "Resolving backface stencil values"); - /* Allocate new registers for the needed merged values */ - stencil_fail_values = spe_allocate_available_register(f); - spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg); - if (dsa->depth.enabled) { - stencil_pass_depth_fail_values = spe_allocate_available_register(f); - spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg); - } - else { - stencil_pass_depth_fail_values = fbS_reg; - } - stencil_pass_depth_pass_values = spe_allocate_available_register(f); - spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg); - } - } + spe_comment(f, 0, facing == CELL_FACING_FRONT ? "Computing front-facing stencil values" : "Computing back-facing stencil values"); + gen_get_stencil_values(f, stencil, dsa->depth.enabled, fbS_reg, + &stencil_fail_values, &stencil_pass_depth_fail_values, + &stencil_pass_depth_pass_values); + } /* We now have all the stencil values we need. We also need * the results of the depth test to figure out which @@ -1896,10 +1774,12 @@ gen_stencil_depth_test(struct spe_function *f, * should be much faster. * * \param cell the rendering context (in) + * \param facing whether the generated code is for front-facing or + * back-facing fragments * \param f the generated function (out) */ void -cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) +cell_gen_fragment_function(struct cell_context *cell, uint facing, struct spe_function *f) { const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil; const struct pipe_blend_state *blend = cell->blend; @@ -1917,7 +1797,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) const int fragB_reg = 10; /* vector float */ const int fragA_reg = 11; /* vector float */ const int mask_reg = 12; /* vector uint */ - const int facing_reg = 13; /* uint */ + + ASSERT(facing == CELL_FACING_FRONT || facing == CELL_FACING_BACK); /* offset of quad from start of tile * XXX assuming 4-byte pixels for color AND Z/stencil!!!! @@ -1945,7 +1826,6 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_allocate_register(f, fragB_reg); spe_allocate_register(f, fragA_reg); spe_allocate_register(f, mask_reg); - spe_allocate_register(f, facing_reg); quad_offset_reg = spe_allocate_available_register(f); fbRGBA_reg = spe_allocate_available_register(f); @@ -1969,6 +1849,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_release_register(f, y2_reg); } + /* Generate the alpha test, if needed. */ if (dsa->alpha.enabled) { gen_alpha_test(dsa, f, mask_reg, fragA_reg); } @@ -2095,7 +1976,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) * gen_stencil_depth_test() function must ignore the * fbZ_reg register if depth is not enabled. */ - write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg); + write_depth_stencil = gen_stencil_depth_test(f, dsa, facing, mask_reg, fragZ_reg, fbZ_reg, fbS_reg); } else if (dsa->depth.enabled) { int zmask_reg = spe_allocate_available_register(f); diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h index b59de198dc..2fabfdfb08 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h @@ -31,7 +31,7 @@ extern void -cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f); +cell_gen_fragment_function(struct cell_context *cell, uint facing, struct spe_function *f); #endif /* CELL_GEN_FRAGMENT_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index dd2d7f7d1e..031b27f11f 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -75,23 +75,29 @@ lookup_fragment_ops(struct cell_context *cell) * If not found, create/save new fragment ops command. */ if (!ops) { - struct spe_function spe_code; + struct spe_function spe_code_front, spe_code_back; if (0) debug_printf("**** Create New Fragment Ops\n"); /* Prepare the buffer that will hold the generated code. */ - spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); + spe_init_func(&spe_code_front, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); + spe_init_func(&spe_code_back, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); - /* generate new code */ - cell_gen_fragment_function(cell, &spe_code); + /* generate new code. Always generate new code for both front-facing + * and back-facing fragments, even if it's the same code in both + * cases. + */ + cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front); + cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back); /* alloc new fragment ops command */ ops = CALLOC_STRUCT(cell_command_fragment_ops); /* populate the new cell_command_fragment_ops object */ ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS; - memcpy(ops->code, spe_code.store, spe_code_size(&spe_code)); + memcpy(ops->code_front, spe_code_front.store, spe_code_size(&spe_code_front)); + memcpy(ops->code_back, spe_code_back.store, spe_code_size(&spe_code_back)); ops->dsa = *cell->depth_stencil; ops->blend = *cell->blend; @@ -99,7 +105,8 @@ lookup_fragment_ops(struct cell_context *cell) util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL); /* release rtasm buffer */ - spe_release_func(&spe_code); + spe_release_func(&spe_code_front); + spe_release_func(&spe_code_back); } else { if (0) diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index d726622d94..d5faf4e3aa 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -214,7 +214,8 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n"); /* Copy SPU code from batch buffer to spu buffer */ - memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); + memcpy(spu.fragment_ops_code_front, fops->code_front, SPU_MAX_FRAGMENT_OPS_INSTS * 4); + memcpy(spu.fragment_ops_code_back, fops->code_back, SPU_MAX_FRAGMENT_OPS_INSTS * 4); /* Copy state info (for fallback case only) */ memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); @@ -234,7 +235,8 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) * raw state records that the fallback code requires. */ if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) { - spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code; + spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) spu.fragment_ops_code_front; + spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) spu.fragment_ops_code_back; } else { /* otherwise, the default fallback code remains in place */ diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index c8bb251905..7033f6037d 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -63,7 +63,8 @@ one_time_init(void) * This will normally be overriden by a code-gen'd function * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set. */ - spu.fragment_ops = spu_fallback_fragment_ops; + spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; + spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; } @@ -90,7 +91,8 @@ main(main_param_t speid, main_param_t argp) ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4); ASSERT(sizeof(struct cell_command_render) % 8 == 0); - ASSERT(((unsigned long) &spu.fragment_ops_code) % 8 == 0); + ASSERT(((unsigned long) &spu.fragment_ops_code_front) % 8 == 0); + ASSERT(((unsigned long) &spu.fragment_ops_code_back) % 8 == 0); ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0); one_time_init(); diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 692790c9f3..24cf7d77ce 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -85,8 +85,7 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y, vector float fragGreen, vector float fragBlue, vector float fragAlpha, - vector unsigned int mask, - uint facing); + vector unsigned int mask); /** Function for running fragment program */ typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs, @@ -170,9 +169,10 @@ struct spu_global ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; /** Current fragment ops machine code, at 8-byte boundary */ - uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB; - /** Current fragment ops function */ - spu_fragment_ops_func fragment_ops; + uint fragment_ops_code_front[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB; + uint fragment_ops_code_back[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB; + /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */ + spu_fragment_ops_func fragment_ops[2]; /** Current fragment program machine code, at 8-byte boundary */ uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB; diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c index f8ffc70492..683664e8a4 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -75,8 +75,7 @@ spu_fallback_fragment_ops(uint x, uint y, vector float fragG, vector float fragB, vector float fragA, - vector unsigned int mask, - uint facing) + vector unsigned int mask) { vector float frag_aos[4]; unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */ diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h index a61689c83a..f817abf046 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h @@ -38,8 +38,7 @@ spu_fallback_fragment_ops(uint x, uint y, vector float fragGreen, vector float fragBlue, vector float fragAlpha, - vector unsigned int mask, - uint facing); + vector unsigned int mask); #endif /* SPU_PER_FRAGMENT_OP */ diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index 5f908159bb..22e51a86ae 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -275,15 +275,20 @@ emit_quad( int x, int y, mask_t mask) /* Execute per-fragment/quad operations, including: * alpha test, z test, stencil test, blend and framebuffer writing. + * Note that there are two different fragment operations functions + * that can be called, one for front-facing fragments, and one + * for back-facing fragments. (Often the two are the same; + * but in some cases, like two-sided stenciling, they can be + * very different.) So choose the correct function depending + * on the calculated facing. */ - spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile, + spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile, fragZ, outputs[0*4+0], outputs[0*4+1], outputs[0*4+2], outputs[0*4+3], - mask, - setup.facing); + mask); } } } @@ -519,7 +524,14 @@ setup_sort_vertices(const struct vertex_header *v0, setup.oneOverArea = 1.0f / area; - /* The product of area * sign indicates front/back orientation (0/1) */ + /* The product of area * sign indicates front/back orientation (0/1). + * Just in case someone gets the bright idea of switching the front + * and back constants without noticing that we're assuming their + * values in this operation, also assert that the values are + * what we think they are. + */ + ASSERT(CELL_FACING_FRONT == 0); + ASSERT(CELL_FACING_BACK == 1); setup.facing = (area * sign > 0.0f) ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW); -- cgit v1.2.3 From 11fc390f6478526d4f0bdb4b7e628284da31b3b9 Mon Sep 17 00:00:00 2001 From: Robert Ellison Date: Fri, 21 Nov 2008 11:42:14 -0700 Subject: CELL: use variant-length fragment ops programs This is a set of changes that optimizes the memory use of fragment operation programs (by using and transmitting only as much memory as is needed for the fragment ops programs, instead of maximal sizes), as well as eliminate the dependency on hard-coded maximal program sizes. State that is not dependent on fragment facing (i.e. that isn't using two-sided stenciling) will only save and transmit a single fragment operation program, instead of two identical programs. - Added the ability to emit a LNOP (No Operation (Load)) instruction. This is used to pad the generated fragment operations programs to a multiple of 8 bytes, which is necessary for proper operation of the dual instruction pipeline, and also required for proper SPU-side decoding. - Added the ability to allocate and manage a variant-length struct cell_command_fragment_ops. This structure now puts the generated function field at the end, where it can be as large as necessary. - On the PPU side, we now combine the generated front-facing and back-facing code into a single variant-length buffer (and only use one if the two sets of code are identical) for transmission to the SPU. - On the SPU side, we pull the correct sizes out of the buffer, allocate a new code buffer if the one we have isn't large enough, and save the code to that buffer. The buffer is deallocated when the SPU exits. - Commented out the emit_fetch() static function, which was not being used. --- src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 7 +- src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 11 ++- src/gallium/auxiliary/util/u_memory.h | 2 + src/gallium/drivers/cell/common.h | 31 +++++-- src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 7 +- src/gallium/drivers/cell/ppu/cell_state_emit.c | 77 ++++++++++++++-- src/gallium/drivers/cell/ppu/cell_vertex_fetch.c | 3 + src/gallium/drivers/cell/spu/spu_command.c | 111 +++++++++++++++++------ src/gallium/drivers/cell/spu/spu_command.h | 32 ++++++- src/gallium/drivers/cell/spu/spu_main.c | 15 +-- src/gallium/drivers/cell/spu/spu_main.h | 4 +- 11 files changed, 232 insertions(+), 68 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c index 1bd9f1c8dd..b9a75ae559 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c @@ -341,7 +341,11 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT, } - +#define EMIT(_name, _op) \ +void _name (struct spe_function *p) \ +{ \ + emit_RR(p, _op, 0, 0, 0, __FUNCTION__); \ +} #define EMIT_(_name, _op) \ void _name (struct spe_function *p, unsigned rT) \ @@ -713,7 +717,6 @@ hbrr; #if 0 stop; EMIT_RR (spe_stopd, 0x140); -EMIT_ (spe_lnop, 0x001); EMIT_ (spe_nop, 0x201); sync; EMIT_ (spe_dsync, 0x003); diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h index 7c211ffc51..f9ad2acacd 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h @@ -99,7 +99,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s); #endif /* RTASM_PPC_SPE_H */ -#ifndef EMIT_ +#ifndef EMIT +#define EMIT(_name, _op) \ + extern void _name (struct spe_function *p); #define EMIT_(_name, _op) \ extern void _name (struct spe_function *p, unsigned rT); #define EMIT_R(_name, _op) \ @@ -129,7 +131,7 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s); #define EMIT_I16(_name, _op) \ extern void _name (struct spe_function *p, int imm); #define UNDEF_EMIT_MACROS -#endif /* EMIT_ */ +#endif /* EMIT */ /* Memory load / store instructions @@ -294,6 +296,10 @@ EMIT_RI16(spe_brz, 0x040) EMIT_RI16(spe_brhnz, 0x046) EMIT_RI16(spe_brhz, 0x044) +/* Control instructions + */ +EMIT (spe_lnop, 0x001) + extern void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset); @@ -418,6 +424,7 @@ EMIT_R (spe_wrch, 0x10d) #ifdef UNDEF_EMIT_MACROS +#undef EMIT #undef EMIT_ #undef EMIT_R #undef EMIT_RR diff --git a/src/gallium/auxiliary/util/u_memory.h b/src/gallium/auxiliary/util/u_memory.h index 857102719d..1a6b596421 100644 --- a/src/gallium/auxiliary/util/u_memory.h +++ b/src/gallium/auxiliary/util/u_memory.h @@ -151,6 +151,8 @@ REALLOC( void *old_ptr, unsigned old_size, unsigned new_size ) #define CALLOC_STRUCT(T) (struct T *) CALLOC(1, sizeof(struct T)) +#define CALLOC_VARIANT_LENGTH_STRUCT(T,more_size) ((struct T *) CALLOC(1, sizeof(struct T) + more_size)) + /** * Return memory on given byte alignment diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index a670ed3c6e..98554d7f52 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -121,11 +121,6 @@ #define CELL_DEBUG_CMD (1 << 5) #define CELL_DEBUG_CACHE (1 << 6) -/** Max instructions for doing per-fragment operations */ -#define SPU_MAX_FRAGMENT_OPS_INSTS 128 - - - #define CELL_FENCE_IDLE 0 #define CELL_FENCE_EMITTED 1 #define CELL_FENCE_SIGNALLED 2 @@ -153,18 +148,36 @@ struct cell_command_fence /** * Command to specify per-fragment operations state and generated code. - * Note that the dsa, blend, blend_color fields are really only needed + * Note that this is a variant-length structure, allocated with as + * much memory as needed to hold the generated code; the "code" + * field *must* be the last field in the structure. Also, the entire + * length of the structure (including the variant code field) must be + * a multiple of 8 bytes; we require that this structure itself be + * a multiple of 8 bytes, and that the generated code also be a multiple + * of 8 bytes. + * + * Also note that the dsa, blend, blend_color fields are really only needed * for the fallback/C per-pixel code. They're not used when we generate - * dynamic SPU fragment code (which is the normal case). + * dynamic SPU fragment code (which is the normal case), and will eventually + * be removed from this structure. */ struct cell_command_fragment_ops { uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_OPS */ + + /* Fields for the fallback case */ struct pipe_depth_stencil_alpha_state dsa; struct pipe_blend_state blend; struct pipe_blend_color blend_color; - unsigned code_front[SPU_MAX_FRAGMENT_OPS_INSTS]; - unsigned code_back[SPU_MAX_FRAGMENT_OPS_INSTS]; + + /* Fields for the generated SPU code */ + unsigned total_code_size; + unsigned front_code_index; + unsigned back_code_index; + /* this field has variant length, and must be the last field in + * the structure + */ + unsigned code[0]; }; diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 82336d6635..2c64eb1bcc 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -1776,7 +1776,10 @@ gen_stencil_depth_test(struct spe_function *f, * \param cell the rendering context (in) * \param facing whether the generated code is for front-facing or * back-facing fragments - * \param f the generated function (out) + * \param f the generated function (in/out); on input, the function + * must already have been initialized. On exit, whatever + * instructions within the generated function have had + * the fragment ops appended. */ void cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f) @@ -1808,8 +1811,6 @@ cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct int fbRGBA_reg; /**< framebuffer's RGBA colors for quad */ int fbZS_reg; /**< framebuffer's combined z/stencil values for quad */ - spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); - if (cell->debug_flags & CELL_DEBUG_ASM) { spe_print_code(f, true); spe_indent(f, 8); diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 031b27f11f..0a0af81f53 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -76,30 +76,86 @@ lookup_fragment_ops(struct cell_context *cell) */ if (!ops) { struct spe_function spe_code_front, spe_code_back; + unsigned int facing_dependent, total_code_size; if (0) debug_printf("**** Create New Fragment Ops\n"); - /* Prepare the buffer that will hold the generated code. */ - spe_init_func(&spe_code_front, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); - spe_init_func(&spe_code_back, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); + /* Prepare the buffer that will hold the generated code. The + * "0" passed in for the size means that the SPE code will + * use a default size. + */ + spe_init_func(&spe_code_front, 0); + spe_init_func(&spe_code_back, 0); - /* generate new code. Always generate new code for both front-facing + /* Generate new code. Always generate new code for both front-facing * and back-facing fragments, even if it's the same code in both * cases. */ cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front); cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back); - /* alloc new fragment ops command */ - ops = CALLOC_STRUCT(cell_command_fragment_ops); + /* Make sure the code is a multiple of 8 bytes long; this is + * required to ensure that the dual pipe instruction alignment + * is correct. It's also important for the SPU unpacking, + * which assumes 8-byte boundaries. + */ + unsigned int front_code_size = spe_code_size(&spe_code_front); + while (front_code_size % 8 != 0) { + spe_lnop(&spe_code_front); + front_code_size = spe_code_size(&spe_code_front); + } + unsigned int back_code_size = spe_code_size(&spe_code_back); + while (back_code_size % 8 != 0) { + spe_lnop(&spe_code_back); + back_code_size = spe_code_size(&spe_code_back); + } + /* Determine whether the code we generated is facing-dependent, by + * determining whether the generated code is different for the front- + * and back-facing fragments. + */ + if (front_code_size == back_code_size && memcmp(spe_code_front.store, spe_code_back.store, front_code_size) == 0) { + /* Code is identical; only need one copy. */ + facing_dependent = 0; + total_code_size = front_code_size; + } + else { + /* Code is different for front-facing and back-facing fragments. + * Need to send both copies. + */ + facing_dependent = 1; + total_code_size = front_code_size + back_code_size; + } + + /* alloc new fragment ops command. Note that this structure + * has variant length based on the total code size required. + */ + ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, total_code_size); /* populate the new cell_command_fragment_ops object */ ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS; - memcpy(ops->code_front, spe_code_front.store, spe_code_size(&spe_code_front)); - memcpy(ops->code_back, spe_code_back.store, spe_code_size(&spe_code_back)); + ops->total_code_size = total_code_size; + ops->front_code_index = 0; + memcpy(ops->code, spe_code_front.store, front_code_size); + if (facing_dependent) { + /* We have separate front- and back-facing code. Append the + * back-facing code to the buffer. Be careful because the code + * size is in bytes, but the buffer is of unsigned elements. + */ + ops->back_code_index = front_code_size / sizeof(spe_code_front.store[0]); + memcpy(ops->code + ops->back_code_index, spe_code_back.store, back_code_size); + } + else { + /* Use the same code for front- and back-facing fragments */ + ops->back_code_index = ops->front_code_index; + } + + /* Set the fields for the fallback case. Note that these fields + * (and the whole fallback case) will eventually go away. + */ ops->dsa = *cell->depth_stencil; ops->blend = *cell->blend; + ops->blend_color = cell->blend_color; /* insert cell_command_fragment_ops object into keymap/cache */ util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL); @@ -200,9 +256,10 @@ cell_emit_state(struct cell_context *cell) CELL_NEW_DEPTH_STENCIL | CELL_NEW_BLEND)) { struct cell_command_fragment_ops *fops, *fops_cmd; - fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd)); + /* Note that cell_command_fragment_ops is a variant-sized record */ fops = lookup_fragment_ops(cell); - memcpy(fops_cmd, fops, sizeof(*fops)); + fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd) + fops->total_code_size); + memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size); } if (cell->dirty & CELL_NEW_SAMPLER) { diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c index 18969005b0..9cba537d9e 100644 --- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c +++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c @@ -145,6 +145,8 @@ emit_matrix_transpose(struct spe_function *p, } +#if 0 +/* This appears to not be used currently */ static void emit_fetch(struct spe_function *p, unsigned in_ptr, unsigned *offset, @@ -256,6 +258,7 @@ emit_fetch(struct spe_function *p, spe_release_register(p, float_one); } } +#endif void cell_update_vertex_fetch(struct draw_context *draw) diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index d5faf4e3aa..8500d19754 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -210,45 +210,72 @@ cmd_release_verts(const struct cell_command_release_verts *release) static void cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) { - static int warned = 0; - D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n"); - /* Copy SPU code from batch buffer to spu buffer */ - memcpy(spu.fragment_ops_code_front, fops->code_front, SPU_MAX_FRAGMENT_OPS_INSTS * 4); - memcpy(spu.fragment_ops_code_back, fops->code_back, SPU_MAX_FRAGMENT_OPS_INSTS * 4); - /* Copy state info (for fallback case only) */ + + /* Copy state info (for fallback case only - this will eventually + * go away when the fallback case goes away) + */ memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color)); - /* Parity twist! For now, always use the fallback code by default, - * only switching to codegen when specifically requested. This - * allows us to develop freely without risking taking down the - * branch. - * - * Later, the parity of this check will be reversed, so that - * codegen is *always* used, unless we specifically indicate that - * we don't want it. - * - * Eventually, the option will be removed completely, because in - * final code we'll always use codegen and won't even provide the - * raw state records that the fallback code requires. + /* Make sure the SPU knows which buffers it's expected to read when + * it's told to pull tiles. */ - if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) { - spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) spu.fragment_ops_code_front; - spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) spu.fragment_ops_code_back; - } - else { - /* otherwise, the default fallback code remains in place */ + spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled); + + /* If we're forcing the fallback code to be used (for debug purposes), + * install that. Otherwise install the incoming SPU code. + */ + if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) != 0) { + static unsigned int warned = 0; if (!warned) { fprintf(stderr, "Cell Warning: using fallback per-fragment code\n"); warned = 1; } + /* The following two lines aren't really necessary if you + * know the debug flags won't change during a run, and if you + * know that the function pointers are initialized correctly. + * We set them here to allow a person to change the debug + * flags during a run (from inside a debugger). + */ + spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; + spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; + return; } - spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled); -} + /* Make sure the SPU code buffer is large enough to hold the incoming code. + * Note that we *don't* use align_malloc() and align_free(), because + * those utility functions are *not* available in SPU code. + * */ + if (spu.fragment_ops_code_size < fops->total_code_size) { + if (spu.fragment_ops_code != NULL) { + free(spu.fragment_ops_code); + } + spu.fragment_ops_code_size = fops->total_code_size; + spu.fragment_ops_code = malloc(fops->total_code_size); + if (spu.fragment_ops_code == NULL) { + /* Whoops. */ + fprintf(stderr, "CELL Warning: failed to allocate fragment ops code (%d bytes) - using fallback\n", fops->total_code_size); + spu.fragment_ops_code = NULL; + spu.fragment_ops_code_size = 0; + spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; + spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; + return; + } + } + /* Copy the SPU code from the command buffer to the spu buffer */ + memcpy(spu.fragment_ops_code, fops->code, fops->total_code_size); + + /* Set the pointers for the front-facing and back-facing fragments + * to the specified offsets within the code. Note that if the + * front-facing and back-facing code are the same, they'll have + * the same offset. + */ + spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->front_code_index]; + spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->back_code_index]; +} static void cmd_state_fragment_program(const struct cell_command_fragment_program *fp) @@ -588,7 +615,8 @@ cmd_batch(uint opcode) struct cell_command_fragment_ops *fops = (struct cell_command_fragment_ops *) &buffer[pos]; cmd_state_fragment_ops(fops); - pos += sizeof(*fops) / 8; + /* This is a variant-sized command */ + pos += (sizeof(*fops) + fops->total_code_size)/ 8; } break; case CELL_CMD_STATE_FRAGMENT_PROGRAM: @@ -756,3 +784,32 @@ command_loop(void) if (spu.init.debug_flags & CELL_DEBUG_CACHE) spu_dcache_report(); } + +/* Initialize this module; we manage the fragment ops buffer here. */ +void +spu_command_init(void) +{ + /* Install default/fallback fragment processing function. + * This will normally be overriden by a code-gen'd function + * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set. + */ + spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; + spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; + + /* Set up the basic empty buffer for code-gen'ed fragment ops */ + spu.fragment_ops_code = NULL; + spu.fragment_ops_code_size = 0; +} + +void +spu_command_close(void) +{ + /* Deallocate the code-gen buffer for fragment ops, and reset the + * fragment ops functions to their initial setting (just to leave + * things in a good state). + */ + if (spu.fragment_ops_code != NULL) { + free(spu.fragment_ops_code); + } + spu_command_init(); +} diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h index 853e9aa549..83dcdade28 100644 --- a/src/gallium/drivers/cell/spu/spu_command.h +++ b/src/gallium/drivers/cell/spu/spu_command.h @@ -1,7 +1,35 @@ - - +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ extern void command_loop(void); +extern void +spu_command_init(void); +extern void +spu_command_close(void); diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 7033f6037d..97c86d194d 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -58,17 +58,8 @@ one_time_init(void) memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status)); memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status)); invalidate_tex_cache(); - - /* Install default/fallback fragment processing function. - * This will normally be overriden by a code-gen'd function - * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set. - */ - spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; - spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; } - - /* In some versions of the SDK the SPE main takes 'unsigned long' as a * parameter. In others it takes 'unsigned long long'. Use a define to * select between the two. @@ -91,11 +82,11 @@ main(main_param_t speid, main_param_t argp) ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4); ASSERT(sizeof(struct cell_command_render) % 8 == 0); - ASSERT(((unsigned long) &spu.fragment_ops_code_front) % 8 == 0); - ASSERT(((unsigned long) &spu.fragment_ops_code_back) % 8 == 0); + ASSERT(sizeof(struct cell_command_fragment_ops) % 8 == 0); ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0); one_time_init(); + spu_command_init(); D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid); D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n"); @@ -120,5 +111,7 @@ main(main_param_t speid, main_param_t argp) command_loop(); + spu_command_close(); + return 0; } diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 24cf7d77ce..33767e7c51 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -169,8 +169,8 @@ struct spu_global ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; /** Current fragment ops machine code, at 8-byte boundary */ - uint fragment_ops_code_front[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB; - uint fragment_ops_code_back[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB; + uint *fragment_ops_code; + uint fragment_ops_code_size; /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */ spu_fragment_ops_func fragment_ops[2]; -- cgit v1.2.3 From 402e6752b53d04af0bbfc5391547c2d127bce859 Mon Sep 17 00:00:00 2001 From: Jonathan Adamczewski Date: Mon, 12 Jan 2009 16:24:49 -0700 Subject: cell: allocate batch buffers w/ 16-byte alignment Replace cell_batch{align,alloc)*() with cell_batch_alloc16(), allocating multiples of 16 bytes that are 16 byte aligned. Opcodes are stored in preferred slot of SPU machine word. Various structures are explicitly padded to 16 byte multiples. Added STATIC_ASSERT(). --- src/gallium/drivers/cell/common.h | 43 +++++++++++---- src/gallium/drivers/cell/ppu/cell_batch.c | 73 ++++---------------------- src/gallium/drivers/cell/ppu/cell_batch.h | 9 +--- src/gallium/drivers/cell/ppu/cell_clear.c | 5 +- src/gallium/drivers/cell/ppu/cell_flush.c | 13 ++--- src/gallium/drivers/cell/ppu/cell_state_emit.c | 43 ++++++++------- src/gallium/drivers/cell/ppu/cell_vbuf.c | 16 +++--- src/gallium/drivers/cell/spu/spu_command.c | 52 +++++++++--------- 8 files changed, 111 insertions(+), 143 deletions(-) (limited to 'src/gallium/drivers/cell/common.h') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 98554d7f52..1f6860da11 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -49,6 +49,15 @@ } + +#define JOIN(x, y) JOIN_AGAIN(x, y) +#define JOIN_AGAIN(x, y) x ## y + +#define STATIC_ASSERT(e) \ +{typedef char JOIN(assertion_failed_at_line_, __LINE__) [(e) ? 1 : -1];} + + + /** for sanity checking */ #define ASSERT_ALIGN16(ptr) \ ASSERT((((unsigned long) (ptr)) & 0xf) == 0); @@ -134,6 +143,11 @@ struct cell_fence volatile uint status[CELL_MAX_SPUS][4]; }; +#ifdef __SPU__ +typedef vector unsigned int opcode_t; +#else +typedef unsigned int opcode_t[4]; +#endif /** * Fence command sent to SPUs. In response, the SPUs will write @@ -141,8 +155,9 @@ struct cell_fence */ struct cell_command_fence { - uint64_t opcode; /**< CELL_CMD_FENCE */ + opcode_t opcode; /**< CELL_CMD_FENCE */ struct cell_fence *fence; + uint32_t pad_[3]; }; @@ -163,7 +178,7 @@ struct cell_command_fence */ struct cell_command_fragment_ops { - uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_OPS */ + opcode_t opcode; /**< CELL_CMD_STATE_FRAGMENT_OPS */ /* Fields for the fallback case */ struct pipe_depth_stencil_alpha_state dsa; @@ -189,8 +204,9 @@ struct cell_command_fragment_ops */ struct cell_command_fragment_program { - uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */ + opcode_t opcode; /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */ uint num_inst; /**< Number of instructions */ + uint32_t pad[3]; unsigned code[SPU_MAX_FRAGMENT_PROGRAM_INSTS]; }; @@ -200,10 +216,11 @@ struct cell_command_fragment_program */ struct cell_command_framebuffer { - uint64_t opcode; /**< CELL_CMD_STATE_FRAMEBUFFER */ + opcode_t opcode; /**< CELL_CMD_STATE_FRAMEBUFFER */ int width, height; void *color_start, *depth_start; enum pipe_format color_format, depth_format; + uint32_t pad_[2]; }; @@ -212,7 +229,7 @@ struct cell_command_framebuffer */ struct cell_command_rasterizer { - uint64_t opcode; /**< CELL_CMD_STATE_RASTERIZER */ + opcode_t opcode; /**< CELL_CMD_STATE_RASTERIZER */ struct pipe_rasterizer_state rasterizer; }; @@ -222,9 +239,10 @@ struct cell_command_rasterizer */ struct cell_command_clear_surface { - uint64_t opcode; /**< CELL_CMD_CLEAR_SURFACE */ + opcode_t opcode; /**< CELL_CMD_CLEAR_SURFACE */ uint surface; /**< Temporary: 0=color, 1=Z */ uint value; + uint32_t pad[2]; }; @@ -271,7 +289,7 @@ struct cell_shader_info #define SPU_VERTS_PER_BATCH 64 struct cell_command_vs { - uint64_t opcode; /**< CELL_CMD_VS_EXECUTE */ + opcode_t opcode; /**< CELL_CMD_VS_EXECUTE */ uint64_t vOut[SPU_VERTS_PER_BATCH]; unsigned num_elts; unsigned elts[SPU_VERTS_PER_BATCH]; @@ -283,7 +301,7 @@ struct cell_command_vs struct cell_command_render { - uint64_t opcode; /**< CELL_CMD_RENDER */ + opcode_t opcode; /**< CELL_CMD_RENDER */ uint prim_type; /**< PIPE_PRIM_x */ uint num_verts; uint vertex_size; /**< bytes per vertex */ @@ -292,27 +310,30 @@ struct cell_command_render float xmin, ymin, xmax, ymax; /* XXX another dummy field */ uint min_index; boolean inline_verts; + uint32_t pad_[1]; }; struct cell_command_release_verts { - uint64_t opcode; /**< CELL_CMD_RELEASE_VERTS */ + opcode_t opcode; /**< CELL_CMD_RELEASE_VERTS */ uint vertex_buf; /**< in [0, CELL_NUM_BUFFERS-1] */ + uint32_t pad_[3]; }; struct cell_command_sampler { - uint64_t opcode; /**< CELL_CMD_STATE_SAMPLER */ + opcode_t opcode; /**< CELL_CMD_STATE_SAMPLER */ uint unit; struct pipe_sampler_state state; + uint32_t pad_[1]; }; struct cell_command_texture { - uint64_t opcode; /**< CELL_CMD_STATE_TEXTURE */ + opcode_t opcode; /**< CELL_CMD_STATE_TEXTURE */ uint target; /**< PIPE_TEXTURE_x */ uint unit; void *start[CELL_MAX_TEXTURE_LEVELS]; /**< Address in main memory */ diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c index 962775cd33..fe144f8b84 100644 --- a/src/gallium/drivers/cell/ppu/cell_batch.c +++ b/src/gallium/drivers/cell/ppu/cell_batch.c @@ -108,15 +108,16 @@ emit_fence(struct cell_context *cell) fence->status[i][0] = CELL_FENCE_EMITTED; } + STATIC_ASSERT(sizeof(struct cell_command_fence) % 16 == 0); + ASSERT(size % 16 == 0); ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE); fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size); - fence_cmd->opcode = CELL_CMD_FENCE; + fence_cmd->opcode[0] = CELL_CMD_FENCE; fence_cmd->fence = fence; /* update batch buffer size */ cell->buffer_size[batch] = size + sizeof(struct cell_command_fence); - assert(sizeof(struct cell_command_fence) % 8 == 0); } @@ -191,70 +192,19 @@ cell_batch_free_space(const struct cell_context *cell) } -/** - * Append data to the current batch buffer. - * \param data address of block of bytes to append - * \param bytes size of block of bytes - */ -void -cell_batch_append(struct cell_context *cell, const void *data, uint bytes) -{ - uint size; - - ASSERT(bytes % 8 == 0); - ASSERT(bytes <= CELL_BUFFER_SIZE); - ASSERT(cell->cur_batch >= 0); - -#ifdef ASSERT - { - uint spu; - for (spu = 0; spu < cell->num_spus; spu++) { - ASSERT(cell->buffer_status[spu][cell->cur_batch][0] - == CELL_BUFFER_STATUS_USED); - } - } -#endif - - size = cell->buffer_size[cell->cur_batch]; - - if (bytes > cell_batch_free_space(cell)) { - cell_batch_flush(cell); - size = 0; - } - - ASSERT(size + bytes <= CELL_BUFFER_SIZE); - - memcpy(cell->buffer[cell->cur_batch] + size, data, bytes); - - cell->buffer_size[cell->cur_batch] = size + bytes; -} - - /** * Allocate space in the current batch buffer for 'bytes' space. + * Bytes must be a multiple of 16 bytes. Allocation will be 16 byte aligned. * \return address in batch buffer to put data */ void * -cell_batch_alloc(struct cell_context *cell, uint bytes) -{ - return cell_batch_alloc_aligned(cell, bytes, 1); -} - - -/** - * Same as \sa cell_batch_alloc, but return an address at a particular - * alignment. - */ -void * -cell_batch_alloc_aligned(struct cell_context *cell, uint bytes, - uint alignment) +cell_batch_alloc16(struct cell_context *cell, uint bytes) { void *pos; - uint size, padbytes; + uint size; - ASSERT(bytes % 8 == 0); + ASSERT(bytes % 16 == 0); ASSERT(bytes <= CELL_BUFFER_SIZE); - ASSERT(alignment > 0); ASSERT(cell->cur_batch >= 0); #ifdef ASSERT @@ -269,17 +219,12 @@ cell_batch_alloc_aligned(struct cell_context *cell, uint bytes, size = cell->buffer_size[cell->cur_batch]; - padbytes = (alignment - (size % alignment)) % alignment; - - if (padbytes + bytes > cell_batch_free_space(cell)) { + if (bytes > cell_batch_free_space(cell)) { cell_batch_flush(cell); size = 0; } - else { - size += padbytes; - } - ASSERT(size % alignment == 0); + ASSERT(size % 16 == 0); ASSERT(size + bytes <= CELL_BUFFER_SIZE); pos = (void *) (cell->buffer[cell->cur_batch] + size); diff --git a/src/gallium/drivers/cell/ppu/cell_batch.h b/src/gallium/drivers/cell/ppu/cell_batch.h index f74dd60079..290136031a 100644 --- a/src/gallium/drivers/cell/ppu/cell_batch.h +++ b/src/gallium/drivers/cell/ppu/cell_batch.h @@ -44,15 +44,8 @@ cell_batch_flush(struct cell_context *cell); extern uint cell_batch_free_space(const struct cell_context *cell); -extern void -cell_batch_append(struct cell_context *cell, const void *data, uint bytes); - -extern void * -cell_batch_alloc(struct cell_context *cell, uint bytes); - extern void * -cell_batch_alloc_aligned(struct cell_context *cell, uint bytes, - uint alignment); +cell_batch_alloc16(struct cell_context *cell, uint bytes); extern void cell_init_batch_buffers(struct cell_context *cell); diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c index 037635e466..c2e276988c 100644 --- a/src/gallium/drivers/cell/ppu/cell_clear.c +++ b/src/gallium/drivers/cell/ppu/cell_clear.c @@ -99,10 +99,11 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps, /* Build a CLEAR command and place it in the current batch buffer */ { + STATIC_ASSERT(sizeof(struct cell_command_clear_surface) % 16 == 0); struct cell_command_clear_surface *clr = (struct cell_command_clear_surface *) - cell_batch_alloc(cell, sizeof(*clr)); - clr->opcode = CELL_CMD_CLEAR_SURFACE; + cell_batch_alloc16(cell, sizeof(*clr)); + clr->opcode[0] = CELL_CMD_CLEAR_SURFACE; clr->surface = surfIndex; clr->value = clearValue; } diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c index a64967b4b9..8275c9dc9c 100644 --- a/src/gallium/drivers/cell/ppu/cell_flush.c +++ b/src/gallium/drivers/cell/ppu/cell_flush.c @@ -72,8 +72,9 @@ cell_flush_int(struct cell_context *cell, unsigned flags) flushing = TRUE; if (flags & CELL_FLUSH_WAIT) { - uint64_t *cmd = (uint64_t *) cell_batch_alloc(cell, sizeof(uint64_t)); - *cmd = CELL_CMD_FINISH; + STATIC_ASSERT(sizeof(opcode_t) % 16 == 0); + opcode_t *cmd = (opcode_t*) cell_batch_alloc16(cell, sizeof(opcode_t)); + *cmd[0] = CELL_CMD_FINISH; } cell_batch_flush(cell); @@ -101,11 +102,11 @@ void cell_flush_buffer_range(struct cell_context *cell, void *ptr, unsigned size) { - uint64_t batch[1 + (ROUNDUP8(sizeof(struct cell_buffer_range)) / 8)]; - struct cell_buffer_range *br = (struct cell_buffer_range *) & batch[1]; - + STATIC_ASSERT((sizeof(opcode_t) + sizeof(struct cell_buffer_range)) % 16 == 0); + uint32_t *batch = (uint32_t*)cell_batch_alloc16(cell, + sizeof(opcode_t) + sizeof(struct cell_buffer_range)); + struct cell_buffer_range *br = (struct cell_buffer_range *) &batch[4]; batch[0] = CELL_CMD_FLUSH_BUFFER_RANGE; br->base = (uintptr_t) ptr; br->size = size; - cell_batch_append(cell, batch, sizeof(batch)); } diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 0a0af81f53..39b85faeb8 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -133,7 +133,7 @@ lookup_fragment_ops(struct cell_context *cell) */ ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, total_code_size); /* populate the new cell_command_fragment_ops object */ - ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS; + ops->opcode[0] = CELL_CMD_STATE_FRAGMENT_OPS; ops->total_code_size = total_code_size; ops->front_code_index = 0; memcpy(ops->code, spe_code_front.store, front_code_size); @@ -178,10 +178,10 @@ static void emit_state_cmd(struct cell_context *cell, uint cmd, const void *state, uint state_size) { - uint64_t *dst = (uint64_t *) - cell_batch_alloc(cell, ROUNDUP8(sizeof(uint64_t) + state_size)); + uint32_t *dst = (uint32_t *) + cell_batch_alloc16(cell, ROUNDUP16(sizeof(opcode_t) + state_size)); *dst = cmd; - memcpy(dst + 1, state, state_size); + memcpy(dst + 4, state, state_size); } @@ -195,9 +195,10 @@ cell_emit_state(struct cell_context *cell) if (cell->dirty & CELL_NEW_FRAMEBUFFER) { struct pipe_surface *cbuf = cell->framebuffer.cbufs[0]; struct pipe_surface *zbuf = cell->framebuffer.zsbuf; + STATIC_ASSERT(sizeof(struct cell_command_framebuffer) % 16 == 0); struct cell_command_framebuffer *fb - = cell_batch_alloc(cell, sizeof(*fb)); - fb->opcode = CELL_CMD_STATE_FRAMEBUFFER; + = cell_batch_alloc16(cell, sizeof(*fb)); + fb->opcode[0] = CELL_CMD_STATE_FRAMEBUFFER; fb->color_start = cell->cbuf_map[0]; fb->color_format = cbuf->format; fb->depth_start = cell->zsbuf_map; @@ -211,17 +212,19 @@ cell_emit_state(struct cell_context *cell) } if (cell->dirty & (CELL_NEW_RASTERIZER)) { + STATIC_ASSERT(sizeof(struct cell_command_rasterizer) % 16 == 0); struct cell_command_rasterizer *rast = - cell_batch_alloc(cell, sizeof(*rast)); - rast->opcode = CELL_CMD_STATE_RASTERIZER; + cell_batch_alloc16(cell, sizeof(*rast)); + rast->opcode[0] = CELL_CMD_STATE_RASTERIZER; rast->rasterizer = *cell->rasterizer; } if (cell->dirty & (CELL_NEW_FS)) { /* Send new fragment program to SPUs */ + STATIC_ASSERT(sizeof(struct cell_command_fragment_program) % 16 == 0); struct cell_command_fragment_program *fp - = cell_batch_alloc(cell, sizeof(*fp)); - fp->opcode = CELL_CMD_STATE_FRAGMENT_PROGRAM; + = cell_batch_alloc16(cell, sizeof(*fp)); + fp->opcode[0] = CELL_CMD_STATE_FRAGMENT_PROGRAM; fp->num_inst = cell->fs->code.num_inst; memcpy(&fp->code, cell->fs->code.store, SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE); @@ -238,14 +241,14 @@ cell_emit_state(struct cell_context *cell) const uint shader = PIPE_SHADER_FRAGMENT; const uint num_const = cell->constants[shader].size / sizeof(float); uint i, j; - float *buf = cell_batch_alloc(cell, 16 + num_const * sizeof(float)); - uint64_t *ibuf = (uint64_t *) buf; + float *buf = cell_batch_alloc16(cell, ROUNDUP16(32 + num_const * sizeof(float))); + uint32_t *ibuf = (uint32_t *) buf; const float *constants = pipe_buffer_map(cell->pipe.screen, cell->constants[shader].buffer, PIPE_BUFFER_USAGE_CPU_READ); ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS; - ibuf[1] = num_const; - j = 4; + ibuf[4] = num_const; + j = 8; for (i = 0; i < num_const; i++) { buf[j++] = constants[i]; } @@ -258,7 +261,7 @@ cell_emit_state(struct cell_context *cell) struct cell_command_fragment_ops *fops, *fops_cmd; /* Note that cell_command_fragment_ops is a variant-sized record */ fops = lookup_fragment_ops(cell); - fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd) + fops->total_code_size); + fops_cmd = cell_batch_alloc16(cell, ROUNDUP16(sizeof(*fops_cmd) + fops->total_code_size)); memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size); } @@ -267,9 +270,10 @@ cell_emit_state(struct cell_context *cell) for (i = 0; i < CELL_MAX_SAMPLERS; i++) { if (cell->dirty_samplers & (1 << i)) { if (cell->sampler[i]) { + STATIC_ASSERT(sizeof(struct cell_command_sampler) % 16 == 0); struct cell_command_sampler *sampler - = cell_batch_alloc(cell, sizeof(*sampler)); - sampler->opcode = CELL_CMD_STATE_SAMPLER; + = cell_batch_alloc16(cell, sizeof(*sampler)); + sampler->opcode[0] = CELL_CMD_STATE_SAMPLER; sampler->unit = i; sampler->state = *cell->sampler[i]; } @@ -282,9 +286,10 @@ cell_emit_state(struct cell_context *cell) uint i; for (i = 0;i < CELL_MAX_SAMPLERS; i++) { if (cell->dirty_textures & (1 << i)) { + STATIC_ASSERT(sizeof(struct cell_command_texture) % 16 == 0); struct cell_command_texture *texture - = cell_batch_alloc(cell, sizeof(*texture)); - texture->opcode = CELL_CMD_STATE_TEXTURE; + = (struct cell_command_texture *)cell_batch_alloc16(cell, sizeof(*texture)); + texture->opcode[0] = CELL_CMD_STATE_TEXTURE; texture->unit = i; if (cell->texture[i]) { uint level; diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c index 65ba51b6bb..ab54e79689 100644 --- a/src/gallium/drivers/cell/ppu/cell_vbuf.c +++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c @@ -116,10 +116,11 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, /* Tell SPUs they can release the vert buf */ if (cvbr->vertex_buf != ~0U) { + STATIC_ASSERT(sizeof(struct cell_command_release_verts) % 16 == 0); struct cell_command_release_verts *release = (struct cell_command_release_verts *) - cell_batch_alloc(cell, sizeof(struct cell_command_release_verts)); - release->opcode = CELL_CMD_RELEASE_VERTS; + cell_batch_alloc16(cell, sizeof(struct cell_command_release_verts)); + release->opcode[0] = CELL_CMD_RELEASE_VERTS; release->vertex_buf = cvbr->vertex_buf; } @@ -210,15 +211,16 @@ cell_vbuf_draw(struct vbuf_render *vbr, /* build/insert batch RENDER command */ { - const uint index_bytes = ROUNDUP8(nr_indices * 2); - const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size; + const uint index_bytes = ROUNDUP16(nr_indices * 2); + const uint vertex_bytes = ROUNDUP16(nr_vertices * 4 * cell->vertex_info.size); + STATIC_ASSERT(sizeof(struct cell_command_render) % 16 == 0); const uint batch_size = sizeof(struct cell_command_render) + index_bytes; struct cell_command_render *render = (struct cell_command_render *) - cell_batch_alloc(cell, batch_size); + cell_batch_alloc16(cell, batch_size); - render->opcode = CELL_CMD_RENDER; + render->opcode[0] = CELL_CMD_RENDER; render->prim_type = cvbr->prim; render->num_indexes = nr_indices; @@ -236,7 +238,7 @@ cell_vbuf_draw(struct vbuf_render *vbr, min_index == 0 && vertex_bytes + 16 <= cell_batch_free_space(cell)) { /* vertex data inlined, after indices, at 16-byte boundary */ - void *dst = cell_batch_alloc_aligned(cell, vertex_bytes, 16); + void *dst = cell_batch_alloc16(cell, vertex_bytes); memcpy(dst, vertices, vertex_bytes); render->inline_verts = TRUE; render->vertex_buf = ~0; diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index 8500d19754..5c0179d954 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -292,10 +292,10 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp) static uint -cmd_state_fs_constants(const uint64_t *buffer, uint pos) +cmd_state_fs_constants(const qword *buffer, uint pos) { - const uint num_const = buffer[pos + 1]; - const float *constants = (const float *) &buffer[pos + 2]; + const uint num_const = spu_extract((vector unsigned int)buffer[pos+1], 0); + const float *constants = (const float *) &buffer[pos+2]; uint i; D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const); @@ -306,8 +306,8 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos) spu.constants[i] = spu_splats(constants[i]); } - /* return new buffer pos (in 8-byte words) */ - return pos + 2 + num_const / 2; + /* return new buffer pos (in 16-byte words) */ + return pos + 2 + (ROUNDUP16(num_const * sizeof(float)) / 16); } @@ -547,8 +547,8 @@ cmd_batch(uint opcode) { const uint buf = (opcode >> 8) & 0xff; uint size = (opcode >> 16); - uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB; - const unsigned usize = size / sizeof(buffer[0]); + qword buffer[CELL_BUFFER_SIZE / 16] ALIGN16_ATTRIB; + const unsigned usize = ROUNDUP16(size) / sizeof(buffer[0]); uint pos; D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n", @@ -578,7 +578,7 @@ cmd_batch(uint opcode) * Loop over commands in the batch buffer */ for (pos = 0; pos < usize; /* no incr */) { - switch (buffer[pos]) { + switch (si_to_uint(buffer[pos])) { /* * rendering commands */ @@ -587,7 +587,7 @@ cmd_batch(uint opcode) struct cell_command_clear_surface *clr = (struct cell_command_clear_surface *) &buffer[pos]; cmd_clear_surface(clr); - pos += sizeof(*clr) / 8; + pos += sizeof(*clr) / 16; } break; case CELL_CMD_RENDER: @@ -596,7 +596,7 @@ cmd_batch(uint opcode) = (struct cell_command_render *) &buffer[pos]; uint pos_incr; cmd_render(render, &pos_incr); - pos += pos_incr; + pos += ((pos_incr+1)&~1) / 2; // should 'fix' cmd_render return } break; /* @@ -607,7 +607,7 @@ cmd_batch(uint opcode) struct cell_command_framebuffer *fb = (struct cell_command_framebuffer *) &buffer[pos]; cmd_state_framebuffer(fb); - pos += sizeof(*fb) / 8; + pos += sizeof(*fb) / 16; } break; case CELL_CMD_STATE_FRAGMENT_OPS: @@ -616,7 +616,7 @@ cmd_batch(uint opcode) = (struct cell_command_fragment_ops *) &buffer[pos]; cmd_state_fragment_ops(fops); /* This is a variant-sized command */ - pos += (sizeof(*fops) + fops->total_code_size)/ 8; + pos += ROUNDUP16(sizeof(*fops) + fops->total_code_size) / 16; } break; case CELL_CMD_STATE_FRAGMENT_PROGRAM: @@ -624,7 +624,7 @@ cmd_batch(uint opcode) struct cell_command_fragment_program *fp = (struct cell_command_fragment_program *) &buffer[pos]; cmd_state_fragment_program(fp); - pos += sizeof(*fp) / 8; + pos += sizeof(*fp) / 16; } break; case CELL_CMD_STATE_FS_CONSTANTS: @@ -635,7 +635,7 @@ cmd_batch(uint opcode) struct cell_command_rasterizer *rast = (struct cell_command_rasterizer *) &buffer[pos]; spu.rasterizer = rast->rasterizer; - pos += sizeof(*rast) / 8; + pos += sizeof(*rast) / 16; } break; case CELL_CMD_STATE_SAMPLER: @@ -643,7 +643,7 @@ cmd_batch(uint opcode) struct cell_command_sampler *sampler = (struct cell_command_sampler *) &buffer[pos]; cmd_state_sampler(sampler); - pos += sizeof(*sampler) / 8; + pos += sizeof(*sampler) / 16; } break; case CELL_CMD_STATE_TEXTURE: @@ -651,37 +651,37 @@ cmd_batch(uint opcode) struct cell_command_texture *texture = (struct cell_command_texture *) &buffer[pos]; cmd_state_texture(texture); - pos += sizeof(*texture) / 8; + pos += sizeof(*texture) / 16; } break; case CELL_CMD_STATE_VERTEX_INFO: cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8); + pos += 1 + ROUNDUP16(sizeof(struct vertex_info)) / 16; break; case CELL_CMD_STATE_VIEWPORT: (void) memcpy(& draw.viewport, &buffer[pos+1], sizeof(struct pipe_viewport_state)); - pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8); + pos += 1 + ROUNDUP16(sizeof(struct pipe_viewport_state)) / 16; break; case CELL_CMD_STATE_UNIFORMS: - draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1]; + draw.constants = (const float (*)[4]) (uintptr_t)spu_extract((vector unsigned int)buffer[pos+1],0); pos += 2; break; case CELL_CMD_STATE_VS_ARRAY_INFO: cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8); + pos += 1 + ROUNDUP16(sizeof(struct cell_array_info)) / 16; break; case CELL_CMD_STATE_BIND_VS: #if 0 spu_bind_vertex_shader(&draw, (struct cell_shader_info *) &buffer[pos+1]); #endif - pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8); + pos += 1 + ROUNDUP16(sizeof(struct cell_shader_info)) / 16; break; case CELL_CMD_STATE_ATTRIB_FETCH: cmd_state_attrib_fetch((struct cell_attribute_fetch_code *) &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8); + pos += 1 + ROUNDUP16(sizeof(struct cell_attribute_fetch_code)) / 16; break; /* * misc commands @@ -695,7 +695,7 @@ cmd_batch(uint opcode) struct cell_command_fence *fence_cmd = (struct cell_command_fence *) &buffer[pos]; cmd_fence(fence_cmd); - pos += sizeof(*fence_cmd) / 8; + pos += sizeof(*fence_cmd) / 16; } break; case CELL_CMD_RELEASE_VERTS: @@ -703,7 +703,7 @@ cmd_batch(uint opcode) struct cell_command_release_verts *release = (struct cell_command_release_verts *) &buffer[pos]; cmd_release_verts(release); - pos += sizeof(*release) / 8; + pos += sizeof(*release) / 16; } break; case CELL_CMD_FLUSH_BUFFER_RANGE: { @@ -711,11 +711,11 @@ cmd_batch(uint opcode) &buffer[pos+1]; spu_dcache_mark_dirty((unsigned) br->base, br->size); - pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8); + pos += 1 + ROUNDUP16(sizeof(struct cell_buffer_range)) / 16; break; } default: - printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]); + printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, si_to_uint(buffer[pos])); ASSERT(0); break; } -- cgit v1.2.3