From bb5becf1e289b2c9240d98299e9447a9673da9fc Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Fri, 5 Sep 2008 13:54:14 -0600 Subject: gallium: comments, assertions, etc --- src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 36 +++++++++++++++++++++++++---- src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 20 +++++++++------- 2 files changed, 43 insertions(+), 13 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c index 285ddc0e3f..fe5beba456 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c @@ -300,7 +300,9 @@ void _name (struct spe_function *p, int imm) \ #include "rtasm_ppc_spe.h" -/* +/** + * Initialize an spe_function. + * \param code_size size of instruction buffer to allocate, in bytes. */ void spe_init_func(struct spe_function *p, unsigned code_size) { @@ -324,10 +326,14 @@ void spe_release_func(struct spe_function *p) } +/** + * Allocate an SPE register. + * \return register index or -1 if none left. + */ int spe_allocate_available_register(struct spe_function *p) { unsigned i; - for (i = 0; i < 128; i++) { + for (i = 0; i < SPE_NUM_REGS; i++) { const uint64_t mask = (1ULL << (i % 64)); const unsigned idx = i / 64; @@ -341,11 +347,15 @@ int spe_allocate_available_register(struct spe_function *p) } +/** + * Mark the given SPE register as "allocated". + */ int spe_allocate_register(struct spe_function *p, int reg) { const unsigned idx = reg / 64; const unsigned bit = reg % 64; + assert(reg < SPE_NUM_REGS); assert((p->regs[idx] & (1ULL << bit)) != 0); p->regs[idx] &= ~(1ULL << bit); @@ -353,57 +363,73 @@ int spe_allocate_register(struct spe_function *p, int reg) } +/** + * Mark the given SPE register as "unallocated". 
+ */ void spe_release_register(struct spe_function *p, int reg) { const unsigned idx = reg / 64; const unsigned bit = reg % 64; + assert(reg < SPE_NUM_REGS); assert((p->regs[idx] & (1ULL << bit)) == 0); p->regs[idx] |= (1ULL << bit); } +/** + * For branch instructions: + * \param d if 1, disable interrupts if branch is taken + * \param e if 1, enable interrupts if branch is taken + * If d and e are both zero, don't change interrupt status (right?) + */ - +/** Branch Indirect to address in rA */ void spe_bi(struct spe_function *p, unsigned rA, int d, int e) { emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4)); } +/** Interrupt Return */ void spe_iret(struct spe_function *p, unsigned rA, int d, int e) { emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4)); } +/** Branch indirect and set link on external data */ void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4)); } +/** Branch indirect and set link. Save PC in rT, jump to rA. */ void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4)); } -void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, - int e) +/** Branch indirect if zero word. If rT.word[0]==0, jump to rA. */ +void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4)); } +/** Branch indirect if non-zero word. If rT.word[0]!=0, jump to rA. */ void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4)); } +/** Branch indirect if zero halfword. If rT.halfword[1]==0, jump to rA. */ void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4)); } +/** Branch indirect if non-zero halfword. If rT.halfword[1]!=0, jump to rA. 
*/ void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4)); diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h index 1cacc717b1..7dd754ba77 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h @@ -32,13 +32,17 @@ #ifndef RTASM_PPC_SPE_H #define RTASM_PPC_SPE_H -struct spe_function { - /** - * - */ - uint32_t *store; - uint32_t *csr; - const char *fn; +/** 4 bytes per instruction */ +#define SPE_INST_SIZE 4 + +/** number of general-purpose SIMD registers */ +#define SPE_NUM_REGS 128 + +struct spe_function +{ + uint32_t *store; /**< instruction buffer */ + uint32_t *csr; /**< next free pos in instruction buffer */ + const char *fn; /**< unused */ /** * Mask of used / unused registers @@ -50,7 +54,7 @@ struct spe_function { * spe_allocate_register, spe_allocate_available_register, * spe_release_register */ - uint64_t regs[2]; + uint64_t regs[SPE_NUM_REGS / 64]; }; extern void spe_init_func(struct spe_function *p, unsigned code_size); -- cgit v1.2.3 From 0e79e474de164a765b9759398c83b6bfa16a0012 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Fri, 5 Sep 2008 13:55:02 -0600 Subject: cell: comments, etc. --- .../drivers/cell/ppu/cell_state_per_fragment.c | 28 ++++++++++---- src/gallium/drivers/cell/ppu/cell_vertex_fetch.c | 5 +-- src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 44 +++++++++++++++------- 3 files changed, 52 insertions(+), 25 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c index 53ae3aa50e..705867107b 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c @@ -132,9 +132,9 @@ emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa, /** + * Generate code to perform Z testing. 
Four Z values are tested at once. * \param dsa Current depth-test state * \param f Function to which code should be appended - * \param m Mask of allocated / free SPE registers * \param mask Index of register to contain depth-pass mask * \param stored Index of register containing values from depth buffer * \param calculated Index of register containing per-fragment depth values @@ -198,6 +198,7 @@ emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa, /** + * Generate code to apply the stencil operation (after testing). * \note Emits a maximum of 5 instructions. * * \warning @@ -222,9 +223,13 @@ emit_stencil_op(struct spe_function *f, spe_il(f, result, ref); break; case PIPE_STENCIL_OP_INCR: + /* clamp = [0xff, 0xff, 0xff, 0xff] */ spe_il(f, clamp, 0x0ff); + /* result[i] = in[i] + 1 */ spe_ai(f, result, in, 1); + /* clamp_mask[i] = (result[i] > 0xff) */ spe_clgti(f, clamp_mask, result, 0x0ff); + /* result[i] = clamp_mask[i] ? clamp[i] : result[i] */ spe_selb(f, result, result, clamp, clamp_mask); break; case PIPE_STENCIL_OP_DECR: @@ -259,10 +264,10 @@ emit_stencil_op(struct spe_function *f, /** + * Generate code to do stencil test. Four pixels are tested at once. 
* \param dsa Depth / stencil test state * \param face 0 for front face, 1 for back face * \param f Function to append instructions to - * \param reg_mask Mask of allocated registers * \param mask Register containing mask of fragments passing the * alpha test * \param depth_mask Register containing mask of fragments passing the @@ -310,13 +315,14 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa, switch (dsa->stencil[face].func) { case PIPE_FUNC_NEVER: - spe_il(f, stencil_mask, 0); + spe_il(f, stencil_mask, 0); /* stencil_mask[0..3] = [0,0,0,0] */ break; case PIPE_FUNC_NOTEQUAL: complement = TRUE; /* FALLTHROUGH */ case PIPE_FUNC_EQUAL: + /* stencil_mask[i] = (stored[i] == ref) */ spe_ceqi(f, stencil_mask, stored, ref); break; @@ -324,6 +330,8 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa, complement = TRUE; /* FALLTHROUGH */ case PIPE_FUNC_GREATER: + complement = TRUE; + /* stencil_mask[i] = (stored[i] > ref) */ spe_clgti(f, stencil_mask, stored, ref); break; @@ -331,8 +339,11 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa, complement = TRUE; /* FALLTHROUGH */ case PIPE_FUNC_GEQUAL: + /* stencil_mask[i] = (stored[i] > ref) */ spe_clgti(f, stencil_mask, stored, ref); + /* tmp[i] = (stored[i] == ref) */ spe_ceqi(f, tmp, stored, ref); + /* stencil_mask[i] = stencil_mask[i] | tmp[i] */ spe_or(f, stencil_mask, stencil_mask, tmp); break; @@ -461,7 +472,7 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa) * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions. Round * up to 64 to make it a happy power-of-two. */ - spe_init_func(f, 4 * 64); + spe_init_func(f, SPE_INST_SIZE * 64); /* Allocate registers for the function's input parameters. 
Cleverly (and @@ -540,7 +551,7 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa) spe_selb(f, depth, depth, zvals, mask); } - spe_bi(f, 0, 0, 0); + spe_bi(f, 0, 0, 0); /* return from function call */ #if 0 @@ -956,7 +967,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb) * + 4 (fragment mask) + 1 (return) = 55 instlructions. Round up to 64 to * make it a happy power-of-two. */ - spe_init_func(f, 4 * 64); + spe_init_func(f, SPE_INST_SIZE * 64); const int frag[4] = { @@ -1144,7 +1155,8 @@ cell_generate_alpha_blend(struct cell_blend_state *cb) } -int PC_OFFSET(const struct spe_function *f, const void *d) +static int +PC_OFFSET(const struct spe_function *f, const void *d) { const intptr_t pc = (intptr_t) f->csr; const intptr_t ea = ~0x0f & (intptr_t) d; @@ -1178,7 +1190,7 @@ cell_generate_logic_op(struct spe_function *f, * bytes (equiv. to 8 instructions) are needed for data storage. Round up * to 64 to make it a happy power-of-two. */ - spe_init_func(f, 4 * 64); + spe_init_func(f, SPE_INST_SIZE * 64); /* Pixel colors in framebuffer format in AoS layout. diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c index 2ece0250f6..566df7f59e 100644 --- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c +++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c @@ -297,10 +297,9 @@ void cell_update_vertex_fetch(struct draw_context *draw) /* Each fetch function can be a maximum of 34 instructions (note: this is - * actually a slight over-estimate). That means (34 * 4) = 136 bytes - * each maximum. + * actually a slight over-estimate). */ - spe_init_func(p, 136 * unique_attr_formats); + spe_init_func(p, 34 * SPE_INST_SIZE * unique_attr_formats); /* Allocate registers for the function's input parameters. 
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c index c0a729b3d2..db88735226 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -35,8 +35,17 @@ #define ZERO 0x80 + +/** + * Get a "quad" of four fragment Z/stencil values from the given tile. + * \param tile the tile of Z/stencil values + * \param x, y location of the quad in the tile, in pixels + * \param depth_format format of the tile's data + * \param depth returns four depth values + * \param stencil returns four stencil values + */ static void -read_ds_quad(tile_t *buffer, unsigned x, unsigned y, +read_ds_quad(tile_t *tile, unsigned x, unsigned y, enum pipe_format depth_format, qword *depth, qword *stencil) { @@ -45,14 +54,13 @@ read_ds_quad(tile_t *buffer, unsigned x, unsigned y, switch (depth_format) { case PIPE_FORMAT_Z16_UNORM: { - qword *ptr = (qword *) &buffer->us8[iy][ix / 2]; + qword *ptr = (qword *) &tile->us8[iy][ix / 2]; const qword shuf_vec = (qword) { ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3, ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7 }; - /* At even X values we want the first 4 shorts, and at odd X values we * want the second 4 shorts. 
*/ @@ -65,18 +73,16 @@ read_ds_quad(tile_t *buffer, unsigned x, unsigned y, break; } - case PIPE_FORMAT_Z32_UNORM: { - qword *ptr = (qword *) &buffer->ui4[iy][ix]; + qword *ptr = (qword *) &tile->ui4[iy][ix]; *depth = *ptr; *stencil = si_il(0); break; } - case PIPE_FORMAT_Z24S8_UNORM: { - qword *ptr = (qword *) &buffer->ui4[iy][ix]; + qword *ptr = (qword *) &tile->ui4[iy][ix]; qword mask = si_fsmbi(0xEEEE); *depth = si_rotmai(si_and(*ptr, mask), -8); @@ -84,16 +90,14 @@ read_ds_quad(tile_t *buffer, unsigned x, unsigned y, break; } - case PIPE_FORMAT_S8Z24_UNORM: { - qword *ptr = (qword *) &buffer->ui4[iy][ix]; + qword *ptr = (qword *) &tile->ui4[iy][ix]; *depth = si_and(*ptr, si_fsmbi(0x7777)); *stencil = si_andi(si_roti(*ptr, 8), 0x0ff); break; } - default: ASSERT(0); break; @@ -101,6 +105,14 @@ read_ds_quad(tile_t *buffer, unsigned x, unsigned y, } +/** + * Put a quad of Z/stencil values into a tile. + * \param buffer the tile of Z/stencil values to write into + * \param x, y location of the quad in the tile, in pixels + * \param depth_format format of the tile's data + * \param depth depth values to store + * \param stencil stencil values to store + */ static void write_ds_quad(tile_t *buffer, unsigned x, unsigned y, enum pipe_format depth_format, @@ -124,14 +136,12 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y, break; } - case PIPE_FORMAT_Z32_UNORM: { qword *ptr = (qword *) &buffer->ui4[iy][ix]; *ptr = depth; break; } - case PIPE_FORMAT_Z24S8_UNORM: { qword *ptr = (qword *) &buffer->ui4[iy][ix]; qword mask = si_fsmbi(0xEEEE); @@ -141,7 +151,6 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y, break; } - case PIPE_FORMAT_S8Z24_UNORM: { qword *ptr = (qword *) &buffer->ui4[iy][ix]; qword mask = si_fsmbi(0x7777); @@ -151,7 +160,6 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y, break; } - default: ASSERT(0); break; @@ -159,6 +167,14 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y, } +/** + * Do depth/stencil/alpha test for a "quad" of 
4 fragments. + * \param x,y location of quad within tile + * \param frag_mask indicates which fragments are "alive" + * \param frag_depth four fragment depth values + * \param frag_alpha four fragment alpha values + * \param facing front/back facing for four fragments (1=front, 0=back) + */ qword spu_do_depth_stencil(int x, int y, qword frag_mask, qword frag_depth, qword frag_alpha, -- cgit v1.2.3 From cd9722dcddcb41af3196860280d23542dc673700 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 8 Sep 2008 11:50:13 -0600 Subject: cell: comments --- src/gallium/drivers/cell/spu/spu_tri.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index 2a4e0b423c..a3ea0a3e69 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -209,7 +209,7 @@ clip_emit_quad(struct setup_stage *setup) /** * Evaluate attribute coefficients (plane equations) to compute * attribute values for the four fragments in a quad. - * Eg: four colors will be compute. + * Eg: four colors will be computed (in AoS format). */ static INLINE void eval_coeff(uint slot, float x, float y, vector float result[4]) @@ -356,6 +356,7 @@ emit_quad( int x, int y, mask_t mask ) /* Convert fragment data from AoS to SoA format. + * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA) */ qword soa_frag[4]; _transpose_matrix4x4((vec_float4 *) soa_frag, colors); @@ -373,6 +374,7 @@ emit_quad( int x, int y, mask_t mask ) if (spu.read_fb) { /* Convert pixel data from AoS to SoA format. + * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA) */ vec_float4 aos_pix[4] = { spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]), @@ -393,6 +395,7 @@ emit_quad( int x, int y, mask_t mask ) /* Convert final pixel data from SoA to AoS format. + * I.e. 
(RRRR,GGGG,BBBB,AAAA) -> (RGBA,RGBA,RGBA,RGBA) */ result = (*spu.logicop)(pix[0], pix[1], pix[2], pix[3], result.r, result.g, result.b, result.a, -- cgit v1.2.3 From 04ae4fba3c0a656cf2747fc994b99f99576d0e2b Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 8 Sep 2008 11:53:14 -0600 Subject: cell: minor change to Z float/int conversion code (avoid switch) --- src/gallium/drivers/cell/spu/spu_main.c | 5 ++++ src/gallium/drivers/cell/spu/spu_main.h | 5 ++++ src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 34 +++++++++------------- 3 files changed, 23 insertions(+), 21 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index d223f32d94..c4236817a9 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -252,12 +252,17 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) switch (spu.fb.depth_format) { case PIPE_FORMAT_Z32_UNORM: + spu.fb.zsize = 4; + spu.fb.zscale = (float) 0xffffffffu; + break; case PIPE_FORMAT_Z24S8_UNORM: case PIPE_FORMAT_S8Z24_UNORM: spu.fb.zsize = 4; + spu.fb.zscale = (float) 0x00ffffffu; break; case PIPE_FORMAT_Z16_UNORM: spu.fb.zsize = 2; + spu.fb.zscale = (float) 0xffffu; break; default: spu.fb.zsize = 0; diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 4879f8c9c8..c2a53c9dcf 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -41,6 +41,10 @@ #define MAX_HEIGHT 1024 +/** + * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels. + * The data may be addressed through several different types. 
+ */ typedef union { ushort us[TILE_SIZE][TILE_SIZE]; uint ui[TILE_SIZE][TILE_SIZE]; @@ -99,6 +103,7 @@ struct spu_framebuffer { uint depth_clear_value; uint zsize; /**< 0, 2 or 4 bytes per Z */ + float zscale; /**< 65535.0, 2^24-1 or 2^32-1 */ } ALIGN16_ATTRIB; diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c index db88735226..29dc07a2e8 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -144,18 +144,22 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y, case PIPE_FORMAT_Z24S8_UNORM: { qword *ptr = (qword *) &buffer->ui4[iy][ix]; + /* form select mask = 1110,1110,1110,1110 */ qword mask = si_fsmbi(0xEEEE); - + /* depth[i] = depth[i] << 8 */ depth = si_shli(depth, 8); + /* *ptr[i] = depth[i][31:8] | stencil[i][7:0] */ *ptr = si_selb(stencil, depth, mask); break; } case PIPE_FORMAT_S8Z24_UNORM: { qword *ptr = (qword *) &buffer->ui4[iy][ix]; + /* form select mask = 0111,0111,0111,0111 */ qword mask = si_fsmbi(0x7777); - + /* stencil[i] = stencil[i] << 24 */ stencil = si_shli(stencil, 24); + /* *ptr[i] = stencil[i][31:24] | depth[i][23:0] */ *ptr = si_selb(stencil, depth, mask); break; } @@ -191,25 +195,13 @@ spu_do_depth_stencil(int x, int y, read_ds_quad(&spu.ztile, x, y, spu.fb.depth_format, &pixel_depth, &pixel_stencil); } - - switch (spu.fb.depth_format) { - case PIPE_FORMAT_Z16_UNORM: - frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x0000ffffu))); - frag_depth = si_cfltu(frag_depth, 0); - break; - case PIPE_FORMAT_Z32_UNORM: - frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0xffffffffu))); - frag_depth = si_cfltu(frag_depth, 0); - break; - case PIPE_FORMAT_Z24S8_UNORM: - case PIPE_FORMAT_S8Z24_UNORM: - frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x00ffffffu))); - frag_depth = si_cfltu(frag_depth, 0); - break; - default: - ASSERT(0); - break; - } + + /* convert floating point Z values to 
32-bit uint */ + + /* frag_depth *= spu.fb.zscale */ + frag_depth = si_fm(frag_depth, (qword)spu_splats(spu.fb.zscale)); + /* frag_depth = uint(frag_depth) */ + frag_depth = si_cfltu(frag_depth, 0); result = (*spu.frag_test)(frag_mask, pixel_depth, pixel_stencil, frag_depth, frag_alpha, facing); -- cgit v1.2.3 From ee582fd3a7a9ddbcb5595249201cf213a6c6f014 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 10 Sep 2008 17:11:48 -0600 Subject: gallium: assorted additions and fixes to Cell SPE rtasm code Fix incorrect opcode for fsmbi. Added "macro" functions for loading floats/ints, register complement, zero, move. Added #defines for return address and stack pointer registers. Added assertions to check that the instruction buffer doesn't overflow. --- src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 88 +++++++++++++++++++++++------ src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 38 +++++++++++-- 2 files changed, 105 insertions(+), 21 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c index fe5beba456..61010e4333 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c @@ -151,8 +151,8 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rB = rB; inst.inst.rA = rA; inst.inst.rT = rT; - *p->csr = inst.bits; - p->csr++; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); } @@ -165,8 +165,8 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rB = rB; inst.inst.rA = rA; inst.inst.rC = rC; - *p->csr = inst.bits; - p->csr++; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); } @@ -178,8 +178,8 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT, inst.inst.i7 = imm; inst.inst.rA = rA; inst.inst.rT = rT; - *p->csr = inst.bits; - p->csr++; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= 
p->max_inst); } @@ -192,8 +192,8 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT, inst.inst.i8 = imm; inst.inst.rA = rA; inst.inst.rT = rT; - *p->csr = inst.bits; - p->csr++; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); } @@ -206,8 +206,8 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT, inst.inst.i10 = imm; inst.inst.rA = rA; inst.inst.rT = rT; - *p->csr = inst.bits; - p->csr++; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); } @@ -218,8 +218,8 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT, inst.inst.op = op; inst.inst.i16 = imm; inst.inst.rT = rT; - *p->csr = inst.bits; - p->csr++; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); } @@ -230,8 +230,8 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT, inst.inst.op = op; inst.inst.i18 = imm; inst.inst.rT = rT; - *p->csr = inst.bits; - p->csr++; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); } @@ -307,8 +307,9 @@ void _name (struct spe_function *p, int imm) \ void spe_init_func(struct spe_function *p, unsigned code_size) { p->store = align_malloc(code_size, 16); - p->csr = p->store; - + p->num_inst = 0; + p->max_inst = code_size / SPE_INST_SIZE; + /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile. 
*/ p->regs[0] = ~7; @@ -318,11 +319,11 @@ void spe_init_func(struct spe_function *p, unsigned code_size) void spe_release_func(struct spe_function *p) { + assert(p->num_inst <= p->max_inst); if (p->store != NULL) { align_free(p->store); } p->store = NULL; - p->csr = NULL; } @@ -337,6 +338,7 @@ int spe_allocate_available_register(struct spe_function *p) const uint64_t mask = (1ULL << (i % 64)); const unsigned idx = i / 64; + assert(idx < 2); if ((p->regs[idx] & mask) != 0) { p->regs[idx] &= ~mask; return i; @@ -371,6 +373,8 @@ void spe_release_register(struct spe_function *p, int reg) const unsigned idx = reg / 64; const unsigned bit = reg % 64; + assert(idx < 2); + assert(reg < SPE_NUM_REGS); assert((p->regs[idx] & (1ULL << bit)) == 0); @@ -458,4 +462,54 @@ EMIT_R (spe_mfspr, 0x00c); EMIT_R (spe_mtspr, 0x10c); #endif + +/** + ** Helper / "macro" instructions. + ** Use somewhat verbose names as a reminder that these aren't native + ** SPE instructions. + **/ + + +void +spe_load_float(struct spe_function *p, unsigned rT, float x) +{ + union { + float f; + unsigned u; + } bits; + bits.f = x; + spe_ilhu(p, rT, bits.u >> 16); + spe_iohl(p, rT, bits.u & 0xffff); +} + + +void +spe_load_int(struct spe_function *p, unsigned rT, int i) +{ + spe_ilhu(p, rT, i >> 16); + spe_iohl(p, rT, i & 0xffff); +} + + +void +spe_complement(struct spe_function *p, unsigned rT) +{ + spe_nor(p, rT, rT, rT); +} + + +void +spe_move(struct spe_function *p, unsigned rT, unsigned rA) +{ + spe_ori(p, rT, rA, 0); +} + + +void +spe_zero(struct spe_function *p, unsigned rT) +{ + spe_xor(p, rT, rT, rT); +} + + #endif /* GALLIUM_CELL */ diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h index 7dd754ba77..dee8c55c4a 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h @@ -25,6 +25,7 @@ /** * \file * Real-time assembly generation interface for Cell B.E. SPEs. 
+ * For details, see /opt/cell/sdk/docs/arch/SPU_ISA_v1.2_27Jan2007_pub.pdf * * \author Ian Romanick */ @@ -38,11 +39,18 @@ /** number of general-purpose SIMD registers */ #define SPE_NUM_REGS 128 +/** Return Address register */ +#define SPE_REG_RA 0 + +/** Stack Pointer register */ +#define SPE_REG_SP 1 + + struct spe_function { - uint32_t *store; /**< instruction buffer */ - uint32_t *csr; /**< next free pos in instruction buffer */ - const char *fn; /**< unused */ + uint32_t *store; /**< instruction buffer */ + uint num_inst; + uint max_inst; /** * Mask of used / unused registers @@ -123,7 +131,8 @@ EMIT_RI16(spe_ilhu, 0x082); EMIT_RI16(spe_il, 0x081); EMIT_RI18(spe_ila, 0x021); EMIT_RI16(spe_iohl, 0x0c1); -EMIT_RI16(spe_fsmbi, 0x0c5); +EMIT_RI16(spe_fsmbi, 0x065); + /* Integer and logical instructions @@ -275,6 +284,27 @@ extern void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e); +/** Load/splat immediate float into rT. */ +extern void +spe_load_float(struct spe_function *p, unsigned rT, float x); + +/** Load/splat immediate int into rT. */ +extern void +spe_load_int(struct spe_function *p, unsigned rT, int i); + +/** Complement/invert all bits in rT. */ +extern void +spe_complement(struct spe_function *p, unsigned rT); + +/** rT = rA. */ +extern void +spe_move(struct spe_function *p, unsigned rT, unsigned rA); + +/** rT = {0,0,0,0}. */ +extern void +spe_zero(struct spe_function *p, unsigned rT); + + /* Floating-point instructions */ EMIT_RR (spe_fa, 0x2c4); -- cgit v1.2.3 From 284ab5a6127f8b452acaa0e10ac1d9ebc87fac3e Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 10 Sep 2008 18:22:00 -0600 Subject: cell: checkpoint commit of new per-fragment processing Do code generation for alpha test, z test, stencil, blend, colormask and framebuffer/tile read/write as a single code block. Ian's previous blend/z/stencil test code is still there but mostly disabled and will be removed soon. 
--- src/gallium/drivers/cell/common.h | 20 +- src/gallium/drivers/cell/ppu/Makefile | 1 + src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 530 +++++++++++++++++++++ src/gallium/drivers/cell/ppu/cell_gen_fragment.h | 38 ++ src/gallium/drivers/cell/ppu/cell_state_emit.c | 31 +- .../drivers/cell/ppu/cell_state_per_fragment.c | 2 +- src/gallium/drivers/cell/spu/Makefile | 2 +- src/gallium/drivers/cell/spu/spu_main.c | 53 ++- src/gallium/drivers/cell/spu/spu_main.h | 23 + src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 231 ++++++++- src/gallium/drivers/cell/spu/spu_per_fragment_op.h | 11 + src/gallium/drivers/cell/spu/spu_tri.c | 30 ++ src/gallium/winsys/xlib/xm_api.c | 7 +- src/gallium/winsys/xlib/xm_winsys.c | 35 ++ 14 files changed, 998 insertions(+), 16 deletions(-) create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fragment.c create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fragment.h (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index c0ca201e1d..a62530c64d 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -97,6 +97,7 @@ #define CELL_CMD_STATE_LOGICOP 21 #define CELL_CMD_VS_EXECUTE 22 #define CELL_CMD_FLUSH_BUFFER_RANGE 23 +#define CELL_CMD_STATE_FRAGMENT_OPS 24 #define CELL_NUM_BUFFERS 4 @@ -112,30 +113,43 @@ /** */ -struct cell_command_depth_stencil_alpha_test { +struct cell_command_depth_stencil_alpha_test +{ uint64_t base; /**< Effective address of code start. */ unsigned size; /**< Size in bytes of SPE code. */ unsigned read_depth; /**< Flag: should depth be read? */ unsigned read_stencil; /**< Flag: should stencil be read? */ + struct pipe_depth_stencil_alpha_state state; }; /** * Upload code to perform framebuffer blend operation */ -struct cell_command_blend { +struct cell_command_blend +{ uint64_t base; /**< Effective address of code start. */ unsigned size; /**< Size in bytes of SPE code. 
*/ unsigned read_fb; /**< Flag: should framebuffer be read? */ }; -struct cell_command_logicop { +struct cell_command_logicop +{ uint64_t base; /**< Effective address of code start. */ unsigned size; /**< Size in bytes of SPE code. */ }; +#define SPU_MAX_FRAGMENT_OPS_INSTS 64 + +struct cell_command_fragment_ops +{ + uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_OPS */ + unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS]; +}; + + /** * Tell SPUs about the framebuffer size, location */ diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile index 25473e200c..b5a6fcb8de 100644 --- a/src/gallium/drivers/cell/ppu/Makefile +++ b/src/gallium/drivers/cell/ppu/Makefile @@ -25,6 +25,7 @@ SOURCES = \ cell_context.c \ cell_draw_arrays.c \ cell_flush.c \ + cell_gen_fragment.c \ cell_state_derived.c \ cell_state_emit.c \ cell_state_per_fragment.c \ diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c new file mode 100644 index 0000000000..df29476be6 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -0,0 +1,530 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +/** + * Generate SPU per-fragment code (actually per-quad code). + * \author Brian Paul + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "rtasm/rtasm_ppc_spe.h" +#include "cell_context.h" +#include "cell_gen_fragment.h" + + + +/** Do extra optimizations? */ +#define OPTIMIZATIONS 1 + + +/** + * Generate SPE code to perform Z/depth testing. + * + * \param dsa Gallium depth/stencil/alpha state to gen code for + * \param f SPE function to append instruction onto. 
+ * \param mask_reg register containing quad/pixel "alive" mask (in/out) + * \param ifragZ_reg register containing integer fragment Z values (in) + * \param ifbZ_reg register containing integer frame buffer Z values (in/out) + * \param zmask_reg register containing result of Z test/comparison (out) + */ +static void +gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa, + struct spe_function *f, + int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg) +{ + ASSERT(dsa->depth.enabled); + + switch (dsa->depth.func) { + case PIPE_FUNC_EQUAL: + /* zmask = (ifragZ == ref) */ + spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg); + /* mask = (mask & zmask) */ + spe_and(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_NOTEQUAL: + /* zmask = (ifragZ == ref) */ + spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg); + /* mask = (mask & ~zmask) */ + spe_andc(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_GREATER: + /* zmask = (ifragZ > ref) */ + spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); + /* mask = (mask & zmask) */ + spe_and(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_LESS: + /* zmask = (ref > ifragZ) */ + spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); + /* mask = (mask & zmask) */ + spe_and(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_LEQUAL: + /* zmask = (ifragZ > ref) */ + spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); + /* mask = (mask & ~zmask) */ + spe_andc(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_GEQUAL: + /* zmask = (ref > ifragZ) */ + spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); + /* mask = (mask & ~zmask) */ + spe_andc(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_NEVER: + spe_il(f, mask_reg, 0); /* mask = {0,0,0,0} */ + spe_move(f, zmask_reg, mask_reg); /* zmask = mask */ + break; + + case PIPE_FUNC_ALWAYS: + /* mask unchanged */ + spe_il(f, zmask_reg, ~0); /* zmask = {~0,~0,~0,~0} */ + break; + + default: + ASSERT(0); + break; + } + + if (dsa->depth.writemask) { 
+ /* + * If (ztest passed) { + * framebufferZ = fragmentZ; + * } + * OR, + * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ); + */ + spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg); + } +} + + +/** + * Generate SPE code to perform alpha testing. + * + * \param dsa Gallium depth/stencil/alpha state to gen code for + * \param f SPE function to append instruction onto. + * \param mask_reg register containing quad/pixel "alive" mask (in/out) + * \param fragA_reg register containing four fragment alpha values (in) + */ +static void +gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, + struct spe_function *f, int mask_reg, int fragA_reg) +{ + int ref_reg = spe_allocate_available_register(f); + int amask_reg = spe_allocate_available_register(f); + + ASSERT(dsa->alpha.enabled); + + if ((dsa->alpha.func != PIPE_FUNC_NEVER) && + (dsa->alpha.func != PIPE_FUNC_ALWAYS)) { + /* load/splat the alpha reference float value */ + spe_load_float(f, ref_reg, dsa->alpha.ref); + } + + /* emit code to do the alpha comparison, updating 'mask' */ + switch (dsa->alpha.func) { + case PIPE_FUNC_EQUAL: + /* amask = (fragA == ref) */ + spe_fceq(f, amask_reg, fragA_reg, ref_reg); + /* mask = (mask & amask) */ + spe_and(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_NOTEQUAL: + /* amask = (fragA == ref) */ + spe_fceq(f, amask_reg, fragA_reg, ref_reg); + /* mask = (mask & ~amask) */ + spe_andc(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_GREATER: + /* amask = (fragA > ref) */ + spe_fcgt(f, amask_reg, fragA_reg, ref_reg); + /* mask = (mask & amask) */ + spe_and(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_LESS: + /* amask = (ref > fragA) */ + spe_fcgt(f, amask_reg, ref_reg, fragA_reg); + /* mask = (mask & amask) */ + spe_and(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_LEQUAL: + /* amask = (fragA > ref) */ + spe_fcgt(f, amask_reg, fragA_reg, ref_reg); + /* mask = (mask & ~amask) */ + spe_andc(f, 
mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_GEQUAL: + /* amask = (ref > fragA) */ + spe_fcgt(f, amask_reg, ref_reg, fragA_reg); + /* mask = (mask & ~amask) */ + spe_andc(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_NEVER: + spe_il(f, mask_reg, 0); /* mask = [0,0,0,0] */ + break; + + case PIPE_FUNC_ALWAYS: + /* no-op, mask unchanged */ + break; + + default: + ASSERT(0); + break; + } + +#if OPTIMIZATIONS + /* if mask == {0,0,0,0} we're all done, return */ + { + /* re-use amask reg here */ + int tmp_reg = amask_reg; + /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */ + spe_orx(f, tmp_reg, mask_reg); + /* if tmp[0] == 0 then return from function call */ + spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0); + } +#endif + + spe_release_register(f, ref_reg); + spe_release_register(f, amask_reg); +} + + + +/** + * Generate SPE code to implement the fragment operations (alpha test, + * depth test, stencil test, blending, colormask, and final + * framebuffer write) as specified by the current context state. + * + * Logically, this code will be called after running the fragment + * shader. But under some circumstances we could run some of this + * code before the fragment shader to cull fragments/quads that are + * totally occluded/discarded. + * + * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now. + * + * See the spu_default_fragment_ops() function to see how the per-fragment + * operations would be done with ordinary C code. + * The code we generate here though has no branches, is SIMD, etc and + * should be much faster. + * + * \param cell the rendering context (in) + * \param f the generated function (out) + */ +void +gen_fragment_function(struct cell_context *cell, struct spe_function *f) +{ + const struct pipe_depth_stencil_alpha_state *dsa = + &cell->depth_stencil->base; + const struct pipe_blend_state *blend = &cell->blend->base; + + /* For SPE function calls: reg $3 = first param, $4 = second param, etc. 
*/ + const int x_reg = 3; /* uint */ + const int y_reg = 4; /* uint */ + const int color_tile_reg = 5; /* tile_t * */ + const int depth_tile_reg = 6; /* tile_t * */ + const int fragZ_reg = 7; /* vector float */ + const int fragR_reg = 8; /* vector float */ + const int fragG_reg = 9; /* vector float */ + const int fragB_reg = 10; /* vector float */ + const int fragA_reg = 11; /* vector float */ + const int mask_reg = 12; /* vector uint */ + + /* offset of quad from start of tile + * XXX assuming 4-byte pixels for color AND Z/stencil!!!! + */ + int quad_offset_reg; + + int fbRGBA_reg; /**< framebuffer's RGBA colors for quad */ + int fbZS_reg; /**< framebuffer's combined z/stencil values for quad */ + + spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); + spe_allocate_register(f, x_reg); + spe_allocate_register(f, y_reg); + spe_allocate_register(f, color_tile_reg); + spe_allocate_register(f, depth_tile_reg); + spe_allocate_register(f, fragZ_reg); + spe_allocate_register(f, fragR_reg); + spe_allocate_register(f, fragG_reg); + spe_allocate_register(f, fragB_reg); + spe_allocate_register(f, fragA_reg); + spe_allocate_register(f, mask_reg); + + quad_offset_reg = spe_allocate_available_register(f); + fbRGBA_reg = spe_allocate_available_register(f); + fbZS_reg = spe_allocate_available_register(f); + + /* compute offset of quad from start of tile, in bytes */ + { + int x2_reg = spe_allocate_available_register(f); + int y2_reg = spe_allocate_available_register(f); + + ASSERT(TILE_SIZE == 32); + + spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */ + spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */ + spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */ + spe_a(f, quad_offset_reg, y2_reg, x2_reg); /* offset = y2 + x2 */ + spe_shli(f, quad_offset_reg, quad_offset_reg, 4); /* offset *= 16 */ + + spe_release_register(f, x2_reg); + spe_release_register(f, y2_reg); + } + + + if (dsa->alpha.enabled) { + gen_alpha_test(dsa, f, mask_reg, fragA_reg); + } + + if (dsa->depth.enabled 
|| dsa->stencil[0].enabled) { + const enum pipe_format zs_format = cell->framebuffer.zsbuf->format; + boolean write_depth_stencil; + + int fbZ_reg = spe_allocate_available_register(f); /* Z values */ + int fbS_reg = spe_allocate_available_register(f); /* Stencil values */ + + /* fetch quad of depth/stencil values from tile at (x,y) */ + /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ + spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + + if (dsa->depth.enabled) { + /* Extract Z bits from fbZS_reg into fbZ_reg */ + if (zs_format == PIPE_FORMAT_S8Z24_UNORM || + zs_format == PIPE_FORMAT_X8Z24_UNORM) { + int mask_reg = spe_allocate_available_register(f); + spe_fsmbi(f, mask_reg, 0x7777); /* mask[0,1,2,3] = 0x00ffffff */ + spe_and(f, fbZ_reg, fbZS_reg, mask_reg); /* fbZ = fbZS & mask */ + spe_release_register(f, mask_reg); + /* OK, fbZ_reg has four 24-bit Z values now */ + } + else { + /* XXX handle other z/stencil formats */ + ASSERT(0); + } + + /* Convert fragZ values from float[4] to uint[4] */ + if (zs_format == PIPE_FORMAT_S8Z24_UNORM || + zs_format == PIPE_FORMAT_X8Z24_UNORM || + zs_format == PIPE_FORMAT_Z24S8_UNORM || + zs_format == PIPE_FORMAT_Z24X8_UNORM) { + /* 24-bit Z values */ + int scale_reg = spe_allocate_available_register(f); + + /* scale_reg[0,1,2,3] = float(2^24-1) */ + spe_load_float(f, scale_reg, (float) 0xffffff); + + /* XXX these two instructions might be combined */ + spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 0); /* fragZ = (int) fragZ */ + + spe_release_register(f, scale_reg); + } + else { + /* XXX handle 16-bit Z format */ + ASSERT(0); + } + } + + if (dsa->stencil[0].enabled) { + /* Extract Stencil bits from fbZS_reg into fbS_reg */ + if (zs_format == PIPE_FORMAT_S8Z24_UNORM || + zs_format == PIPE_FORMAT_X8Z24_UNORM) { + /* XXX extract with a shift */ + ASSERT(0); + } + else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || + zs_format == PIPE_FORMAT_Z24X8_UNORM) { + /* XXX 
extract with a mask */ + ASSERT(0); + } + } + + + if (dsa->stencil[0].enabled) { + /* XXX this may involve depth testing too */ + // gen_stencil_test(dsa, f, ... ); + ASSERT(0); + } + else if (dsa->depth.enabled) { + int zmask_reg = spe_allocate_available_register(f); + gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg); + spe_release_register(f, zmask_reg); + } + + /* do we need to write Z and/or Stencil back into framebuffer? */ + write_depth_stencil = (dsa->depth.writemask | + dsa->stencil[0].write_mask | + dsa->stencil[1].write_mask); + + if (write_depth_stencil) { + /* Merge latest Z and Stencil values into fbZS_reg. + * fbZ_reg has four Z vals in bits [23..0] or bits [15..0]. + * fbS_reg has four 8-bit stencil values in bits [7..0]. + */ + if (zs_format == PIPE_FORMAT_S8Z24_UNORM || + zs_format == PIPE_FORMAT_X8Z24_UNORM) { + spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ + } + else if (zs_format == PIPE_FORMAT_S8Z24_UNORM || + zs_format == PIPE_FORMAT_X8Z24_UNORM) { + /* XXX to do */ + ASSERT(0); + } + else if (zs_format == PIPE_FORMAT_Z16_UNORM) { + /* XXX to do */ + ASSERT(0); + } + else if (zs_format == PIPE_FORMAT_S8_UNORM) { + /* XXX to do */ + ASSERT(0); + } + else { + /* bad zs_format */ + ASSERT(0); + } + + /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */ + spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + } + + spe_release_register(f, fbZ_reg); + spe_release_register(f, fbS_reg); + } + + + /* Get framebuffer quad/colors. We'll need these for blending, + * color masking, and to obey the quad/pixel mask. + * Load: fbRGBA_reg = memory[color_tile + quad_offset] + * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking + * we could skip this load. 
+ */ + spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg); + + + if (blend->blend_enable) { + /* convert packed tile colors in fbRGBA_reg to float[4] vectors */ + + // gen_blend_code(blend, f, mask_reg, ... ); + + } + + + + /* + * Write fragment colors to framebuffer/tile. + * This involves converting the fragment colors from float[4] to the + * tile's specific format and obeying the quad/pixel mask. + */ + { + const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format; + int rgba_reg = spe_allocate_available_register(f); + + /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */ + spe_cfltu(f, fragR_reg, fragR_reg, 32); + spe_cfltu(f, fragG_reg, fragG_reg, 32); + spe_cfltu(f, fragB_reg, fragB_reg, 32); + spe_cfltu(f, fragA_reg, fragA_reg, 32); + + /* Shift the most significant bytes to the least significant positions. + * I.e.: reg = reg >> 24 + */ + spe_rotmi(f, fragR_reg, fragR_reg, -24); + spe_rotmi(f, fragG_reg, fragG_reg, -24); + spe_rotmi(f, fragB_reg, fragB_reg, -24); + spe_rotmi(f, fragA_reg, fragA_reg, -24); + + /* Shift the color bytes according to the surface format */ + if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) { + spe_roti(f, fragG_reg, fragG_reg, 8); /* green <<= 8 */ + spe_roti(f, fragR_reg, fragR_reg, 16); /* red <<= 16 */ + spe_roti(f, fragA_reg, fragA_reg, 24); /* alpha <<= 24 */ + } + else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) { + spe_roti(f, fragR_reg, fragR_reg, 8); /* red <<= 8 */ + spe_roti(f, fragG_reg, fragG_reg, 16); /* green <<= 16 */ + spe_roti(f, fragB_reg, fragB_reg, 24); /* blue <<= 24 */ + } + else { + ASSERT(0); + } + + /* Merge red, green, blue, alpha registers to make packed RGBA colors. 
+ * Eg: after shifting according to color_format we might have: + * R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000} + * G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600} + * B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099} + * A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000} + * OR-ing all those together gives us four packed colors: + * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699} + */ + spe_or(f, rgba_reg, fragR_reg, fragG_reg); + spe_or(f, rgba_reg, rgba_reg, fragB_reg); + spe_or(f, rgba_reg, rgba_reg, fragA_reg); + + /* Mix fragment colors with framebuffer colors using the quad/pixel mask: + * if (mask[i]) + * rgba[i] = rgba[i]; + * else + * rgba[i] = framebuffer[i]; + */ + spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg); + + /* Store updated quad in tile: + * memory[color_tile + quad_offset] = rgba_reg; + */ + spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg); + + spe_release_register(f, rgba_reg); + } + + printf("gen_fragment_ops nr instructions: %u\n", f->num_inst); + + spe_bi(f, SPE_REG_RA, 0, 0); /* return from function call */ + + + spe_release_register(f, fbRGBA_reg); + spe_release_register(f, fbZS_reg); + spe_release_register(f, quad_offset_reg); +} + diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h new file mode 100644 index 0000000000..0ea0fc690c --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h @@ -0,0 +1,38 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_GEN_FRAGMENT_H +#define CELL_GEN_FRAGMENT_H + + +extern void +gen_fragment_function(struct cell_context *cell, struct spe_function *f); + + +#endif /* CELL_GEN_FRAGMENT_H */ + diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index f2feaa329a..06777aac14 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -27,6 +27,7 @@ #include "util/u_memory.h" #include "cell_context.h" +#include "cell_gen_fragment.h" #include "cell_state.h" #include "cell_state_emit.h" #include "cell_state_per_fragment.h" @@ -83,6 +84,29 @@ cell_emit_state(struct cell_context *cell) fb->depth_format = zbuf ? 
zbuf->format : PIPE_FORMAT_NONE; fb->width = cell->framebuffer.width; fb->height = cell->framebuffer.height; +#if 0 + printf("EMIT color format %s\n", pf_name(fb->color_format)); + printf("EMIT depth format %s\n", pf_name(fb->depth_format)); +#endif + } + + + if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_DEPTH_STENCIL)) { + /* XXX we don't want to always do codegen here. We should have + * a hash/lookup table to cache previous results... + */ + struct cell_command_fragment_ops *fops + = cell_batch_alloc(cell, sizeof(*fops)); + struct spe_function spe_code; + + /* generate new code */ + gen_fragment_function(cell, &spe_code); + /* put the new code into the batch buffer */ + fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS; + memcpy(&fops->code, spe_code.store, + SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); + /* free codegen buffer */ + spe_release_func(&spe_code); } if (cell->dirty & CELL_NEW_BLEND) { @@ -90,8 +114,7 @@ cell_emit_state(struct cell_context *cell) if (cell->blend != NULL) { blend.base = (intptr_t) cell->blend->code.store; - blend.size = (char *) cell->blend->code.csr - - (char *) cell->blend->code.store; + blend.size = cell->blend->code.num_inst * SPE_INST_SIZE; blend.read_fb = TRUE; } else { @@ -108,10 +131,10 @@ cell_emit_state(struct cell_context *cell) if (cell->depth_stencil != NULL) { dsat.base = (intptr_t) cell->depth_stencil->code.store; - dsat.size = (char *) cell->depth_stencil->code.csr - - (char *) cell->depth_stencil->code.store; + dsat.size = cell->depth_stencil->code.num_inst * SPE_INST_SIZE; dsat.read_depth = TRUE; dsat.read_stencil = FALSE; + dsat.state = cell->depth_stencil->base; } else { dsat.base = 0; diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c index 705867107b..78cb446c14 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c @@ -1158,7 +1158,7 @@ 
cell_generate_alpha_blend(struct cell_blend_state *cb) static int PC_OFFSET(const struct spe_function *f, const void *d) { - const intptr_t pc = (intptr_t) f->csr; + const intptr_t pc = (intptr_t) &f->store[f->num_inst]; const intptr_t ea = ~0x0f & (intptr_t) d; return (ea - pc) >> 2; diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile index d49abb2e82..e285ae9fdb 100644 --- a/src/gallium/drivers/cell/spu/Makefile +++ b/src/gallium/drivers/cell/spu/Makefile @@ -43,7 +43,7 @@ INCLUDE_DIRS = \ $(SPU_CC) $(SPU_CFLAGS) -c $< .c.s: - $(SPU_CC) $(SPU_CFLAGS) -S $< + $(SPU_CC) $(SPU_CFLAGS) -O3 -S $< # The .a file will be linked into the main/PPU executable diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index c4236817a9..4e0ec15925 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -34,6 +34,7 @@ #include "spu_main.h" #include "spu_render.h" +#include "spu_per_fragment_op.h" #include "spu_texture.h" #include "spu_tile.h" //#include "spu_test.h" @@ -46,7 +47,7 @@ /* helpful headers: /usr/lib/gcc/spu/4.1.1/include/spu_mfcio.h -/opt/ibm/cell-sdk/prototype/sysroot/usr/include/libmisc.h +/opt/cell/sdk/usr/include/libmisc.h */ boolean Debug = FALSE; @@ -226,6 +227,24 @@ cmd_release_verts(const struct cell_command_release_verts *release) } +/** + * Process a CELL_CMD_STATE_FRAGMENT_OPS command. + * This involves installing new fragment ops SPU code. + * If this function is never called, we'll use a regular C fallback function + * for fragment processing. 
+ */ +static void +cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) +{ + if (Debug) + printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id); + /* Copy SPU code from batch buffer to spu buffer */ + memcpy(spu.fragment_ops.code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); + /* Point function pointer at new code */ + spu.fragment_ops.func = (spu_fragment_ops_func) spu.fragment_ops.code; +} + + static void cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) { @@ -257,6 +276,8 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) break; case PIPE_FORMAT_Z24S8_UNORM: case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: spu.fb.zsize = 4; spu.fb.zscale = (float) 0x00ffffffu; break; @@ -282,6 +303,8 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) } +#define NEW_FRAGMENT_FUNCTION 01 + static void cmd_state_blend(const struct cell_command_blend *state) { @@ -302,7 +325,9 @@ cmd_state_blend(const struct cell_command_blend *state) wait_on_mask(1 << TAG_BATCH_BUFFER); spu.blend = (blend_func) fb_blend_code_buffer; spu.read_fb = state->read_fb; - } else { + } + else + { spu.read_fb = FALSE; } } @@ -326,7 +351,9 @@ cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *stat 0, /* tid */ 0 /* rid */); wait_on_mask(1 << TAG_BATCH_BUFFER); - } else { + } + else + { /* If there is no code, emit a return instruction. 
*/ depth_stencil_code_buffer[0] = 0x35; @@ -338,12 +365,14 @@ cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *stat spu.frag_test = (frag_test_func) depth_stencil_code_buffer; spu.read_depth = state->read_depth; spu.read_stencil = state->read_stencil; + spu.depth_stencil_alpha = state->state; } static void cmd_state_logicop(const struct cell_command_logicop * code) { +#if !NEW_FRAGMENT_FUNCTION mfc_get(logicop_code_buffer, (unsigned int) code->base, /* src */ code->size, @@ -353,6 +382,7 @@ cmd_state_logicop(const struct cell_command_logicop * code) wait_on_mask(1 << TAG_BATCH_BUFFER); spu.logicop = (logicop_func) logicop_code_buffer; +#endif } @@ -455,7 +485,9 @@ cmd_finish(void) /** - * Execute a batch of commands + * Execute a batch of commands which was sent to us by the PPU. + * See the cell_emit_state.c code to see where the commands come from. + * * The opcode param encodes the location of the buffer and its size. */ static void @@ -519,6 +551,14 @@ cmd_batch(uint opcode) pos += pos_incr; } break; + case CELL_CMD_STATE_FRAGMENT_OPS: + { + struct cell_command_fragment_ops *fops + = (struct cell_command_fragment_ops *) &buffer[pos]; + cmd_state_fragment_ops(fops); + pos += sizeof(*fops) / 8; + } + break; case CELL_CMD_RELEASE_VERTS: { struct cell_command_release_verts *release @@ -680,6 +720,11 @@ one_time_init(void) memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status)); memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status)); invalidate_tex_cache(); + + /* Install default/fallback fragment processing function. + * This will normally be overriden by a code-gen'd function. 
+ */ + spu.fragment_ops.func = spu_fallback_fragment_ops; } diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index c2a53c9dcf..7ab34f5222 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -91,6 +91,24 @@ typedef struct spu_blend_results (*logicop_func)( typedef vector float (*sample_texture_func)(uint unit, vector float texcoord); + +typedef void (*spu_fragment_ops_func)(uint x, uint y, + tile_t *colorTile, + tile_t *depthStencilTile, + vector float fragZ, + vector float fragRed, + vector float fragGreen, + vector float fragBlue, + vector float fragAlpha, + vector unsigned int mask); + +struct spu_fragment_ops +{ + uint code[SPU_MAX_FRAGMENT_OPS_INSTS]; + spu_fragment_ops_func func; /**< Current fragment ops function */ +} ALIGN16_ATTRIB; + + struct spu_framebuffer { void *color_start; /**< addr of color surface in main memory */ void *depth_start; /**< addr of depth surface in main memory */ @@ -127,6 +145,9 @@ struct spu_global struct cell_init_info init; struct spu_framebuffer fb; + + struct pipe_depth_stencil_alpha_state depth_stencil_alpha; + boolean read_depth; boolean read_stencil; frag_test_func frag_test; /**< Current depth/stencil test code */ @@ -142,6 +163,8 @@ struct spu_global struct vertex_info vertex_info; + struct spu_fragment_ops fragment_ops; + /* XXX more state to come */ diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c index 29dc07a2e8..ffc596aa62 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -29,8 +29,11 @@ * \author Ian Romanick */ + +#include #include "pipe/p_format.h" #include "spu_main.h" +#include "spu_colorpack.h" #include "spu_per_fragment_op.h" #define ZERO 0x80 @@ -90,7 +93,8 @@ read_ds_quad(tile_t *tile, unsigned x, unsigned y, break; } - case PIPE_FORMAT_S8Z24_UNORM: { + case 
PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: { qword *ptr = (qword *) &tile->ui4[iy][ix]; *depth = si_and(*ptr, si_fsmbi(0x7777)); @@ -153,7 +157,8 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y, break; } - case PIPE_FORMAT_S8Z24_UNORM: { + case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: { qword *ptr = (qword *) &buffer->ui4[iy][ix]; /* form select mask = 0111,0111,0111,0111 */ qword mask = si_fsmbi(0x7777); @@ -217,3 +222,225 @@ spu_do_depth_stencil(int x, int y, return result.mask; } + + + + +/** + * Called by rasterizer for each quad after the shader has run. This + * is a fallback/debug function. In reality we'll use a generated + * function produced by the PPU. But this function is useful for + * debug/validation. + */ +void +spu_fallback_fragment_ops(uint x, uint y, + tile_t *colorTile, + tile_t *depthStencilTile, + vector float fragZ, + vector float fragRed, + vector float fragGreen, + vector float fragBlue, + vector float fragAlpha, + vector unsigned int mask) +{ + vector float frag_soa[4], frag_aos[4]; + unsigned int c0, c1, c2, c3; + + /* do alpha test */ + if (spu.depth_stencil_alpha.alpha.enabled) { + vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref); + vector unsigned int amask; + + switch (spu.depth_stencil_alpha.alpha.func) { + case PIPE_FUNC_LESS: + amask = spu_cmpgt(ref, fragAlpha); /* mask = (fragAlpha < ref) */ + break; + case PIPE_FUNC_GREATER: + amask = spu_cmpgt(fragAlpha, ref); /* mask = (fragAlpha > ref) */ + break; + case PIPE_FUNC_GEQUAL: + amask = spu_cmpgt(ref, fragAlpha); + amask = spu_nor(amask, amask); + break; + case PIPE_FUNC_LEQUAL: + amask = spu_cmpgt(fragAlpha, ref); + amask = spu_nor(amask, amask); + break; + case PIPE_FUNC_EQUAL: + amask = spu_cmpeq(ref, fragAlpha); + break; + case PIPE_FUNC_NOTEQUAL: + amask = spu_cmpeq(ref, fragAlpha); + amask = spu_nor(amask, amask); + break; + case PIPE_FUNC_ALWAYS: + amask = spu_splats(0xffffffffU); + break; + case PIPE_FUNC_NEVER: + amask 
= spu_splats( 0x0U); + break; + default: + ; + } + + mask = spu_and(mask, amask); + } + + /* Z and/or stencil testing... */ + if (spu.depth_stencil_alpha.depth.enabled || + spu.depth_stencil_alpha.stencil[0].enabled) { + + /* get four Z/Stencil values from tile */ + vector unsigned int mask24 = spu_splats((unsigned int)0x00ffffffU); + vector unsigned int ifbZS = depthStencilTile->ui4[y/2][x/2]; + vector unsigned int ifbZ = spu_and(ifbZS, mask24); + vector unsigned int ifbS = spu_andc(ifbZS, mask24); + + if (spu.depth_stencil_alpha.stencil[0].enabled) { + /* do stencil test */ + ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM); + + } + else if (spu.depth_stencil_alpha.depth.enabled) { + /* do depth test */ + + ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM || + spu.fb.depth_format == PIPE_FORMAT_X8Z24_UNORM); + + vector unsigned int ifragZ; + vector unsigned int zmask; + + /* convert four fragZ from float to uint */ + fragZ = spu_mul(fragZ, spu_splats((float) 0xffffff)); + ifragZ = spu_convtu(fragZ, 0); + + /* do depth comparison, setting zmask with results */ + switch (spu.depth_stencil_alpha.depth.func) { + case PIPE_FUNC_LESS: + zmask = spu_cmpgt(ifbZ, ifragZ); /* mask = (ifragZ < ifbZ) */ + break; + case PIPE_FUNC_GREATER: + zmask = spu_cmpgt(ifragZ, ifbZ); /* mask = (ifragZ > ifbZ) */ + break; + case PIPE_FUNC_GEQUAL: + zmask = spu_cmpgt(ifbZ, ifragZ); + zmask = spu_nor(zmask, zmask); + break; + case PIPE_FUNC_LEQUAL: + zmask = spu_cmpgt(ifragZ, ifbZ); + zmask = spu_nor(zmask, zmask); + break; + case PIPE_FUNC_EQUAL: + zmask = spu_cmpeq(ifbZ, ifragZ); + break; + case PIPE_FUNC_NOTEQUAL: + zmask = spu_cmpeq(ifbZ, ifragZ); + zmask = spu_nor(zmask, zmask); + break; + case PIPE_FUNC_ALWAYS: + zmask = spu_splats(0xffffffffU); + break; + case PIPE_FUNC_NEVER: + zmask = spu_splats( 0x0U); + break; + default: + ; + } + + mask = spu_and(mask, zmask); + + /* merge framebuffer Z and fragment Z according to the mask */ + ifbZ = spu_or(spu_and(ifragZ, mask), + 
spu_andc(ifbZ, mask)); + } + + if (spu_extract(spu_orx(mask), 0)) { + /* put new fragment Z/Stencil values back into Z/Stencil tile */ + depthStencilTile->ui4[y/2][x/2] = spu_or(ifbZ, ifbS); + + spu.cur_ztile_status = TILE_STATUS_DIRTY; + } + } + + /* XXX do blending here */ + + /* XXX do colormask test here */ + + + if (spu_extract(spu_orx(mask), 0)) { + spu.cur_ctile_status = TILE_STATUS_DIRTY; + } + else { + return; + } + + /* convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA */ +#if 0 + { + vector float frag_soa[4]; + frag_soa[0] = fragRed; + frag_soa[1] = fragGreen; + frag_soa[2] = fragBlue; + frag_soa[3] = fragAlpha; + _transpose_matrix4x4(frag_aos, frag_soa); + } +#else + /* short-cut relying on function parameter layout: */ + _transpose_matrix4x4(frag_aos, &fragRed); + (void) fragGreen; + (void) fragBlue; +#endif + + switch (spu.fb.color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + c0 = spu_pack_A8R8G8B8(frag_aos[0]); + c1 = spu_pack_A8R8G8B8(frag_aos[1]); + c2 = spu_pack_A8R8G8B8(frag_aos[2]); + c3 = spu_pack_A8R8G8B8(frag_aos[3]); + break; + + case PIPE_FORMAT_B8G8R8A8_UNORM: + c0 = spu_pack_B8G8R8A8(frag_aos[0]); + c1 = spu_pack_B8G8R8A8(frag_aos[1]); + c2 = spu_pack_B8G8R8A8(frag_aos[2]); + c3 = spu_pack_B8G8R8A8(frag_aos[3]); + break; + default: + fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n"); + ASSERT(0); + } + +#if 0 + /* + * Quad layout: + * +--+--+ + * |p0|p1| + * +--+--+ + * |p2|p3| + * +--+--+ + */ + if (spu_extract(mask, 0)) + colorTile->ui[y+0][x+0] = c0; + if (spu_extract(mask, 1)) + colorTile->ui[y+0][x+1] = c1; + if (spu_extract(mask, 2)) + colorTile->ui[y+1][x+0] = c2; + if (spu_extract(mask, 3)) + colorTile->ui[y+1][x+1] = c3; +#else + /* + * Quad layout: + * +--+--+--+--+ + * |p0|p1|p2|p3| + * +--+--+--+--+ + */ + if (spu_extract(mask, 0)) + colorTile->ui[y][x*2] = c0; + if (spu_extract(mask, 1)) + colorTile->ui[y][x*2+1] = c1; + if (spu_extract(mask, 2)) + colorTile->ui[y][x*2+2] = c2; + if (spu_extract(mask, 
3)) + colorTile->ui[y][x*2+3] = c3; +#endif +} diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h index 6571258699..ffadf0661c 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h @@ -29,4 +29,15 @@ extern qword spu_do_depth_stencil(int x, int y, qword frag_mask, qword frag_depth, qword frag_alpha, qword facing); +extern void +spu_fallback_fragment_ops(uint x, uint y, + tile_t *colorTile, + tile_t *depthStencilTile, + vector float fragZ, + vector float fragRed, + vector float fragGreen, + vector float fragBlue, + vector float fragAlpha, + vector unsigned int mask); + #endif /* SPU_PER_FRAGMENT_OP */ diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index a3ea0a3e69..71ef6ca24f 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -297,9 +297,12 @@ emit_quad( int x, int y, mask_t mask ) sp->quad.first->run(sp->quad.first, &setup.quad); #else +#define NEW_FRAGMENT_FUNCTION 01 +#if !NEW_FRAGMENT_FUNCTION if (spu.read_depth) { mask = do_depth_test(x, y, mask); } +#endif /* If any bits in mask are set... */ if (spu_extract(spu_orx(mask), 0)) { @@ -308,6 +311,7 @@ emit_quad( int x, int y, mask_t mask ) vector float colors[4]; spu.cur_ctile_status = TILE_STATUS_DIRTY; + spu.cur_ztile_status = TILE_STATUS_DIRTY; if (spu.texture[0].start) { /* texture mapping */ @@ -355,6 +359,29 @@ emit_quad( int x, int y, mask_t mask ) } +#if NEW_FRAGMENT_FUNCTION + { + /* Convert fragment data from AoS to SoA format. + * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA) + * This is temporary! + */ + vector float soa_frag[4]; + _transpose_matrix4x4(soa_frag, colors); + + float4 fragZ; + + fragZ.v = eval_z((float) x, (float) y); + + /* Do all per-fragment/quad operations here, including: + * alpha test, z test, stencil test, blend and framebuffer writing. 
+ */ + spu.fragment_ops.func(ix, iy, &spu.ctile, &spu.ztile, + fragZ.v, + soa_frag[0], soa_frag[1], + soa_frag[2], soa_frag[3], + mask); + } +#else /* Convert fragment data from AoS to SoA format. * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA) */ @@ -405,6 +432,9 @@ emit_quad( int x, int y, mask_t mask ) spu.ctile.ui[iy+0][ix+1] = spu_extract((vec_uint4) result.g, 0); spu.ctile.ui[iy+1][ix+0] = spu_extract((vec_uint4) result.b, 0); spu.ctile.ui[iy+1][ix+1] = spu_extract((vec_uint4) result.a, 0); + +#endif /* NEW_FRAGMENT_FUNCTION */ + } #endif } diff --git a/src/gallium/winsys/xlib/xm_api.c b/src/gallium/winsys/xlib/xm_api.c index b010513107..28bd6ceab4 100644 --- a/src/gallium/winsys/xlib/xm_api.c +++ b/src/gallium/winsys/xlib/xm_api.c @@ -349,12 +349,17 @@ create_xmesa_buffer(XMesaDrawable d, BufferType type, if (vis->mesa_visual.depthBits == 0) depthFormat = PIPE_FORMAT_NONE; +#ifdef GALLIUM_CELL /* XXX temporary for Cell! */ + else + depthFormat = PIPE_FORMAT_S8Z24_UNORM; +#else else if (vis->mesa_visual.depthBits <= 16) - depthFormat = PIPE_FORMAT_Z16_UNORM; + depthFormat = PIPE_FORMAT_Z16UNORM; else if (vis->mesa_visual.depthBits <= 24) depthFormat = PIPE_FORMAT_S8Z24_UNORM; else depthFormat = PIPE_FORMAT_Z32_UNORM; +#endif if (vis->mesa_visual.stencilBits == 8) { if (depthFormat == PIPE_FORMAT_S8Z24_UNORM) diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c index 5e9a1f92f1..c4a30d3702 100644 --- a/src/gallium/winsys/xlib/xm_winsys.c +++ b/src/gallium/winsys/xlib/xm_winsys.c @@ -275,6 +275,39 @@ xm_buffer_destroy(struct pipe_winsys *pws, } +/** + * For Cell. Basically, rearrange the pixels/quads from this layout: + * +--+--+--+--+ + * |p0|p1|p2|p3|.... + * +--+--+--+--+ + * + * to this layout: + * +--+--+ + * |p0|p1|.... 
+ * +--+--+ + * |p2|p3| + * +--+--+ + */ +static void +twiddle_tile(uint *tile) +{ + uint tile2[TILE_SIZE * TILE_SIZE]; + int y, x; + + for (y = 0; y < TILE_SIZE; y+=2) { + for (x = 0; x < TILE_SIZE; x+=2) { + int k = 4 * (y/2 * TILE_SIZE/2 + x/2); + tile2[y * TILE_SIZE + (x + 0)] = tile[k]; + tile2[y * TILE_SIZE + (x + 1)] = tile[k+1]; + tile2[(y + 1) * TILE_SIZE + (x + 0)] = tile[k+2]; + tile2[(y + 1) * TILE_SIZE + (x + 1)] = tile[k+3]; + } + } + memcpy(tile, tile2, sizeof(tile2)); +} + + + /** * Display a surface that's in a tiled configuration. That is, all the * pixels for a TILE_SIZExTILE_SIZE block are contiguous in memory. @@ -321,6 +354,8 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf) ximage->data = (char *) xm_buf->data + offset; + twiddle_tile((uint *) ximage->data); + if (XSHM_ENABLED(xm_buf)) { #if defined(USE_XSHM) && !defined(XFree86Server) XShmPutImage(b->xm_visual->display, b->drawable, b->gc, -- cgit v1.2.3 From 701fcee65db6b72f98e926d838956bbcc54f1cc6 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 10 Sep 2008 18:51:43 -0600 Subject: cell: remove old per-fragment code, replace with all new code --- src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 236 +++------------------ src/gallium/drivers/cell/spu/spu_per_fragment_op.h | 47 ++-- src/gallium/drivers/cell/spu/spu_tri.c | 96 --------- 3 files changed, 48 insertions(+), 331 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c index ffc596aa62..9ed5fc50cd 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -1,32 +1,32 @@ -/* - * (C) Copyright IBM Corporation 2008 +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. * All Rights Reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - */ + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ /** - * \file spu_per_fragment_op.c - * SPU implementation various per-fragment operations. - * - * \author Ian Romanick + * \author Brian Paul */ @@ -36,194 +36,6 @@ #include "spu_colorpack.h" #include "spu_per_fragment_op.h" -#define ZERO 0x80 - - -/** - * Get a "quad" of four fragment Z/stencil values from the given tile. - * \param tile the tile of Z/stencil values - * \param x, y location of the quad in the tile, in pixels - * \param depth_format format of the tile's data - * \param detph returns four depth values - * \param stencil returns four stencil values - */ -static void -read_ds_quad(tile_t *tile, unsigned x, unsigned y, - enum pipe_format depth_format, qword *depth, - qword *stencil) -{ - const int ix = x / 2; - const int iy = y / 2; - - switch (depth_format) { - case PIPE_FORMAT_Z16_UNORM: { - qword *ptr = (qword *) &tile->us8[iy][ix / 2]; - - const qword shuf_vec = (qword) { - ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3, - ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7 - }; - - /* At even X values we want the first 4 shorts, and at odd X values we - * want the second 4 shorts. 
- */ - qword bias = (qword) spu_splats((unsigned char) ((ix & 0x01) << 3)); - qword bias_mask = si_fsmbi(0x3333); - qword sv = si_a(shuf_vec, si_and(bias_mask, bias)); - - *depth = si_shufb(*ptr, *ptr, sv); - *stencil = si_il(0); - break; - } - - case PIPE_FORMAT_Z32_UNORM: { - qword *ptr = (qword *) &tile->ui4[iy][ix]; - - *depth = *ptr; - *stencil = si_il(0); - break; - } - - case PIPE_FORMAT_Z24S8_UNORM: { - qword *ptr = (qword *) &tile->ui4[iy][ix]; - qword mask = si_fsmbi(0xEEEE); - - *depth = si_rotmai(si_and(*ptr, mask), -8); - *stencil = si_andc(*ptr, mask); - break; - } - - case PIPE_FORMAT_S8Z24_UNORM: - case PIPE_FORMAT_X8Z24_UNORM: { - qword *ptr = (qword *) &tile->ui4[iy][ix]; - - *depth = si_and(*ptr, si_fsmbi(0x7777)); - *stencil = si_andi(si_roti(*ptr, 8), 0x0ff); - break; - } - - default: - ASSERT(0); - break; - } -} - - -/** - * Put a quad of Z/stencil values into a tile. - * \param tile the tile of Z/stencil values to write into - * \param x, y location of the quad in the tile, in pixels - * \param depth_format format of the tile's data - * \param detph depth values to store - * \param stencil stencil values to store - */ -static void -write_ds_quad(tile_t *buffer, unsigned x, unsigned y, - enum pipe_format depth_format, - qword depth, qword stencil) -{ - const int ix = x / 2; - const int iy = y / 2; - - (void) stencil; - - switch (depth_format) { - case PIPE_FORMAT_Z16_UNORM: { - qword *ptr = (qword *) &buffer->us8[iy][ix / 2]; - - qword sv = ((ix & 0x01) == 0) - ? 
(qword) { 2, 3, 6, 7, 10, 11, 14, 15, - 24, 25, 26, 27, 28, 29, 30, 31 } - : (qword) { 16, 17, 18, 19, 20 , 21, 22, 23, - 2, 3, 6, 7, 10, 11, 14, 15 }; - *ptr = si_shufb(depth, *ptr, sv); - break; - } - - case PIPE_FORMAT_Z32_UNORM: { - qword *ptr = (qword *) &buffer->ui4[iy][ix]; - *ptr = depth; - break; - } - - case PIPE_FORMAT_Z24S8_UNORM: { - qword *ptr = (qword *) &buffer->ui4[iy][ix]; - /* form select mask = 1110,1110,1110,1110 */ - qword mask = si_fsmbi(0xEEEE); - /* depth[i] = depth[i] << 8 */ - depth = si_shli(depth, 8); - /* *ptr[i] = depth[i][31:8] | stencil[i][7:0] */ - *ptr = si_selb(stencil, depth, mask); - break; - } - - case PIPE_FORMAT_S8Z24_UNORM: - case PIPE_FORMAT_X8Z24_UNORM: { - qword *ptr = (qword *) &buffer->ui4[iy][ix]; - /* form select mask = 0111,0111,0111,0111 */ - qword mask = si_fsmbi(0x7777); - /* stencil[i] = stencil[i] << 24 */ - stencil = si_shli(stencil, 24); - /* *ptr[i] = stencil[i][31:24] | depth[i][23:0] */ - *ptr = si_selb(stencil, depth, mask); - break; - } - - default: - ASSERT(0); - break; - } -} - - -/** - * Do depth/stencil/alpha test for a "quad" of 4 fragments. - * \param x,y location of quad within tile - * \param frag_mask indicates which fragments are "alive" - * \param frag_depth four fragment depth values - * \param frag_alpha four fragment alpha values - * \param facing front/back facing for four fragments (1=front, 0=back) - */ -qword -spu_do_depth_stencil(int x, int y, - qword frag_mask, qword frag_depth, qword frag_alpha, - qword facing) -{ - struct spu_frag_test_results result; - qword pixel_depth; - qword pixel_stencil; - - /* All of this preable code (everthing before the call to frag_test) should - * be generated on the PPU and upload to the SPU. 
- */ - if (spu.read_depth || spu.read_stencil) { - read_ds_quad(&spu.ztile, x, y, spu.fb.depth_format, - &pixel_depth, &pixel_stencil); - } - - /* convert floating point Z values to 32-bit uint */ - - /* frag_depth *= spu.fb.zscale */ - frag_depth = si_fm(frag_depth, (qword)spu_splats(spu.fb.zscale)); - /* frag_depth = uint(frag_depth) */ - frag_depth = si_cfltu(frag_depth, 0); - - result = (*spu.frag_test)(frag_mask, pixel_depth, pixel_stencil, - frag_depth, frag_alpha, facing); - - - /* This code (everthing after the call to frag_test) should - * be generated on the PPU and upload to the SPU. - */ - if (spu.read_depth || spu.read_stencil) { - write_ds_quad(&spu.ztile, x, y, spu.fb.depth_format, - result.depth, result.stencil); - } - - return result.mask; -} - - /** diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h index ffadf0661c..f817abf046 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h @@ -1,33 +1,33 @@ -/* - * (C) Copyright IBM Corporation 2008 +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - */ + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ #ifndef SPU_PER_FRAGMENT_OP #define SPU_PER_FRAGMENT_OP -extern qword -spu_do_depth_stencil(int x, int y, qword frag_mask, qword frag_depth, - qword frag_alpha, qword facing); extern void spu_fallback_fragment_ops(uint x, uint y, @@ -40,4 +40,5 @@ spu_fallback_fragment_ops(uint x, uint y, vector float fragAlpha, vector unsigned int mask); + #endif /* SPU_PER_FRAGMENT_OP */ diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index 71ef6ca24f..a5bf3270c7 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -38,7 +38,6 @@ #include "spu_texture.h" #include "spu_tile.h" #include "spu_tri.h" -#include "spu_per_fragment_op.h" /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */ @@ -255,31 +254,6 @@ eval_z(float x, float y) } -static INLINE mask_t -do_depth_test(int x, int y, mask_t quadmask) -{ - float4 zvals; - mask_t mask; - - if (spu.fb.depth_format == PIPE_FORMAT_NONE) - return quadmask; - - zvals.v = eval_z((float) x, (float) y); - - mask = (mask_t) spu_do_depth_stencil(x - setup.cliprect_minx, - y - setup.cliprect_miny, - (qword) quadmask, - (qword) zvals.v, - (qword) spu_splats((unsigned char) 0x0ffu), - (qword) spu_splats((unsigned int) 0x01u)); - - if (spu_extract(spu_orx(mask), 0)) - spu.cur_ztile_status = TILE_STATUS_DIRTY; - - return mask; -} - - /** * Emit a quad (pass to next stage). No clipping is done. 
* Note: about 1/5 to 1/7 of the time, mask is zero and this function @@ -289,21 +263,6 @@ do_depth_test(int x, int y, mask_t quadmask) static INLINE void emit_quad( int x, int y, mask_t mask ) { -#if 0 - struct softpipe_context *sp = setup.softpipe; - setup.quad.x0 = x; - setup.quad.y0 = y; - setup.quad.mask = mask; - sp->quad.first->run(sp->quad.first, &setup.quad); -#else - -#define NEW_FRAGMENT_FUNCTION 01 -#if !NEW_FRAGMENT_FUNCTION - if (spu.read_depth) { - mask = do_depth_test(x, y, mask); - } -#endif - /* If any bits in mask are set... */ if (spu_extract(spu_orx(mask), 0)) { const int ix = x - setup.cliprect_minx; @@ -359,7 +318,6 @@ emit_quad( int x, int y, mask_t mask ) } -#if NEW_FRAGMENT_FUNCTION { /* Convert fragment data from AoS to SoA format. * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA) @@ -381,62 +339,8 @@ emit_quad( int x, int y, mask_t mask ) soa_frag[2], soa_frag[3], mask); } -#else - /* Convert fragment data from AoS to SoA format. - * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA) - */ - qword soa_frag[4]; - _transpose_matrix4x4((vec_float4 *) soa_frag, colors); - - /* Read the current framebuffer values. - */ - const qword pix[4] = { - (qword) spu_splats(spu.ctile.ui[iy+0][ix+0]), - (qword) spu_splats(spu.ctile.ui[iy+0][ix+1]), - (qword) spu_splats(spu.ctile.ui[iy+1][ix+0]), - (qword) spu_splats(spu.ctile.ui[iy+1][ix+1]), - }; - - qword soa_pix[4]; - - if (spu.read_fb) { - /* Convert pixel data from AoS to SoA format. - * I.e. 
(RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA) - */ - vec_float4 aos_pix[4] = { - spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]), - spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]), - spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]), - spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]), - }; - - _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix); - } - - - struct spu_blend_results result = - (*spu.blend)(soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3], - soa_pix[0], soa_pix[1], soa_pix[2], soa_pix[3], - spu.const_blend_color[0], spu.const_blend_color[1], - spu.const_blend_color[2], spu.const_blend_color[3]); - - - /* Convert final pixel data from SoA to AoS format. - * I.e. (RRRR,GGGG,BBBB,AAAA) -> (RGBA,RGBA,RGBA,RGBA) - */ - result = (*spu.logicop)(pix[0], pix[1], pix[2], pix[3], - result.r, result.g, result.b, result.a, - (qword) mask); - - spu.ctile.ui[iy+0][ix+0] = spu_extract((vec_uint4) result.r, 0); - spu.ctile.ui[iy+0][ix+1] = spu_extract((vec_uint4) result.g, 0); - spu.ctile.ui[iy+1][ix+0] = spu_extract((vec_uint4) result.b, 0); - spu.ctile.ui[iy+1][ix+1] = spu_extract((vec_uint4) result.a, 0); - -#endif /* NEW_FRAGMENT_FUNCTION */ } -#endif } -- cgit v1.2.3 From 5336e758a483d15d579ffe7cad536be95637d904 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 08:44:54 -0600 Subject: cell: added cast in spu_splats() call --- src/gallium/drivers/cell/spu/spu_texture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c index 5051774f00..117b8a36f8 100644 --- a/src/gallium/drivers/cell/spu/spu_texture.c +++ b/src/gallium/drivers/cell/spu/spu_texture.c @@ -97,7 +97,7 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels) const qword offset_y = si_andi((qword) y, 0x1f); const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row); - const qword tile_size = (qword) spu_splats(sizeof(tile_t)); + 
const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t)); qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x); tile_offset = si_mpy((qword) tile_offset, tile_size); -- cgit v1.2.3 From 6092a057042c9f7a4cae0f0eb9e95307f5f850a1 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 09:55:39 -0600 Subject: cell: fix shuffle in spu_unpack_B8G8R8A8() --- src/gallium/drivers/cell/spu/spu_colorpack.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h index e9fee8a3a6..fd8dc6ded3 100644 --- a/src/gallium/drivers/cell/spu/spu_colorpack.h +++ b/src/gallium/drivers/cell/spu/spu_colorpack.h @@ -79,14 +79,14 @@ spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle) static INLINE vector float -spu_unpack_color(uint color) +spu_unpack_B8G8R8A8(uint color) { vector unsigned int color_u4 = spu_splats(color); color_u4 = spu_shuffle(color_u4, color_u4, ((vector unsigned char) { - 0, 0, 0, 0, - 5, 5, 5, 5, 10, 10, 10, 10, + 5, 5, 5, 5, + 0, 0, 0, 0, 15, 15, 15, 15}) ); return spu_convtf(color_u4, 32); } -- cgit v1.2.3 From add86031db757b0e3abe48bd8fdea40d4e380e05 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 10:08:06 -0600 Subject: cell: begin new blending code (both codegen and fallback paths) --- src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 420 ++++++++++++++++++--- src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 232 ++++++++++-- 2 files changed, 584 insertions(+), 68 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index df29476be6..7966c0916c 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -231,6 +231,370 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, +/** + 
* Generate SPE code to implement the given blend mode for a quad of pixels. + * \param f SPE function to append instruction onto. + * \param fragR_reg register with fragment red values (float) (in/out) + * \param fragG_reg register with fragment green values (float) (in/out) + * \param fragB_reg register with fragment blue values (float) (in/out) + * \param fragA_reg register with fragment alpha values (float) (in/out) + * \param fbRGBA_reg register with packed framebuffer colors (integer) (in) + */ +static void +gen_blend(const struct pipe_blend_state *blend, + struct spe_function *f, + enum pipe_format color_format, + int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg, + int fbRGBA_reg) +{ + int term1R_reg = spe_allocate_available_register(f); + int term1G_reg = spe_allocate_available_register(f); + int term1B_reg = spe_allocate_available_register(f); + int term1A_reg = spe_allocate_available_register(f); + + int term2R_reg = spe_allocate_available_register(f); + int term2G_reg = spe_allocate_available_register(f); + int term2B_reg = spe_allocate_available_register(f); + int term2A_reg = spe_allocate_available_register(f); + + int fbR_reg = spe_allocate_available_register(f); + int fbG_reg = spe_allocate_available_register(f); + int fbB_reg = spe_allocate_available_register(f); + int fbA_reg = spe_allocate_available_register(f); + + int one_reg = spe_allocate_available_register(f); + int tmp_reg = spe_allocate_available_register(f); + + ASSERT(blend->blend_enable); + + /* Unpack/convert framebuffer colors from four 32-bit packed colors + * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA). + * Each 8-bit color component is expanded into a float in [0.0, 1.0]. 
+ */ + { + int mask_reg = spe_allocate_available_register(f); + + /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */ + spe_fsmbi(f, mask_reg, 0x1111); + + /* XXX there may be more clever ways to implement the following code */ + switch (color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + /* fbB = fbRGBA & mask */ + spe_and(f, fbB_reg, fbRGBA_reg, mask_reg); + /* mask = mask << 8 */ + spe_roti(f, mask_reg, mask_reg, 8); + + /* fbG = fbRGBA & mask */ + spe_and(f, fbG_reg, fbRGBA_reg, mask_reg); + /* fbG = fbG >> 8 */ + spe_roti(f, fbG_reg, fbG_reg, -8); + /* mask = mask << 8 */ + spe_roti(f, mask_reg, mask_reg, 8); + + /* fbR = fbRGBA & mask */ + spe_and(f, fbR_reg, fbRGBA_reg, mask_reg); + /* fbR = fbR >> 16 */ + spe_roti(f, fbR_reg, fbR_reg, -16); + /* mask = mask << 8 */ + spe_roti(f, mask_reg, mask_reg, 8); + + /* fbA = fbRGBA & mask */ + spe_and(f, fbA_reg, fbRGBA_reg, mask_reg); + /* fbA = fbA >> 24 */ + spe_roti(f, fbA_reg, fbA_reg, -24); + break; + + case PIPE_FORMAT_B8G8R8A8_UNORM: + /* fbA = fbRGBA & mask */ + spe_and(f, fbA_reg, fbRGBA_reg, mask_reg); + /* mask = mask << 8 */ + spe_roti(f, mask_reg, mask_reg, 8); + + /* fbR = fbRGBA & mask */ + spe_and(f, fbR_reg, fbRGBA_reg, mask_reg); + /* fbR = fbR >> 8 */ + spe_roti(f, fbR_reg, fbR_reg, -8); + /* mask = mask << 8 */ + spe_roti(f, mask_reg, mask_reg, 8); + + /* fbG = fbRGBA & mask */ + spe_and(f, fbG_reg, fbRGBA_reg, mask_reg); + /* fbG = fbG >> 16 */ + spe_roti(f, fbG_reg, fbG_reg, -16); + /* mask = mask << 8 */ + spe_roti(f, mask_reg, mask_reg, 8); + + /* fbB = fbRGBA & mask */ + spe_and(f, fbB_reg, fbRGBA_reg, mask_reg); + /* fbB = fbB >> 24 */ + spe_roti(f, fbB_reg, fbB_reg, -24); + break; + + default: + ASSERT(0); + } + + /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */ + spe_cuflt(f, fbR_reg, fbR_reg, 8); + spe_cuflt(f, fbG_reg, fbG_reg, 8); + spe_cuflt(f, fbB_reg, fbB_reg, 8); + spe_cuflt(f, fbA_reg, fbA_reg, 8); + + spe_release_register(f, mask_reg); + } + + + /* + * Compute Src 
RGB terms + */ + switch (blend->rgb_src_factor) { + case PIPE_BLENDFACTOR_ONE: + spe_move(f, term1R_reg, fragR_reg); + spe_move(f, term1G_reg, fragG_reg); + spe_move(f, term1B_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_ZERO: + spe_zero(f, term1R_reg); + spe_zero(f, term1G_reg); + spe_zero(f, term1B_reg); + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + spe_fm(f, term1R_reg, fragR_reg, fragR_reg); + spe_fm(f, term1G_reg, fragG_reg, fragG_reg); + spe_fm(f, term1B_reg, fragB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + spe_fm(f, term1R_reg, fragR_reg, fragA_reg); + spe_fm(f, term1G_reg, fragG_reg, fragA_reg); + spe_fm(f, term1B_reg, fragB_reg, fragA_reg); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Compute Src Alpha term + */ + switch (blend->alpha_src_factor) { + case PIPE_BLENDFACTOR_ONE: + spe_move(f, term1A_reg, fragA_reg); + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + spe_fm(f, term1A_reg, fragA_reg, fragA_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + spe_fm(f, term1A_reg, fragA_reg, fragA_reg); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Compute Dest RGB terms + */ + switch (blend->rgb_dst_factor) { + case PIPE_BLENDFACTOR_ONE: + spe_move(f, term2R_reg, fbR_reg); + spe_move(f, term2G_reg, fbG_reg); + spe_move(f, term2B_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_ZERO: + spe_zero(f, term2R_reg); + spe_zero(f, term2G_reg); + spe_zero(f, term2B_reg); + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + spe_fm(f, term2R_reg, fbR_reg, fragR_reg); + spe_fm(f, term2G_reg, fbG_reg, fragG_reg); + spe_fm(f, term2B_reg, fbB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + spe_fm(f, term2R_reg, fbR_reg, fragA_reg); + spe_fm(f, term2G_reg, fbG_reg, fragA_reg); + spe_fm(f, term2B_reg, fbB_reg, fragA_reg); + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + /* one = {1.0, 1.0, 1.0, 1.0} */ + spe_load_float(f, one_reg, 1.0f); + /* tmp = one - fragA */ + spe_fs(f, tmp_reg, one_reg, 
fragA_reg); + /* term = fb * tmp */ + spe_fm(f, term2R_reg, fbR_reg, tmp_reg); + spe_fm(f, term2G_reg, fbG_reg, tmp_reg); + spe_fm(f, term2B_reg, fbB_reg, tmp_reg); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Compute Dest Alpha term + */ + switch (blend->alpha_dst_factor) { + case PIPE_BLENDFACTOR_ONE: + spe_move(f, term2A_reg, fbA_reg); + break; + case PIPE_BLENDFACTOR_ZERO: + spe_zero(f, term2A_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + spe_fm(f, term2A_reg, fbA_reg, fragA_reg); + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + /* one = {1.0, 1.0, 1.0, 1.0} */ + spe_load_float(f, one_reg, 1.0f); + /* tmp = one - fragA */ + spe_fs(f, tmp_reg, one_reg, fragA_reg); + /* termA = fbA * tmp */ + spe_fm(f, term2A_reg, fbA_reg, tmp_reg); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Combine Src/Dest RGB terms + */ + switch (blend->rgb_func) { + case PIPE_BLEND_ADD: + spe_fa(f, fragR_reg, term1R_reg, term2R_reg); + spe_fa(f, fragG_reg, term1G_reg, term2G_reg); + spe_fa(f, fragB_reg, term1B_reg, term2B_reg); + break; + case PIPE_BLEND_SUBTRACT: + spe_fs(f, fragR_reg, term1R_reg, term2R_reg); + spe_fs(f, fragG_reg, term1G_reg, term2G_reg); + spe_fs(f, fragB_reg, term1B_reg, term2B_reg); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Combine Src/Dest A term + */ + switch (blend->alpha_func) { + case PIPE_BLEND_ADD: + spe_fa(f, fragA_reg, term1A_reg, term2A_reg); + break; + case PIPE_BLEND_SUBTRACT: + spe_fs(f, fragA_reg, term1A_reg, term2A_reg); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + spe_release_register(f, term1R_reg); + spe_release_register(f, term1G_reg); + spe_release_register(f, term1B_reg); + spe_release_register(f, term1A_reg); + + spe_release_register(f, term2R_reg); + spe_release_register(f, term2G_reg); + spe_release_register(f, term2B_reg); + spe_release_register(f, term2A_reg); + + spe_release_register(f, fbR_reg); + spe_release_register(f, fbG_reg); + 
+ spe_release_register(f, fbB_reg); + spe_release_register(f, fbA_reg); + + spe_release_register(f, one_reg); + spe_release_register(f, tmp_reg); +} + + +static void +gen_logicop(const struct pipe_blend_state *blend, + struct spe_function *f, + int fragRGBA_reg, int fbRGBA_reg) +{ + /* XXX to-do */ + /* operate on 32-bit packed pixels, not float colors */ +} + + +static void +gen_colormask(uint colormask, + struct spe_function *f, + int fragRGBA_reg, int fbRGBA_reg) +{ + /* XXX to-do */ + /* operate on 32-bit packed pixels, not float colors */ +} + + + +/** + * Generate code to pack a quad of float colors into four 32-bit integers. + * + * \param f SPE function to append instruction onto. + * \param color_format the dest color packing format + * \param r_reg register containing four red values (in/clobbered) + * \param g_reg register containing four green values (in/clobbered) + * \param b_reg register containing four blue values (in/clobbered) + * \param a_reg register containing four alpha values (in/clobbered) + * \param rgba_reg register to store the packed RGBA colors (out) + */ +static void +gen_pack_colors(struct spe_function *f, + enum pipe_format color_format, + int r_reg, int g_reg, int b_reg, int a_reg, + int rgba_reg) +{ + /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */ + spe_cfltu(f, r_reg, r_reg, 32); + spe_cfltu(f, g_reg, g_reg, 32); + spe_cfltu(f, b_reg, b_reg, 32); + spe_cfltu(f, a_reg, a_reg, 32); + + /* Shift the most significant bytes to the least significant positions. 
+ * I.e.: reg = reg >> 24 + */ + spe_rotmi(f, r_reg, r_reg, -24); + spe_rotmi(f, g_reg, g_reg, -24); + spe_rotmi(f, b_reg, b_reg, -24); + spe_rotmi(f, a_reg, a_reg, -24); + + /* Shift the color bytes according to the surface format */ + if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) { + spe_roti(f, g_reg, g_reg, 8); /* green <<= 8 */ + spe_roti(f, r_reg, r_reg, 16); /* red <<= 16 */ + spe_roti(f, a_reg, a_reg, 24); /* alpha <<= 24 */ + } + else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) { + spe_roti(f, r_reg, r_reg, 8); /* red <<= 8 */ + spe_roti(f, g_reg, g_reg, 16); /* green <<= 16 */ + spe_roti(f, b_reg, b_reg, 24); /* blue <<= 24 */ + } + else { + ASSERT(0); + } + + /* Merge red, green, blue, alpha registers to make packed RGBA colors. + * Eg: after shifting according to color_format we might have: + * R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000} + * G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600} + * B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099} + * A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000} + * OR-ing all those together gives us four packed colors: + * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699} + */ + spe_or(f, rgba_reg, r_reg, g_reg); + spe_or(f, rgba_reg, rgba_reg, b_reg); + spe_or(f, rgba_reg, rgba_reg, a_reg); +} + + + + /** * Generate SPE code to implement the fragment operations (alpha test, * depth test, stencil test, blending, colormask, and final @@ -257,6 +621,7 @@ gen_fragment_function(struct cell_context *cell, struct spe_function *f) const struct pipe_depth_stencil_alpha_state *dsa = &cell->depth_stencil->base; const struct pipe_blend_state *blend = &cell->blend->base; + const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format; /* For SPE function calls: reg $3 = first param, $4 = second param, etc. 
*/ const int x_reg = 3; /* uint */ @@ -443,64 +808,31 @@ gen_fragment_function(struct cell_context *cell, struct spe_function *f) if (blend->blend_enable) { - /* convert packed tile colors in fbRGBA_reg to float[4] vectors */ - - // gen_blend_code(blend, f, mask_reg, ... ); - + gen_blend(blend, f, color_format, + fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg); } - - /* * Write fragment colors to framebuffer/tile. * This involves converting the fragment colors from float[4] to the * tile's specific format and obeying the quad/pixel mask. */ { - const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format; int rgba_reg = spe_allocate_available_register(f); - /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */ - spe_cfltu(f, fragR_reg, fragR_reg, 32); - spe_cfltu(f, fragG_reg, fragG_reg, 32); - spe_cfltu(f, fragB_reg, fragB_reg, 32); - spe_cfltu(f, fragA_reg, fragA_reg, 32); + /* Pack four float colors as four 32-bit int colors */ + gen_pack_colors(f, color_format, + fragR_reg, fragG_reg, fragB_reg, fragA_reg, + rgba_reg); - /* Shift most the significant bytes to least the significant positions. 
- * I.e.: reg = reg >> 24 - */ - spe_rotmi(f, fragR_reg, fragR_reg, -24); - spe_rotmi(f, fragG_reg, fragG_reg, -24); - spe_rotmi(f, fragB_reg, fragB_reg, -24); - spe_rotmi(f, fragA_reg, fragA_reg, -24); - - /* Shift the color bytes according to the surface format */ - if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) { - spe_roti(f, fragG_reg, fragG_reg, 8); /* green <<= 8 */ - spe_roti(f, fragR_reg, fragR_reg, 16); /* red <<= 16 */ - spe_roti(f, fragA_reg, fragA_reg, 24); /* alpha <<= 24 */ - } - else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) { - spe_roti(f, fragR_reg, fragR_reg, 8); /* red <<= 8 */ - spe_roti(f, fragG_reg, fragG_reg, 16); /* green <<= 16 */ - spe_roti(f, fragB_reg, fragB_reg, 24); /* blue <<= 24 */ + if (blend->logicop_enable) { + gen_logicop(blend, f, rgba_reg, fbRGBA_reg); } - else { - ASSERT(0); + + if (blend->colormask != 0xf) { + gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg); } - /* Merge red, green, blue, alpha registers to make packed RGBA colors. - * Eg: after shifting according to color_format we might have: - * R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000} - * G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600} - * B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099} - * A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000} - * OR-ing all those together gives us four packed colors: - * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699} - */ - spe_or(f, rgba_reg, fragR_reg, fragG_reg); - spe_or(f, rgba_reg, rgba_reg, fragB_reg); - spe_or(f, rgba_reg, rgba_reg, fragA_reg); /* Mix fragment colors with framebuffer colors using the quad/pixel mask: * if (mask[i]) diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c index 9ed5fc50cd..3f0eabaa05 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -39,9 +39,11 @@ /** - * Called by rasterizer for each quad after the shader has run. 
This - * is a fallback/debug function. In reality we'll use a generated - * function produced by the PPU. But this function is useful for + * Called by rasterizer for each quad after the shader has run. Do + * all the per-fragment operations including alpha test, z test, + * stencil test, blend, colormask and logicops. This is a + * fallback/debug function. In reality we'll use a generated function + * produced by the PPU. But this function is useful for * debug/validation. */ void @@ -49,13 +51,13 @@ spu_fallback_fragment_ops(uint x, uint y, tile_t *colorTile, tile_t *depthStencilTile, vector float fragZ, - vector float fragRed, - vector float fragGreen, - vector float fragBlue, - vector float fragAlpha, + vector float fragR, + vector float fragG, + vector float fragB, + vector float fragA, vector unsigned int mask) { - vector float frag_soa[4], frag_aos[4]; + vector float frag_aos[4]; unsigned int c0, c1, c2, c3; /* do alpha test */ @@ -65,24 +67,24 @@ spu_fallback_fragment_ops(uint x, uint y, switch (spu.depth_stencil_alpha.alpha.func) { case PIPE_FUNC_LESS: - amask = spu_cmpgt(ref, fragAlpha); /* mask = (fragAlpha < ref) */ + amask = spu_cmpgt(ref, fragA); /* mask = (fragA < ref) */ break; case PIPE_FUNC_GREATER: - amask = spu_cmpgt(fragAlpha, ref); /* mask = (fragAlpha > ref) */ + amask = spu_cmpgt(fragA, ref); /* mask = (fragA > ref) */ break; case PIPE_FUNC_GEQUAL: - amask = spu_cmpgt(ref, fragAlpha); + amask = spu_cmpgt(ref, fragA); amask = spu_nor(amask, amask); break; case PIPE_FUNC_LEQUAL: - amask = spu_cmpgt(fragAlpha, ref); + amask = spu_cmpgt(fragA, ref); amask = spu_nor(amask, amask); break; case PIPE_FUNC_EQUAL: - amask = spu_cmpeq(ref, fragAlpha); + amask = spu_cmpeq(ref, fragA); break; case PIPE_FUNC_NOTEQUAL: - amask = spu_cmpeq(ref, fragAlpha); + amask = spu_cmpeq(ref, fragA); amask = spu_nor(amask, amask); break; case PIPE_FUNC_ALWAYS: @@ -174,7 +176,189 @@ spu_fallback_fragment_ops(uint x, uint y, } } - /* XXX do blending here */ + if 
(spu.blend.blend_enable) { + vector float term1r, term1g, term1b, term1a; + vector float term2r, term2g, term2b, term2a; + + vector float fbRGBA[4]; + + vector float one, tmp; + + /* get colors from framebuffer */ + { + vector float fc[4]; + uint c0, c1, c2, c3; +#if 0 + c0 = colorTile->ui[y+0][x+0]; + c1 = colorTile->ui[y+0][x+1]; + c2 = colorTile->ui[y+1][x+0]; + c3 = colorTile->ui[y+1][x+1]; +#else + c0 = colorTile->ui[y][x*2+0]; + c1 = colorTile->ui[y][x*2+1]; + c2 = colorTile->ui[y][x*2+2]; + c3 = colorTile->ui[y][x*2+3]; +#endif + switch (spu.fb.color_format) { + case PIPE_FORMAT_B8G8R8A8_UNORM: + fc[0] = spu_unpack_B8G8R8A8(c0); + fc[1] = spu_unpack_B8G8R8A8(c1); + fc[2] = spu_unpack_B8G8R8A8(c2); + fc[3] = spu_unpack_B8G8R8A8(c3); + break; + case PIPE_FORMAT_A8R8G8B8_UNORM: + fc[0] = spu_unpack_A8R8G8B8(c0); + fc[1] = spu_unpack_A8R8G8B8(c1); + fc[2] = spu_unpack_A8R8G8B8(c2); + fc[3] = spu_unpack_A8R8G8B8(c3); + break; + default: + ASSERT(0); + } + _transpose_matrix4x4(fbRGBA, fc); + } + + /* + * Compute Src RGB terms + */ + switch (spu.blend.rgb_src_factor) { + case PIPE_BLENDFACTOR_ONE: + term1r = fragR; + term1g = fragG; + term1b = fragB; + break; + case PIPE_BLENDFACTOR_ZERO: + term1r = + term1g = + term1b = spu_splats(0.0f); + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + term1r = spu_mul(fragR, fragR); + term1g = spu_mul(fragG, fragG); + term1b = spu_mul(fragB, fragB); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + term1r = spu_mul(fragR, fragA); + term1g = spu_mul(fragG, fragA); + term1b = spu_mul(fragB, fragA); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Compute Src Alpha term + */ + switch (spu.blend.alpha_src_factor) { + case PIPE_BLENDFACTOR_ONE: + term1a = fragA; + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + term1a = spu_splats(0.0f); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + term1a = spu_mul(fragA, fragA); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Compute Dest RGB terms + */ + switch 
(spu.blend.rgb_dst_factor) { + case PIPE_BLENDFACTOR_ONE: + term2r = fragR; + term2g = fragG; + term2b = fragB; + break; + case PIPE_BLENDFACTOR_ZERO: + term2r = + term2g = + term2b = spu_splats(0.0f); + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + term2r = spu_mul(fbRGBA[0], fragR); + term2g = spu_mul(fbRGBA[1], fragG); + term2b = spu_mul(fbRGBA[2], fragB); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + term2r = spu_mul(fbRGBA[0], fragA); + term2g = spu_mul(fbRGBA[1], fragA); + term2b = spu_mul(fbRGBA[2], fragA); + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + one = spu_splats(1.0f); + tmp = spu_sub(one, fragA); + term2r = spu_mul(fbRGBA[0], tmp); + term2g = spu_mul(fbRGBA[1], tmp); + term2b = spu_mul(fbRGBA[2], tmp); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Compute Dest Alpha term + */ + switch (spu.blend.alpha_dst_factor) { + case PIPE_BLENDFACTOR_ONE: + term2a = fragA; + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + term2a = spu_splats(0.0f); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + term2a = spu_mul(fbRGBA[3], fragA); + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + one = spu_splats(1.0f); + tmp = spu_sub(one, fragA); + term2a = spu_mul(fbRGBA[3], tmp); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Combine Src/Dest RGB terms + */ + switch (spu.blend.rgb_func) { + case PIPE_BLEND_ADD: + fragR = spu_add(term1r, term2r); + fragG = spu_add(term1g, term2g); + fragB = spu_add(term1b, term2b); + break; + case PIPE_BLEND_SUBTRACT: + fragR = spu_sub(term1r, term2r); + fragG = spu_sub(term1g, term2g); + fragB = spu_sub(term1b, term2b); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Combine Src/Dest A term + */ + switch (spu.blend.alpha_func) { + case PIPE_BLEND_ADD: + fragA = spu_add(term1a, term2a); + break; + case PIPE_BLEND_SUBTRACT: + fragA = spu_sub(term1a, term2a); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + } + /* XXX do colormask test here */ @@ -190,17 +374,17 @@ 
spu_fallback_fragment_ops(uint x, uint y, #if 0 { vector float frag_soa[4]; - frag_soa[0] = fragRed; - frag_soa[1] = fragGreen; - frag_soa[2] = fragBlue; - frag_soa[3] = fragAlpha; + frag_soa[0] = fragR; + frag_soa[1] = fragG; + frag_soa[2] = fragB; + frag_soa[3] = fragA; _transpose_matrix4x4(frag_aos, frag_soa); } #else /* short-cut relying on function parameter layout: */ - _transpose_matrix4x4(frag_aos, &fragRed); - (void) fragGreen; - (void) fragBlue; + _transpose_matrix4x4(frag_aos, &fragR); + (void) fragG; + (void) fragB; #endif switch (spu.fb.color_format) { @@ -238,7 +422,7 @@ spu_fallback_fragment_ops(uint x, uint y, if (spu_extract(mask, 2)) colorTile->ui[y+1][x+0] = c2; if (spu_extract(mask, 3)) - colorTile->ui[y+1][x+1] = c3; + colorTile->ui[y+1][x+1] = c3; #else /* * Quad layout: @@ -253,6 +437,6 @@ spu_fallback_fragment_ops(uint x, uint y, if (spu_extract(mask, 2)) colorTile->ui[y][x*2+2] = c2; if (spu_extract(mask, 3)) - colorTile->ui[y][x*2+3] = c3; + colorTile->ui[y][x*2+3] = c3; #endif } -- cgit v1.2.3 From 283ffdf99605c536d00e03ad6ec91a6f8e006fc2 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 10:13:20 -0600 Subject: cell: checkpoint: remove more of the old per-fragment code --- src/gallium/drivers/cell/common.h | 2 + src/gallium/drivers/cell/ppu/Makefile | 1 - src/gallium/drivers/cell/ppu/cell_state_emit.c | 60 ++----------- src/gallium/drivers/cell/spu/spu_main.c | 115 +++---------------------- src/gallium/drivers/cell/spu/spu_main.h | 37 +------- 5 files changed, 19 insertions(+), 196 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index a62530c64d..61d2b7d1ae 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -146,6 +146,8 @@ struct cell_command_logicop struct cell_command_fragment_ops { uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_OPS */ + struct pipe_depth_stencil_alpha_state dsa; + struct pipe_blend_state 
blend; unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS]; }; diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile index b5a6fcb8de..8699f3f8ec 100644 --- a/src/gallium/drivers/cell/ppu/Makefile +++ b/src/gallium/drivers/cell/ppu/Makefile @@ -28,7 +28,6 @@ SOURCES = \ cell_gen_fragment.c \ cell_state_derived.c \ cell_state_emit.c \ - cell_state_per_fragment.c \ cell_state_shader.c \ cell_pipe_state.c \ cell_screen.c \ diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 06777aac14..2bfb976c59 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -55,23 +55,6 @@ emit_state_cmd(struct cell_context *cell, uint cmd, void cell_emit_state(struct cell_context *cell) { - if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_BLEND)) { - struct cell_command_logicop logicop; - - if (cell->logic_op.store != NULL) { - spe_release_func(& cell->logic_op); - } - - cell_generate_logic_op(& cell->logic_op, - & cell->blend->base, - cell->framebuffer.cbufs[0]); - - logicop.base = (intptr_t) cell->logic_op.store; - logicop.size = 64 * 4; - emit_state_cmd(cell, CELL_CMD_STATE_LOGICOP, &logicop, - sizeof(logicop)); - } - if (cell->dirty & CELL_NEW_FRAMEBUFFER) { struct pipe_surface *cbuf = cell->framebuffer.cbufs[0]; struct pipe_surface *zbuf = cell->framebuffer.zsbuf; @@ -91,7 +74,9 @@ cell_emit_state(struct cell_context *cell) } - if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_DEPTH_STENCIL)) { + if (cell->dirty & (CELL_NEW_FRAMEBUFFER | + CELL_NEW_DEPTH_STENCIL | + CELL_NEW_BLEND)) { /* XXX we don't want to always do codegen here. We should have * a hash/lookup table to cache previous results... 
*/ @@ -105,47 +90,12 @@ cell_emit_state(struct cell_context *cell) fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS; memcpy(&fops->code, spe_code.store, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); + fops->dsa = cell->depth_stencil->base; + fops->blend = cell->blend->base; /* free codegen buffer */ spe_release_func(&spe_code); } - if (cell->dirty & CELL_NEW_BLEND) { - struct cell_command_blend blend; - - if (cell->blend != NULL) { - blend.base = (intptr_t) cell->blend->code.store; - blend.size = cell->blend->code.num_inst * SPE_INST_SIZE; - blend.read_fb = TRUE; - } - else { - blend.base = 0; - blend.size = 0; - blend.read_fb = FALSE; - } - - emit_state_cmd(cell, CELL_CMD_STATE_BLEND, &blend, sizeof(blend)); - } - - if (cell->dirty & CELL_NEW_DEPTH_STENCIL) { - struct cell_command_depth_stencil_alpha_test dsat; - - if (cell->depth_stencil != NULL) { - dsat.base = (intptr_t) cell->depth_stencil->code.store; - dsat.size = cell->depth_stencil->code.num_inst * SPE_INST_SIZE; - dsat.read_depth = TRUE; - dsat.read_stencil = FALSE; - dsat.state = cell->depth_stencil->base; - } - else { - dsat.base = 0; - dsat.size = 0; - dsat.read_depth = FALSE; - dsat.read_stencil = FALSE; - } - - emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL, &dsat, sizeof(dsat)); - } - if (cell->dirty & CELL_NEW_SAMPLER) { uint i; for (i = 0; i < CELL_MAX_SAMPLERS; i++) { diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 4e0ec15925..6afca19dfd 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -63,14 +63,6 @@ struct spu_vs_context draw; static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS] ALIGN16_ATTRIB; -static unsigned char depth_stencil_code_buffer[4 * 64] - ALIGN16_ATTRIB; - -static unsigned char fb_blend_code_buffer[4 * 64] - ALIGN16_ATTRIB; - -static unsigned char logicop_code_buffer[4 * 64] - ALIGN16_ATTRIB; /** @@ -240,8 +232,15 @@ cmd_state_fragment_ops(const struct 
cell_command_fragment_ops *fops) printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id); /* Copy SPU code from batch buffer to spu buffer */ memcpy(spu.fragment_ops.code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); + /* Copy state info */ + memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); + memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); + /* Point function pointer at new code */ spu.fragment_ops.func = (spu_fragment_ops_func) spu.fragment_ops.code; + + spu.read_depth = spu.depth_stencil_alpha.depth.enabled; + spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled; } @@ -303,89 +302,6 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) } -#define NEW_FRAGMENT_FUNCTION 01 - -static void -cmd_state_blend(const struct cell_command_blend *state) -{ - if (Debug) - printf("SPU %u: BLEND: enabled %d\n", - spu.init.id, - (state->size != 0)); - - ASSERT_ALIGN16(state->base); - - if (state->size != 0) { - mfc_get(fb_blend_code_buffer, - (unsigned int) state->base, /* src */ - ROUNDUP16(state->size), - TAG_BATCH_BUFFER, - 0, /* tid */ - 0 /* rid */); - wait_on_mask(1 << TAG_BATCH_BUFFER); - spu.blend = (blend_func) fb_blend_code_buffer; - spu.read_fb = state->read_fb; - } - else - { - spu.read_fb = FALSE; - } -} - - -static void -cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *state) -{ - if (Debug) - printf("SPU %u: DEPTH_STENCIL: ztest %d\n", - spu.init.id, - state->read_depth); - - ASSERT_ALIGN16(state->base); - - if (state->size != 0) { - mfc_get(depth_stencil_code_buffer, - (unsigned int) state->base, /* src */ - ROUNDUP16(state->size), - TAG_BATCH_BUFFER, - 0, /* tid */ - 0 /* rid */); - wait_on_mask(1 << TAG_BATCH_BUFFER); - } - else - { - /* If there is no code, emit a return instruction. 
- */ - depth_stencil_code_buffer[0] = 0x35; - depth_stencil_code_buffer[1] = 0x00; - depth_stencil_code_buffer[2] = 0x00; - depth_stencil_code_buffer[3] = 0x00; - } - - spu.frag_test = (frag_test_func) depth_stencil_code_buffer; - spu.read_depth = state->read_depth; - spu.read_stencil = state->read_stencil; - spu.depth_stencil_alpha = state->state; -} - - -static void -cmd_state_logicop(const struct cell_command_logicop * code) -{ -#if !NEW_FRAGMENT_FUNCTION - mfc_get(logicop_code_buffer, - (unsigned int) code->base, /* src */ - code->size, - TAG_BATCH_BUFFER, - 0, /* tid */ - 0 /* rid */); - wait_on_mask(1 << TAG_BATCH_BUFFER); - - spu.logicop = (logicop_func) logicop_code_buffer; -#endif -} - - static void cmd_state_sampler(const struct cell_command_sampler *sampler) { @@ -571,15 +487,6 @@ cmd_batch(uint opcode) cmd_finish(); pos += 1; break; - case CELL_CMD_STATE_BLEND: - cmd_state_blend((struct cell_command_blend *) &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct cell_command_blend)) / 8); - break; - case CELL_CMD_STATE_DEPTH_STENCIL: - cmd_state_depth_stencil((struct cell_command_depth_stencil_alpha_test *) - &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct cell_command_depth_stencil_alpha_test)) / 8); - break; case CELL_CMD_STATE_SAMPLER: { struct cell_command_sampler *sampler @@ -614,19 +521,17 @@ cmd_batch(uint opcode) pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8); break; case CELL_CMD_STATE_BIND_VS: +#if 01 spu_bind_vertex_shader(&draw, (struct cell_shader_info *) &buffer[pos+1]); pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8); +#endif break; case CELL_CMD_STATE_ATTRIB_FETCH: cmd_state_attrib_fetch((struct cell_attribute_fetch_code *) &buffer[pos+1]); pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8); break; - case CELL_CMD_STATE_LOGICOP: - cmd_state_logicop((struct cell_command_logicop *) &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct cell_command_logicop)) / 8); - break; case 
CELL_CMD_FLUSH_BUFFER_RANGE: { struct cell_buffer_range *br = (struct cell_buffer_range *) &buffer[pos+1]; @@ -695,7 +600,9 @@ main_loop(void) exitFlag = 1; break; case CELL_CMD_VS_EXECUTE: +#if 01 spu_execute_vertex_shader(&draw, &cmd.vs); +#endif break; case CELL_CMD_BATCH: cmd_batch(opcode); diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 7ab34f5222..f0f8be47db 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -60,35 +60,6 @@ typedef union { #define TILE_STATUS_GETTING 5 /**< mfc_get() called but not yet arrived */ -struct spu_frag_test_results { - qword mask; - qword depth; - qword stencil; -}; - -typedef struct spu_frag_test_results (*frag_test_func)(qword frag_mask, - qword pixel_depth, qword pixel_stencil, qword frag_depth, - qword frag_alpha, qword facing); - - -struct spu_blend_results { - qword r; - qword g; - qword b; - qword a; -}; - -typedef struct spu_blend_results (*blend_func)( - qword frag_r, qword frag_g, qword frag_b, qword frag_a, - qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a, - qword const_r, qword const_g, qword const_b, qword const_a); - -typedef struct spu_blend_results (*logicop_func)( - qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a, - qword frag_r, qword frag_g, qword frag_b, qword frag_a, - qword frag_mask); - - typedef vector float (*sample_texture_func)(uint unit, vector float texcoord); @@ -147,16 +118,10 @@ struct spu_global struct spu_framebuffer fb; struct pipe_depth_stencil_alpha_state depth_stencil_alpha; + struct pipe_blend_state blend; boolean read_depth; boolean read_stencil; - frag_test_func frag_test; /**< Current depth/stencil test code */ - - boolean read_fb; /**< Does current blend mode require framebuffer read? 
*/ - blend_func blend; /**< Current blend code */ - qword const_blend_color[4] ALIGN16_ATTRIB; - - logicop_func logicop; /**< Current logicop code **/ struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS]; struct spu_texture texture[PIPE_MAX_SAMPLERS]; -- cgit v1.2.3 From aa4a08d429712fa516342ec02253c2591794ea5f Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 10:25:38 -0600 Subject: cell: asst. clean-up --- src/gallium/drivers/cell/spu/spu_main.c | 23 +++++----------- src/gallium/drivers/cell/spu/spu_main.h | 47 +++++++++++++++------------------ src/gallium/drivers/cell/spu/spu_tri.c | 10 +++---- 3 files changed, 32 insertions(+), 48 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 6afca19dfd..29686964d2 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -231,13 +231,13 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) if (Debug) printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id); /* Copy SPU code from batch buffer to spu buffer */ - memcpy(spu.fragment_ops.code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); + memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); /* Copy state info */ memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); /* Point function pointer at new code */ - spu.fragment_ops.func = (spu_fragment_ops_func) spu.fragment_ops.code; + spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code; spu.read_depth = spu.depth_stencil_alpha.depth.enabled; spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled; @@ -288,17 +288,6 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) spu.fb.zsize = 0; break; } - - if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM) - spu.color_shuffle = ((vector unsigned char) { - 12, 0, 4, 8, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 
0}); - else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM) - spu.color_shuffle = ((vector unsigned char) { - 8, 4, 0, 12, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}); - else - ASSERT(0); } @@ -521,11 +510,11 @@ cmd_batch(uint opcode) pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8); break; case CELL_CMD_STATE_BIND_VS: -#if 01 +#if 0 spu_bind_vertex_shader(&draw, (struct cell_shader_info *) &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8); #endif + pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8); break; case CELL_CMD_STATE_ATTRIB_FETCH: cmd_state_attrib_fetch((struct cell_attribute_fetch_code *) @@ -600,7 +589,7 @@ main_loop(void) exitFlag = 1; break; case CELL_CMD_VS_EXECUTE: -#if 01 +#if 0 spu_execute_vertex_shader(&draw, &cmd.vs); #endif break; @@ -631,7 +620,7 @@ one_time_init(void) /* Install default/fallback fragment processing function. * This will normally be overriden by a code-gen'd function. */ - spu.fragment_ops.func = spu_fallback_fragment_ops; + spu.fragment_ops = spu_fallback_fragment_ops; } diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index f0f8be47db..d40539da83 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -60,9 +60,11 @@ typedef union { #define TILE_STATUS_GETTING 5 /**< mfc_get() called but not yet arrived */ -typedef vector float (*sample_texture_func)(uint unit, vector float texcoord); - +/** Function for sampling textures */ +typedef vector float (*spu_sample_texture_func)(uint unit, + vector float texcoord); +/** Function for performing per-fragment ops */ typedef void (*spu_fragment_ops_func)(uint x, uint y, tile_t *colorTile, tile_t *depthStencilTile, @@ -73,14 +75,8 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y, vector float fragAlpha, vector unsigned int mask); -struct spu_fragment_ops +struct spu_framebuffer { - uint code[SPU_MAX_FRAGMENT_OPS_INSTS]; - 
spu_fragment_ops_func func; /**< Current fragment ops function */ -} ALIGN16_ATTRIB; - - -struct spu_framebuffer { void *color_start; /**< addr of color surface in main memory */ void *depth_start; /**< addr of depth surface in main memory */ enum pipe_format color_format; @@ -109,34 +105,31 @@ struct spu_texture /** - * All SPU global/context state will be in singleton object of this type: + * All SPU global/context state will be in a singleton object of this type: */ struct spu_global { + /** One-time init/constant info */ struct cell_init_info init; + /* + * Current state + */ struct spu_framebuffer fb; - struct pipe_depth_stencil_alpha_state depth_stencil_alpha; struct pipe_blend_state blend; - - boolean read_depth; - boolean read_stencil; - struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS]; struct spu_texture texture[PIPE_MAX_SAMPLERS]; - struct vertex_info vertex_info; - struct spu_fragment_ops fragment_ops; - - /* XXX more state to come */ - - - /** current color and Z tiles */ + /** Current color and Z tiles */ tile_t ctile ALIGN16_ATTRIB; tile_t ztile ALIGN16_ATTRIB; + /** Read depth/stencil tiles? 
*/ + boolean read_depth; + boolean read_stencil; + /** Current tiles' status */ ubyte cur_ctile_status, cur_ztile_status; @@ -144,11 +137,13 @@ struct spu_global ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; + /** Current fragment ops machine code */ + uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS]; + /** Current fragment ops function */ + spu_fragment_ops_func fragment_ops; - /** for converting RGBA to PIPE_FORMAT_x colors */ - vector unsigned char color_shuffle; - - sample_texture_func sample_texture[CELL_MAX_SAMPLERS]; + /** Current texture sampler function */ + spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS]; } ALIGN16_ATTRIB; diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index a5bf3270c7..f02cdd1f76 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -333,11 +333,11 @@ emit_quad( int x, int y, mask_t mask ) /* Do all per-fragment/quad operations here, including: * alpha test, z test, stencil test, blend and framebuffer writing. 
*/ - spu.fragment_ops.func(ix, iy, &spu.ctile, &spu.ztile, - fragZ.v, - soa_frag[0], soa_frag[1], - soa_frag[2], soa_frag[3], - mask); + spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile, + fragZ.v, + soa_frag[0], soa_frag[1], + soa_frag[2], soa_frag[3], + mask); } } -- cgit v1.2.3 From f19903aa83e9b6e18930cbda14cfec3cca2e1bf2 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 10:26:00 -0600 Subject: cell: remove old blend/depth/stencil/logicop structs --- src/gallium/drivers/cell/common.h | 29 ----------------------------- 1 file changed, 29 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 61d2b7d1ae..8aa2b23ec0 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -111,35 +111,6 @@ #define CELL_DEBUG_SYNC (1 << 1) -/** - */ -struct cell_command_depth_stencil_alpha_test -{ - uint64_t base; /**< Effective address of code start. */ - unsigned size; /**< Size in bytes of SPE code. */ - unsigned read_depth; /**< Flag: should depth be read? */ - unsigned read_stencil; /**< Flag: should stencil be read? */ - struct pipe_depth_stencil_alpha_state state; -}; - - -/** - * Upload code to perform framebuffer blend operation - */ -struct cell_command_blend -{ - uint64_t base; /**< Effective address of code start. */ - unsigned size; /**< Size in bytes of SPE code. */ - unsigned read_fb; /**< Flag: should framebuffer be read? */ -}; - - -struct cell_command_logicop -{ - uint64_t base; /**< Effective address of code start. */ - unsigned size; /**< Size in bytes of SPE code. 
*/ -}; - #define SPU_MAX_FRAGMENT_OPS_INSTS 64 -- cgit v1.2.3 From 924653e37db4501d0f03721e9d74abffe46a3c72 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 10:27:17 -0600 Subject: cell: don't build unused sources --- src/gallium/drivers/cell/spu/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile index e285ae9fdb..1ae0dfb8c1 100644 --- a/src/gallium/drivers/cell/spu/Makefile +++ b/src/gallium/drivers/cell/spu/Makefile @@ -22,12 +22,15 @@ SOURCES = \ spu_render.c \ spu_texture.c \ spu_tile.c \ - spu_tri.c \ + spu_tri.c + +OLD_SOURCES = \ spu_exec.c \ spu_util.c \ spu_vertex_fetch.c \ spu_vertex_shader.c + SPU_OBJECTS = $(SOURCES:.c=.o) \ SPU_ASM_OUT = $(SOURCES:.c=.s) \ -- cgit v1.2.3 From a558369ec66e3d9e2b88f4df9a3b5a3704b19ef3 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 10:33:13 -0600 Subject: cell: disable NEW_VS emit --- src/gallium/drivers/cell/ppu/cell_state_emit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 2bfb976c59..180b89c1f6 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -133,7 +133,8 @@ cell_emit_state(struct cell_context *cell) emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO, &cell->vertex_info, sizeof(struct vertex_info)); } - + +#if 0 if (cell->dirty & CELL_NEW_VS) { const struct draw_context *const draw = cell->draw; struct cell_shader_info info; @@ -148,4 +149,5 @@ cell_emit_state(struct cell_context *cell) emit_state_cmd(cell, CELL_CMD_STATE_BIND_VS, &info, sizeof(info)); } +#endif } -- cgit v1.2.3 From f6bf8d9d410d94372b72f4f6ede6196ae5a4a67f Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 10:33:24 -0600 Subject: cell: clean-up, comments --- 
src/gallium/drivers/cell/spu/spu_main.c | 52 ++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 20 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 29686964d2..2a7cb75f59 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -429,16 +429,14 @@ cmd_batch(uint opcode) printf("SPU %u: release batch buf %u\n", spu.init.id, buf); release_buffer(buf); + /* + * Loop over commands in the batch buffer + */ for (pos = 0; pos < usize; /* no incr */) { switch (buffer[pos]) { - case CELL_CMD_STATE_FRAMEBUFFER: - { - struct cell_command_framebuffer *fb - = (struct cell_command_framebuffer *) &buffer[pos]; - cmd_state_framebuffer(fb); - pos += sizeof(*fb) / 8; - } - break; + /* + * rendering commands + */ case CELL_CMD_CLEAR_SURFACE: { struct cell_command_clear_surface *clr @@ -456,6 +454,17 @@ cmd_batch(uint opcode) pos += pos_incr; } break; + /* + * state-update commands + */ + case CELL_CMD_STATE_FRAMEBUFFER: + { + struct cell_command_framebuffer *fb + = (struct cell_command_framebuffer *) &buffer[pos]; + cmd_state_framebuffer(fb); + pos += sizeof(*fb) / 8; + } + break; case CELL_CMD_STATE_FRAGMENT_OPS: { struct cell_command_fragment_ops *fops @@ -464,18 +473,6 @@ cmd_batch(uint opcode) pos += sizeof(*fops) / 8; } break; - case CELL_CMD_RELEASE_VERTS: - { - struct cell_command_release_verts *release - = (struct cell_command_release_verts *) &buffer[pos]; - cmd_release_verts(release); - pos += sizeof(*release) / 8; - } - break; - case CELL_CMD_FINISH: - cmd_finish(); - pos += 1; - break; case CELL_CMD_STATE_SAMPLER: { struct cell_command_sampler *sampler @@ -521,6 +518,21 @@ cmd_batch(uint opcode) &buffer[pos+1]); pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8); break; + /* + * misc commands + */ + case CELL_CMD_FINISH: + cmd_finish(); + pos += 1; + break; + case CELL_CMD_RELEASE_VERTS: + { + struct 
cell_command_release_verts *release + = (struct cell_command_release_verts *) &buffer[pos]; + cmd_release_verts(release); + pos += sizeof(*release) / 8; + } + break; case CELL_CMD_FLUSH_BUFFER_RANGE: { struct cell_buffer_range *br = (struct cell_buffer_range *) &buffer[pos+1]; -- cgit v1.2.3 From 73c6ae98c1c60635883a733f36d59d246e74aa2a Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 10:38:37 -0600 Subject: cell: remove old state CMDs, added comments --- src/gallium/drivers/cell/common.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 8aa2b23ec0..e989d8c2e5 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -84,7 +84,7 @@ #define CELL_CMD_BATCH 5 #define CELL_CMD_RELEASE_VERTS 6 #define CELL_CMD_STATE_FRAMEBUFFER 10 -#define CELL_CMD_STATE_DEPTH_STENCIL 11 +#define CELL_CMD_STATE_FRAGMENT_OPS 11 #define CELL_CMD_STATE_SAMPLER 12 #define CELL_CMD_STATE_TEXTURE 13 #define CELL_CMD_STATE_VERTEX_INFO 14 @@ -92,12 +92,9 @@ #define CELL_CMD_STATE_UNIFORMS 16 #define CELL_CMD_STATE_VS_ARRAY_INFO 17 #define CELL_CMD_STATE_BIND_VS 18 -#define CELL_CMD_STATE_BLEND 19 #define CELL_CMD_STATE_ATTRIB_FETCH 20 -#define CELL_CMD_STATE_LOGICOP 21 #define CELL_CMD_VS_EXECUTE 22 #define CELL_CMD_FLUSH_BUFFER_RANGE 23 -#define CELL_CMD_STATE_FRAGMENT_OPS 24 #define CELL_NUM_BUFFERS 4 @@ -112,8 +109,13 @@ +/** Max instructions for doing per-fragment operations */ #define SPU_MAX_FRAGMENT_OPS_INSTS 64 + +/** + * Command to specify per-fragment operations state and generated code. 
+ */ struct cell_command_fragment_ops { uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_OPS */ @@ -159,13 +161,15 @@ struct cell_array_info }; -struct cell_attribute_fetch_code { +struct cell_attribute_fetch_code +{ uint64_t base; uint size; }; -struct cell_buffer_range { +struct cell_buffer_range +{ uint64_t base; unsigned size; }; -- cgit v1.2.3 From 1b5331d7ebcf7b1a1693972cf13407184cab1e48 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 10:38:55 -0600 Subject: cell: fix typos in blend code-gen --- src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 7966c0916c..79a82ef72b 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -286,16 +286,16 @@ gen_blend(const struct pipe_blend_state *blend, spe_roti(f, mask_reg, mask_reg, 8); /* fbG = fbRGBA & mask */ - spe_and(f, fbB_reg, fbRGBA_reg, mask_reg); + spe_and(f, fbG_reg, fbRGBA_reg, mask_reg); /* fbG = fbG >> 8 */ - spe_roti(f, fbB_reg, fbB_reg, -8); + spe_roti(f, fbG_reg, fbG_reg, -8); /* mask = mask << 8 */ spe_roti(f, mask_reg, mask_reg, 8); /* fbR = fbRGBA & mask */ spe_and(f, fbR_reg, fbRGBA_reg, mask_reg); /* fbR = fbR >> 16 */ - spe_roti(f, fbB_reg, fbB_reg, -16); + spe_roti(f, fbR_reg, fbR_reg, -16); /* mask = mask << 8 */ spe_roti(f, mask_reg, mask_reg, 8); -- cgit v1.2.3 From 7ce1d0fb6700fd4998a095de2c9edf5ed920464c Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 10:52:03 -0600 Subject: cell: more comments, stub code for colormask/logicop/etc --- src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 107 ++++++++++++++------- 1 file changed, 70 insertions(+), 37 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c 
b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c index 3f0eabaa05..03dd547845 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -37,6 +37,8 @@ #include "spu_per_fragment_op.h" +#define LINEAR_QUAD_LAYOUT 1 + /** * Called by rasterizer for each quad after the shader has run. Do @@ -177,27 +179,28 @@ spu_fallback_fragment_ops(uint x, uint y, } if (spu.blend.blend_enable) { + /* blending terms, misc regs */ vector float term1r, term1g, term1b, term1a; vector float term2r, term2g, term2b, term2a; - - vector float fbRGBA[4]; - vector float one, tmp; - /* get colors from framebuffer */ + vector float fbRGBA[4]; /* current framebuffer colors */ + + /* get colors from framebuffer/tile */ { vector float fc[4]; uint c0, c1, c2, c3; -#if 0 - c0 = colorTile->ui[y+0][x+0]; - c1 = colorTile->ui[y+0][x+1]; - c2 = colorTile->ui[y+1][x+0]; - c3 = colorTile->ui[y+1][x+1]; -#else + +#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */ c0 = colorTile->ui[y][x*2+0]; c1 = colorTile->ui[y][x*2+1]; c2 = colorTile->ui[y][x*2+2]; c3 = colorTile->ui[y][x*2+3]; +#else + c0 = colorTile->ui[y+0][x+0]; + c1 = colorTile->ui[y+0][x+1]; + c2 = colorTile->ui[y+1][x+0]; + c3 = colorTile->ui[y+1][x+1]; #endif switch (spu.fb.color_format) { case PIPE_FORMAT_B8G8R8A8_UNORM: @@ -360,18 +363,11 @@ spu_fallback_fragment_ops(uint x, uint y, } - /* XXX do colormask test here */ - - - if (spu_extract(spu_orx(mask), 0)) { - spu.cur_ctile_status = TILE_STATUS_DIRTY; - } - else { - return; - } - - /* convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA */ + /* + * Convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA. + */ #if 0 + /* original code */ { vector float frag_soa[4]; frag_soa[0] = fragR; @@ -387,6 +383,9 @@ spu_fallback_fragment_ops(uint x, uint y, (void) fragB; #endif + /* + * Pack float colors into 32-bit RGBA words. 
+ */ switch (spu.fb.color_format) { case PIPE_FORMAT_A8R8G8B8_UNORM: c0 = spu_pack_A8R8G8B8(frag_aos[0]); @@ -406,24 +405,41 @@ spu_fallback_fragment_ops(uint x, uint y, ASSERT(0); } -#if 0 + /* - * Quad layout: - * +--+--+ - * |p0|p1| - * +--+--+ - * |p2|p3| - * +--+--+ + * Color masking */ - if (spu_extract(mask, 0)) - colorTile->ui[y+0][x+0] = c0; - if (spu_extract(mask, 1)) - colorTile->ui[y+0][x+1] = c1; - if (spu_extract(mask, 2)) - colorTile->ui[y+1][x+0] = c2; - if (spu_extract(mask, 3)) - colorTile->ui[y+1][x+1] = c3; -#else + if (spu.blend.colormask != 0xf) { + /* XXX to do */ + /* apply color mask to 32-bit packed colors */ + } + + + /* + * Logic Ops + */ + if (spu.blend.logicop_enable) { + /* XXX to do */ + /* apply logicop to 32-bit packed colors */ + } + + + /* + * If mask is non-zero, mark tile as dirty. + */ + if (spu_extract(spu_orx(mask), 0)) { + spu.cur_ctile_status = TILE_STATUS_DIRTY; + } + else { + return; + } + + + /* + * Write new quad colors to the framebuffer/tile. + * Only write pixels where the corresponding mask word is set. 
+ */ +#if LINEAR_QUAD_LAYOUT /* * Quad layout: * +--+--+--+--+ @@ -438,5 +454,22 @@ spu_fallback_fragment_ops(uint x, uint y, colorTile->ui[y][x*2+2] = c2; if (spu_extract(mask, 3)) colorTile->ui[y][x*2+3] = c3; +#else + /* + * Quad layout: + * +--+--+ + * |p0|p1| + * +--+--+ + * |p2|p3| + * +--+--+ + */ + if (spu_extract(mask, 0)) + colorTile->ui[y+0][x+0] = c0; + if (spu_extract(mask, 1)) + colorTile->ui[y+0][x+1] = c1; + if (spu_extract(mask, 2)) + colorTile->ui[y+1][x+0] = c2; + if (spu_extract(mask, 3)) + colorTile->ui[y+1][x+1] = c3; #endif } -- cgit v1.2.3 From fbf1586b36f8fb181ecee6a285c94f11e30005ba Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 12:01:19 -0600 Subject: gallium: typo: s/PIPE_FORMAT_Z16UNORM/PIPE_FORMAT_Z16_UNORM/ --- src/gallium/winsys/xlib/xm_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/gallium') diff --git a/src/gallium/winsys/xlib/xm_api.c b/src/gallium/winsys/xlib/xm_api.c index 28bd6ceab4..d28a6423b9 100644 --- a/src/gallium/winsys/xlib/xm_api.c +++ b/src/gallium/winsys/xlib/xm_api.c @@ -354,7 +354,7 @@ create_xmesa_buffer(XMesaDrawable d, BufferType type, depthFormat = PIPE_FORMAT_S8Z24_UNORM; #else else if (vis->mesa_visual.depthBits <= 16) - depthFormat = PIPE_FORMAT_Z16UNORM; + depthFormat = PIPE_FORMAT_Z16_UNORM; else if (vis->mesa_visual.depthBits <= 24) depthFormat = PIPE_FORMAT_S8Z24_UNORM; else -- cgit v1.2.3 From be925ab6e8ecf6758adb2c6f2c423af31c5f86ca Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 15:48:13 -0600 Subject: cell: put cell_ prefix on gen_fragment_function() --- src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 4 ++-- src/gallium/drivers/cell/ppu/cell_gen_fragment.h | 2 +- src/gallium/drivers/cell/ppu/cell_state_emit.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 79a82ef72b..5622701dda 
100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -616,7 +616,7 @@ gen_pack_colors(struct spe_function *f, * \param f the generated function (out) */ void -gen_fragment_function(struct cell_context *cell, struct spe_function *f) +cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) { const struct pipe_depth_stencil_alpha_state *dsa = &cell->depth_stencil->base; @@ -850,7 +850,7 @@ gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_release_register(f, rgba_reg); } - printf("gen_fragment_ops nr instructions: %u\n", f->num_inst); + //printf("gen_fragment_ops nr instructions: %u\n", f->num_inst); spe_bi(f, SPE_REG_RA, 0, 0); /* return from function call */ diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h index 0ea0fc690c..b59de198dc 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h @@ -31,7 +31,7 @@ extern void -gen_fragment_function(struct cell_context *cell, struct spe_function *f); +cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f); #endif /* CELL_GEN_FRAGMENT_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 180b89c1f6..3ebf0749ad 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -85,7 +85,7 @@ cell_emit_state(struct cell_context *cell) struct spe_function spe_code; /* generate new code */ - gen_fragment_function(cell, &spe_code); + cell_gen_fragment_function(cell, &spe_code); /* put the new code into the batch buffer */ fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS; memcpy(&fops->code, spe_code.store, -- cgit v1.2.3 From 178bbaff80d079606a1135bd65f1a85bac9774c4 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 17:07:30 -0600 Subject: gallium: 
add special cases in spe_load_float(), spe_load_int(), added spe_splat() --- src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 45 +++++++++++++++++++++++------ src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 4 +++ 2 files changed, 40 insertions(+), 9 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c index 61010e4333..a04cc6c4ff 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c @@ -473,21 +473,48 @@ EMIT_R (spe_mtspr, 0x10c); void spe_load_float(struct spe_function *p, unsigned rT, float x) { - union { - float f; - unsigned u; - } bits; - bits.f = x; - spe_ilhu(p, rT, bits.u >> 16); - spe_iohl(p, rT, bits.u & 0xffff); + if (x == 0.0f) { + spe_il(p, rT, 0x0); + } + else if (x == 0.5f) { + spe_ilhu(p, rT, 0x3f00); + } + else if (x == 1.0f) { + spe_ilhu(p, rT, 0x3f80); + } + else if (x == -1.0f) { + spe_ilhu(p, rT, 0xbf80); + } + else { + union { + float f; + unsigned u; + } bits; + bits.f = x; + spe_ilhu(p, rT, bits.u >> 16); + spe_iohl(p, rT, bits.u & 0xffff); + } } void spe_load_int(struct spe_function *p, unsigned rT, int i) { - spe_ilhu(p, rT, i >> 16); - spe_iohl(p, rT, i & 0xffff); + if (-32768 <= i && i <= 32767) { + spe_il(p, rT, i); + } + else { + spe_ilhu(p, rT, i >> 16); + spe_iohl(p, rT, i & 0xffff); + } +} + + +void +spe_splat(struct spe_function *p, unsigned rT, unsigned rA) +{ + spe_ila(p, rT, 66051); + spe_shufb(p, rT, rA, rA, rT); } diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h index dee8c55c4a..d95e5aace3 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h @@ -292,6 +292,10 @@ spe_load_float(struct spe_function *p, unsigned rT, float x); extern void spe_load_int(struct spe_function *p, unsigned rT, int i); +/** Replicate word 0 of rA across rT. 
*/ +extern void +spe_splat(struct spe_function *p, unsigned rT, unsigned rA); + /** Complement/invert all bits in rT. */ extern void spe_complement(struct spe_function *p, unsigned rT); -- cgit v1.2.3 From bc304bbd49d15ce1130f3ba07adaa85ef03ed931 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 17:08:52 -0600 Subject: cell: minor improvements to fragment code-gen --- src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 5622701dda..06219d4e98 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -265,6 +265,8 @@ gen_blend(const struct pipe_blend_state *blend, int one_reg = spe_allocate_available_register(f); int tmp_reg = spe_allocate_available_register(f); + boolean one_reg_set = false; /* avoid setting one_reg more than once */ + ASSERT(blend->blend_enable); /* Unpack/convert framebuffer colors from four 32-bit packed colors @@ -275,7 +277,7 @@ gen_blend(const struct pipe_blend_state *blend, int mask_reg = spe_allocate_available_register(f); /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */ - spe_fsmbi(f, mask_reg, 0x1111); + spe_load_int(f, mask_reg, 0xff); /* XXX there may be more clever ways to implement the following code */ switch (color_format) { @@ -418,7 +420,10 @@ gen_blend(const struct pipe_blend_state *blend, break; case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* one = {1.0, 1.0, 1.0, 1.0} */ - spe_load_float(f, one_reg, 1.0f); + if (!one_reg_set) { + spe_load_float(f, one_reg, 1.0f); + one_reg_set = true; + } /* tmp = one - fragA */ spe_fs(f, tmp_reg, one_reg, fragA_reg); /* term = fb * tmp */ @@ -446,7 +451,10 @@ gen_blend(const struct pipe_blend_state *blend, break; case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* one = {1.0, 1.0, 1.0, 1.0} */ - spe_load_float(f, 
one_reg, 1.0f); + if (!one_reg_set) { + spe_load_float(f, one_reg, 1.0f); + one_reg_set = true; + } /* tmp = one - fragA */ spe_fs(f, tmp_reg, one_reg, fragA_reg); /* termA = fbA * tmp */ -- cgit v1.2.3 From 084ab37b7f34d509af995efaef4615289669f72b Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 17:10:10 -0600 Subject: cell: fix tile twidding bug seen in the event of multiple expose events --- src/gallium/winsys/xlib/xm_winsys.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c index c4a30d3702..2acbc94fc8 100644 --- a/src/gallium/winsys/xlib/xm_winsys.c +++ b/src/gallium/winsys/xlib/xm_winsys.c @@ -289,21 +289,19 @@ xm_buffer_destroy(struct pipe_winsys *pws, * +--+--+ */ static void -twiddle_tile(uint *tile) +twiddle_tile(const uint *tileIn, uint *tileOut) { - uint tile2[TILE_SIZE * TILE_SIZE]; int y, x; for (y = 0; y < TILE_SIZE; y+=2) { for (x = 0; x < TILE_SIZE; x+=2) { int k = 4 * (y/2 * TILE_SIZE/2 + x/2); - tile2[y * TILE_SIZE + (x + 0)] = tile[k]; - tile2[y * TILE_SIZE + (x + 1)] = tile[k+1]; - tile2[(y + 1) * TILE_SIZE + (x + 0)] = tile[k+2]; - tile2[(y + 1) * TILE_SIZE + (x + 1)] = tile[k+3]; + tileOut[y * TILE_SIZE + (x + 0)] = tileIn[k]; + tileOut[y * TILE_SIZE + (x + 1)] = tileIn[k+1]; + tileOut[(y + 1) * TILE_SIZE + (x + 0)] = tileIn[k+2]; + tileOut[(y + 1) * TILE_SIZE + (x + 1)] = tileIn[k+3]; } } - memcpy(tile, tile2, sizeof(tile2)); } @@ -339,6 +337,7 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf) for (y = 0; y < surf->height; y += TILE_SIZE) { for (x = 0; x < surf->width; x += TILE_SIZE) { + uint tmpTile[TILE_SIZE * TILE_SIZE]; int tx = x / TILE_SIZE; int ty = y / TILE_SIZE; int offset = ty * tilesPerRow + tx; @@ -352,9 +351,9 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf) offset *= 4 * TILE_SIZE * TILE_SIZE; - ximage->data = (char 
*) xm_buf->data + offset; - - twiddle_tile((uint *) ximage->data); + twiddle_tile((uint *) ((char *) xm_buf->data + offset), + tmpTile); + ximage->data = (char*) tmpTile; if (XSHM_ENABLED(xm_buf)) { #if defined(USE_XSHM) && !defined(XFree86Server) -- cgit v1.2.3 From aa66f08a21b791f338b519f0c2162cd8f7b3aeb0 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 11 Sep 2008 17:59:52 -0600 Subject: cell: initial support for fragment shader code generation. TGSI shaders are translated into SPE instructions which are then sent to the SPEs for execution. Only a few opcodes work, no swizzling yet, no support for constants/immediates, etc. --- src/gallium/drivers/cell/common.h | 15 + src/gallium/drivers/cell/ppu/Makefile | 1 + src/gallium/drivers/cell/ppu/cell_context.h | 1 + src/gallium/drivers/cell/ppu/cell_gen_fp.c | 523 +++++++++++++++++++++++ src/gallium/drivers/cell/ppu/cell_gen_fp.h | 42 ++ src/gallium/drivers/cell/ppu/cell_state_emit.c | 16 + src/gallium/drivers/cell/ppu/cell_state_shader.c | 8 +- src/gallium/drivers/cell/spu/spu_main.c | 25 +- src/gallium/drivers/cell/spu/spu_main.h | 15 + src/gallium/drivers/cell/spu/spu_tri.c | 35 ++ 10 files changed, 678 insertions(+), 3 deletions(-) create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fp.c create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fp.h (limited to 'src/gallium') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index e989d8c2e5..cb0631baf5 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -92,6 +92,7 @@ #define CELL_CMD_STATE_UNIFORMS 16 #define CELL_CMD_STATE_VS_ARRAY_INFO 17 #define CELL_CMD_STATE_BIND_VS 18 +#define CELL_CMD_STATE_FRAGMENT_PROGRAM 19 #define CELL_CMD_STATE_ATTRIB_FETCH 20 #define CELL_CMD_VS_EXECUTE 22 #define CELL_CMD_FLUSH_BUFFER_RANGE 23 @@ -125,6 +126,20 @@ struct cell_command_fragment_ops }; +/** Max instructions for fragment programs */ +#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128 + +/** + * 
Command to send a fragment program to SPUs. + */ +struct cell_command_fragment_program +{ + uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */ + uint num_inst; /**< Number of instructions */ + unsigned code[SPU_MAX_FRAGMENT_PROGRAM_INSTS]; +}; + + /** * Tell SPUs about the framebuffer size, location */ diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile index 8699f3f8ec..b28f4c5c31 100644 --- a/src/gallium/drivers/cell/ppu/Makefile +++ b/src/gallium/drivers/cell/ppu/Makefile @@ -26,6 +26,7 @@ SOURCES = \ cell_draw_arrays.c \ cell_flush.c \ cell_gen_fragment.c \ + cell_gen_fp.c \ cell_state_derived.c \ cell_state_emit.c \ cell_state_shader.c \ diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h index 8cec9f45b2..14914b9c6f 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.h +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -61,6 +61,7 @@ struct cell_fragment_shader_state { struct pipe_shader_state shader; struct tgsi_shader_info info; + struct spe_function code; void *data; }; diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c new file mode 100644 index 0000000000..6ffe94eb14 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -0,0 +1,523 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +/** + * Generate SPU fragment program/shader code. + * + * Note that we generate SOA-style code here. So each TGSI instruction + * operates on four pixels (and is translated into four SPU instructions, + * generally speaking). + * + * \author Brian Paul + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_dump.h" +#include "rtasm/rtasm_ppc_spe.h" +#include "util/u_memory.h" +#include "cell_context.h" +#include "cell_gen_fp.h" + + +/** Set to 1 to enable debug/disassembly printfs */ +#define DISASSEM 01 + + +/** + * Context needed during code generation. 
+ */ +struct codegen +{ + int inputs_reg; /**< 1st function parameter */ + int outputs_reg; /**< 2nd function parameter */ + int constants_reg; /**< 3rd function parameter */ + int temp_regs[8][4]; /**< maps TGSI temps to SPE registers */ + + int one_reg; /**< register containing {1.0, 1.0, 1.0, 1.0} */ + + /** Per-instruction temps / intermediate temps */ + int num_itemps; + int itemps[3]; + + struct spe_function *f; + boolean error; +}; + + +/** + * Allocate an intermediate temporary register. + */ +static int +get_itemp(struct codegen *gen) +{ + int t = spe_allocate_available_register(gen->f); + assert(gen->num_itemps < Elements(gen->itemps)); + gen->itemps[gen->num_itemps++] = t; + return t; +} + +/** + * Free all intermediate temporary registers. To be called after each + * instruction has been emitted. + */ +static void +free_itemps(struct codegen *gen) +{ + int i; + for (i = 0; i < gen->num_itemps; i++) { + spe_release_register(gen->f, gen->itemps[i]); + } + gen->num_itemps = 0; +} + + +/** + * Return index of an SPE register containing {1.0, 1.0, 1.0, 1.0}. + * The register is allocated and initialized upon the first call. + */ +static int +get_const_one_reg(struct codegen *gen) +{ + if (gen->one_reg <= 0) { + gen->one_reg = spe_allocate_available_register(gen->f); + } + + /* one = {1.0, 1.0, 1.0, 1.0} */ + spe_load_float(gen->f, gen->one_reg, 1.0f); +#if DISASSEM + printf("il\tr%d, 1.0f\n", gen->one_reg); +#endif + + return gen->one_reg; +} + + +/** + * Return the index of the SPU temporary containing the named TGSI + * source register. If the TGSI register is a TGSI_FILE_TEMPORARY we + * just return the corresponding SPE register. If the TGSI register + * is TGSI_FILE_INPUT/CONSTANT/IMMEDIATE we allocate a new SPE register + * and emit an SPE load instruction. + */ +static int +get_src_reg(struct codegen *gen, + int channel, + const struct tgsi_full_src_register *src) +{ + int reg; + + /* XXX need to examine src swizzle info here. 
+ * That will involve changing the channel var... + */ + + + switch (src->SrcRegister.File) { + case TGSI_FILE_TEMPORARY: + reg = gen->temp_regs[src->SrcRegister.Index][channel]; + break; + case TGSI_FILE_INPUT: + { + /* offset is measured in quadwords, not bytes */ + int offset = src->SrcRegister.Index * 4 + channel; + reg = get_itemp(gen); + /* Load: reg = memory[(machine_reg) + offset] */ + spe_lqd(gen->f, reg, gen->inputs_reg, offset); +#if DISASSEM + printf("lqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset); +#endif + } + break; + case TGSI_FILE_IMMEDIATE: + /* xxx fall-through for now / fix */ + case TGSI_FILE_CONSTANT: + /* xxx fall-through for now / fix */ + default: + assert(0); + } + + return reg; +} + + +/** + * Return the index of an SPE register to use for the given TGSI register. + * If the TGSI register is TGSI_FILE_TEMPORARY, the index of the + * corresponding SPE register is returned. If the TGSI register is + * TGSI_FILE_OUTPUT we allocate an intermediate temporary register. + * See store_dest_reg() below... + */ +static int +get_dst_reg(struct codegen *gen, + int channel, + const struct tgsi_full_dst_register *dest) +{ + int reg; + + switch (dest->DstRegister.File) { + case TGSI_FILE_TEMPORARY: + reg = gen->temp_regs[dest->DstRegister.Index][channel]; + break; + case TGSI_FILE_OUTPUT: + reg = get_itemp(gen); + break; + default: + assert(0); + } + + return reg; +} + + +/** + * When a TGSI instruction is writing to an output register, this + * function emits the SPE store instruction to store the value_reg. + * \param value_reg the SPE register containing the value to store. + * This would have been returned by get_dst_reg(). 
+ */ +static void +store_dest_reg(struct codegen *gen, + int value_reg, int channel, + const struct tgsi_full_dst_register *dest) +{ + switch (dest->DstRegister.File) { + case TGSI_FILE_TEMPORARY: + /* no-op */ + break; + case TGSI_FILE_OUTPUT: + { + /* offset is measured in quadwords, not bytes */ + int offset = dest->DstRegister.Index * 4 + channel; + /* Store: memory[(machine_reg) + offset] = reg */ + spe_stqd(gen->f, value_reg, gen->outputs_reg, offset); +#if DISASSEM + printf("stqd\tr%d, r%d + %d\n", value_reg, gen->outputs_reg, offset); +#endif + } + break; + default: + assert(0); + } +} + + +static boolean +emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* XXX we don't always need to actually emit a mov instruction here */ + spe_move(gen->f, dst_reg, src_reg); +#if DISASSEM + printf("mov\tr%d, r%d\n", dst_reg, src_reg); +#endif + store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + + +/** + * Emit addition instructions. Recall that a single TGSI_OPCODE_ADD + * becomes (up to) four SPU "fa" instructions because we're doing SOA + * processing. + */ +static boolean +emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + /* Loop over Red/Green/Blue/Alpha channels */ + for (ch = 0; ch < 4; ch++) { + /* If the dest R, G, B or A writemask is enabled... 
*/ + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + /* get indexes of the two src, one dest SPE registers */ + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* Emit actual SPE instruction: d = s1 + s2 */ + spe_fa(gen->f, d_reg, s1_reg, s2_reg); +#if DISASSEM + printf("fa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); +#endif + + /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + /* Free any intermediate temps we allocated */ + free_itemps(gen); + } + } + return true; +} + + +/** + * Emit multiply. See emit_ADD for comments. + */ +static boolean +emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* d = s1 * s2 */ + spe_fm(gen->f, d_reg, s1_reg, s2_reg); +#if DISASSEM + printf("fm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); +#endif + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + + +/** + * Emit set-if-greater-than. + * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as + * the result but OpenGL/TGSI needs 0.0 and 1.0 results. + * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND. 
+ */ +static boolean +emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 > s2) */ + spe_fcgt(gen->f, d_reg, s1_reg, s2_reg); +#if DISASSEM + printf("fcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); +#endif + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = d & one_reg */ + spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); +#if DISASSEM + printf("and\tr%d, r%d, r%d\n", d_reg, d_reg, get_const_one_reg(gen)); +#endif + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + + +/** + * Emit END instruction. + * We just return from the shader function at this point. + * + * Note that there may be more code after this that would be + * called by TGSI_OPCODE_CALL. + */ +static boolean +emit_END(struct codegen *gen) +{ + /* return from function call */ + spe_bi(gen->f, SPE_REG_RA, 0, 0); +#if DISASSEM + printf("bi\trRA\n"); +#endif + return true; +} + + +/** + * Emit code for the given instruction. Just a big switch stmt. + */ +static boolean +emit_instruction(struct codegen *gen, + const struct tgsi_full_instruction *inst) +{ + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_MOV: + return emit_MOV(gen, inst); + case TGSI_OPCODE_MUL: + return emit_MUL(gen, inst); + case TGSI_OPCODE_ADD: + return emit_ADD(gen, inst); + case TGSI_OPCODE_SGT: + return emit_SGT(gen, inst); + case TGSI_OPCODE_END: + return emit_END(gen); + + /* XXX lots more cases to do... */ + + default: + return false; + } + + return true; +} + + + +/** + * Emit "code" for a TGSI declaration. + * We only care about TGSI TEMPORARY register declarations at this time. 
+ * For each TGSI TEMPORARY we allocate four SPE registers. + */ +static void +emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl) +{ + int i, ch; + + switch (decl->Declaration.File) { + case TGSI_FILE_TEMPORARY: +#if DISASSEM + printf("Declare temp reg %d .. %d\n", + decl->DeclarationRange.First, + decl->DeclarationRange.Last); +#endif + for (i = decl->DeclarationRange.First; + i <= decl->DeclarationRange.Last; + i++) { + for (ch = 0; ch < 4; ch++) { + gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f); + } + + /* XXX if we run out of SPE registers, we need to spill + * to SPU memory. someday... + */ + +#if DISASSEM + printf(" SPE regs: %d %d %d %d\n", + gen->temp_regs[i][0], + gen->temp_regs[i][1], + gen->temp_regs[i][2], + gen->temp_regs[i][3]); +#endif + } + break; + default: + ; /* ignore */ + } +} + + +/** + * Translate TGSI shader code to SPE instructions. This is done when + * the state tracker gives us a new shader (via pipe->create_fs_state()). + * + * \param cell the rendering context (in) + * \param tokens the TGSI shader (in) + * \param f the generated function (out) + */ +boolean +cell_gen_fragment_program(struct cell_context *cell, + const struct tgsi_token *tokens, + struct spe_function *f) +{ + struct tgsi_parse_context parse; + struct codegen gen; + + memset(&gen, 0, sizeof(gen)); + gen.f = f; + + /* For SPE function calls: reg $3 = first param, $4 = second param, etc. 
*/ + gen.inputs_reg = 3; /* pointer to inputs array */ + gen.outputs_reg = 4; /* pointer to outputs array */ + gen.constants_reg = 5; /* pointer to constants array */ + + spe_init_func(f, SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE); + spe_allocate_register(f, gen.inputs_reg); + spe_allocate_register(f, gen.outputs_reg); + spe_allocate_register(f, gen.constants_reg); + +#if DISASSEM + printf("Begin %s\n", __FUNCTION__); + tgsi_dump(tokens, 0); +#endif + + tgsi_parse_init(&parse, tokens); + + while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: +#if 0 + if (!note_immediate(&gen, &parse.FullToken.FullImmediate )) + goto fail; +#endif + break; + + case TGSI_TOKEN_TYPE_DECLARATION: + emit_declaration(&gen, &parse.FullToken.FullDeclaration); + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + if (!emit_instruction(&gen, &parse.FullToken.FullInstruction )) { + gen.error = true; + } + break; + + default: + assert(0); + + } + } + + + if (gen.error) { + /* terminate the SPE code */ + return emit_END(&gen); + } + +#if DISASSEM + printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst); + printf("End %s\n", __FUNCTION__); +#endif + + tgsi_parse_free( &parse ); + + return !gen.error; +} diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.h b/src/gallium/drivers/cell/ppu/cell_gen_fp.h new file mode 100644 index 0000000000..99faea7046 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.h @@ -0,0 +1,42 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + + +#ifndef CELL_GEN_FP_H +#define CELL_GEN_FP_H + + + +extern boolean +cell_gen_fragment_program(struct cell_context *cell, + const struct tgsi_token *tokens, + struct spe_function *f); + + +#endif /* CELL_GEN_FP_H */ + diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 3ebf0749ad..2da3097983 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -73,6 +73,22 @@ cell_emit_state(struct cell_context *cell) #endif } + if (cell->dirty & (CELL_NEW_FS)) { + /* Send new fragment program to SPUs */ + struct cell_command_fragment_program *fp + = cell_batch_alloc(cell, sizeof(*fp)); + fp->opcode = CELL_CMD_STATE_FRAGMENT_PROGRAM; + fp->num_inst = cell->fs->code.num_inst; + memcpy(&fp->code, cell->fs->code.store, + SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE); + if (0) { + int i; + printf("PPU Emit CELL_CMD_STATE_FRAGMENT_PROGRAM:\n"); + for (i = 0; i < fp->num_inst; i++) { + printf(" %3d: 0x%08x\n", i, fp->code[i]); + } + } + } if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_DEPTH_STENCIL | diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c index 97e44eeb1a..3a0d066da2 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_shader.c +++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c @@ -34,7 +34,7 @@ #include "cell_context.h" #include "cell_state.h" - +#include "cell_gen_fp.h" /** cast wrapper */ @@ -61,7 +61,7 @@ static void * cell_create_fs_state(struct pipe_context *pipe, const struct pipe_shader_state *templ) { - /*struct cell_context *cell = cell_context(pipe);*/ + struct cell_context *cell = cell_context(pipe); struct cell_fragment_shader_state *cfs; cfs = CALLOC_STRUCT(cell_fragment_shader_state); @@ -76,6 +76,8 @@ cell_create_fs_state(struct pipe_context *pipe, tgsi_scan_shader(templ->tokens, 
&cfs->info); + cell_gen_fragment_program(cell, cfs->shader.tokens, &cfs->code); + return cfs; } @@ -102,6 +104,8 @@ cell_delete_fs_state(struct pipe_context *pipe, void *fs) { struct cell_fragment_shader_state *cfs = cell_fragment_shader_state(fs); + spe_release_func(&cfs->code); + FREE((void *) cfs->shader.tokens); FREE(cfs); } diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 2a7cb75f59..78260c4259 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -232,7 +232,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id); /* Copy SPU code from batch buffer to spu buffer */ memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); - /* Copy state info */ + /* Copy state info (for fallback case only) */ memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); @@ -244,6 +244,21 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) } +/** Handle a fragment program command: copy the SPU code carried by fp into spu.fragment_program_code and point spu.fragment_program at that buffer. */ static void +cmd_state_fragment_program(const struct cell_command_fragment_program *fp) +{ + if (Debug) + printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id); + /* Copy SPU code from batch buffer to spu buffer. The full SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4 bytes are copied; fp->num_inst is not consulted here. */ + memcpy(spu.fragment_program_code, fp->code, + SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4); +#if 01 + /* Point function pointer at new code (the code buffer itself is cast to the function type) */ + spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code; +#endif +} + + static void cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) { @@ -473,6 +488,14 @@ cmd_batch(uint opcode) pos += sizeof(*fops) / 8; } break; + case CELL_CMD_STATE_FRAGMENT_PROGRAM: + { + struct cell_command_fragment_program *fp + = (struct cell_command_fragment_program *) &buffer[pos]; + cmd_state_fragment_program(fp); + pos += sizeof(*fp) / 8; /* advance past this command; pos is counted in 8-byte units */ + } + break; case CELL_CMD_STATE_SAMPLER: { struct 
cell_command_sampler *sampler diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index d40539da83..2c7b625840 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -75,6 +75,12 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y, vector float fragAlpha, vector unsigned int mask); +/** Function for running a fragment program; takes arrays of input, output and constant vectors */ +typedef void (*spu_fragment_program_func)(vector float *inputs, + vector float *outputs, + vector float *constants); + + struct spu_framebuffer { void *color_start; /**< addr of color surface in main memory */ @@ -142,9 +148,18 @@ struct spu_global /** Current fragment ops function */ spu_fragment_ops_func fragment_ops; + /** Current fragment program machine code */ + uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS]; + /** Current fragment program function */ + spu_fragment_program_func fragment_program; + /** Current texture sampler function */ spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS]; + /** Fragment program constants (XXX preliminary/used) */ +#define MAX_CONSTANTS 32 + vector float constants[MAX_CONSTANTS]; + } ALIGN16_ATTRIB; diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index f02cdd1f76..8b93878192 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -314,7 +314,42 @@ emit_quad( int x, int y, mask_t mask ) } else { /* simple shading */ +#if 0 eval_coeff(1, (float) x, (float) y, colors); + +#else + /* XXX new fragment program code */ + + if (spu.fragment_program) { + vector float inputs[4*4], outputs[2*4]; + + /* setup inputs */ + eval_coeff(1, (float) x, (float) y, inputs); + + /* Execute the current fragment program */ + spu.fragment_program(inputs, outputs, spu.constants); + + /* Copy outputs: the first four output vectors become colors[0..3] */ + colors[0] = outputs[0*4+0]; + colors[1] = outputs[0*4+1]; + colors[2] = outputs[0*4+2]; + colors[3] = outputs[0*4+3]; + + if (0 && 
spu.init.id==0 && y == 48) { + printf("colors[0] = %f %f %f %f\n", + spu_extract(colors[0], 0), + spu_extract(colors[0], 1), + spu_extract(colors[0], 2), + spu_extract(colors[0], 3)); + printf("colors[1] = %f %f %f %f\n", + spu_extract(colors[1], 0), + spu_extract(colors[1], 1), + spu_extract(colors[1], 2), + spu_extract(colors[1], 3)); + } + + } +#endif } -- cgit v1.2.3