diff options
Diffstat (limited to 'src/gallium')
32 files changed, 2966 insertions, 568 deletions
diff --git a/src/gallium/Makefile b/src/gallium/Makefile index aa77021daf..36bd3623e7 100644 --- a/src/gallium/Makefile +++ b/src/gallium/Makefile @@ -3,6 +3,7 @@ include $(TOP)/configs/current SUBDIRS = auxiliary drivers +# Note winsys/ needs to be built after src/mesa default: subdirs diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c index 44563803f9..79a19d6be2 100644 --- a/src/gallium/auxiliary/draw/draw_vs_exec.c +++ b/src/gallium/auxiliary/draw/draw_vs_exec.c @@ -46,6 +46,7 @@ struct exec_vertex_shader { struct draw_vertex_shader base; struct tgsi_exec_machine *machine; + const struct tgsi_token *machine_tokens; }; static struct exec_vertex_shader *exec_vertex_shader( struct draw_vertex_shader *vs ) @@ -62,12 +63,16 @@ vs_exec_prepare( struct draw_vertex_shader *shader, { struct exec_vertex_shader *evs = exec_vertex_shader(shader); - /* specify the vertex program to interpret/execute */ - tgsi_exec_machine_bind_shader(evs->machine, - shader->state.tokens, - PIPE_MAX_SAMPLERS, - NULL /*samplers*/ ); - + /* Specify the vertex program to interpret/execute. + * Avoid rebinding when possible. + */ + if (evs->machine_tokens != shader->state.tokens) { + tgsi_exec_machine_bind_shader(evs->machine, + shader->state.tokens, + PIPE_MAX_SAMPLERS, + NULL /*samplers*/ ); + evs->machine_tokens = shader->state.tokens; + } } diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c new file mode 100644 index 0000000000..534a23568d --- /dev/null +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c @@ -0,0 +1,365 @@ +/************************************************************************** + * + * Copyright (C) 2008 Tungsten Graphics, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * PPC code generation. + * \author Brian Paul + */ + + +#include "util/u_memory.h" +#include "pipe/p_debug.h" +#include "rtasm_ppc.h" + + +void +ppc_init_func(struct ppc_function *p, unsigned max_inst) +{ + p->store = align_malloc(max_inst * PPC_INST_SIZE, 16); + p->num_inst = 0; + p->max_inst = max_inst; + p->vec_used = ~0; +} + + +void +ppc_release_func(struct ppc_function *p) +{ + assert(p->num_inst <= p->max_inst); + if (p->store != NULL) { + align_free(p->store); + } + p->store = NULL; +} + + +/** + * Alloate a vector register. + * \return register index or -1 if none left. + */ +int +ppc_allocate_vec_register(struct ppc_function *p, int reg) +{ + unsigned i; + for (i = 0; i < PPC_NUM_VEC_REGS; i++) { + const uint64_t mask = 1 << i; + if ((p->vec_used & mask) != 0) { + p->vec_used &= ~mask; + return i; + } + } + + return -1; +} + + +/** + * Mark the given vector register as "unallocated". + */ +void +ppc_release_vec_register(struct ppc_function *p, int reg) +{ + assert(reg < PPC_NUM_VEC_REGS); + assert((p->vec_used & (1 << reg)) == 0); + + p->vec_used |= (1 << reg); +} + + + +union vx_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned vD:5; + unsigned vA:5; + unsigned vB:5; + unsigned op2:11; + } inst; +}; + +union vxr_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned vD:5; + unsigned vA:5; + unsigned vB:5; + unsigned rC:1; + unsigned op2:10; + } inst; +}; + +union va_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned vD:5; + unsigned vA:5; + unsigned vB:5; + unsigned vC:5; + unsigned op2:6; + } inst; +}; + + +static inline void +emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB) +{ + union vx_inst inst; + inst.inst.op = 4; + inst.inst.vD = vD; + inst.inst.vA = vA; + inst.inst.vB = vB; + inst.inst.op2 = op2; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); +}; + +static inline void +emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB) +{ + union vxr_inst inst; + inst.inst.op = 4; + inst.inst.vD = vD; + inst.inst.vA = vA; + inst.inst.vB = vB; + inst.inst.rC = 0; + inst.inst.op2 = op2; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); +}; + +static inline void +emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC) +{ + union va_inst inst; + inst.inst.op = 4; + inst.inst.vD = vD; + inst.inst.vA = vA; + inst.inst.vB = vB; + inst.inst.vC = vC; + inst.inst.op2 = op2; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); +}; + + + +/** + ** float vector arithmetic + **/ + +/** vector float add */ +void +ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB) +{ + emit_vx(p, 10, vD, vA, vB); +} + +/** vector float substract */ +void +ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 74, vD, vA, vB); +} + +/** vector float min */ +void +ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1098, vD, vA, vB); +} + +/** vector float max */ +void +ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1034, vD, vA, vB); +} + +/** vector float mult add */ +void +ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC) +{ + emit_va(p, 46, vD, vA, vB, vC); +} + +/** vector float compare greater than */ +void +ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vxr(p, 710, vD, vA, vB); +} + +/** vector float compare greater than or equal to */ +void +ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vxr(p, 454, vD, vA, vB); +} + +/** vector float compare equal */ +void +ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vxr(p, 198, vD, vA, vB); +} + +/** vector float 2^x */ +void +ppc_vexptefp(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 394, vD, 0, vB); +} + +/** vector float log2(x) */ +void +ppc_vlogefp(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 458, vD, 0, vB); +} + +/** vector float reciprocol */ +void +ppc_vrefp(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 266, vD, 0, vB); +} + +/** vector float reciprocol sqrt estimate */ +void +ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 330, vD, 0, vB); +} + +/** vector float round to negative infinity */ +void +ppc_vrfim(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 714, vD, 0, vB); +} + +/** vector float round to positive infinity */ +void +ppc_vrfip(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 650, vD, 0, vB); +} + +/** vector float round to nearest int */ +void +ppc_vrfin(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 522, vD, 0, vB); +} + +/** vector float round to int toward zero */ +void +ppc_vrfiz(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 586, vD, 0, vB); +} + + + +/** + ** bitwise operations + **/ + + +/** vector and */ +void +ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1028, vD, vA, vB); +} + +/** vector and complement */ +void +ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1092, vD, vA, vB); +} + +/** vector or */ +void +ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1156, vD, vA, vB); +} + +/** vector nor */ +void +ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1284, vD, vA, vB); +} + +/** vector xor */ +void +ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1220, vD, vA, vB); +} + + +/** + ** Vector shuffle / select / splat / etc + **/ + +/** vector permute */ +void +ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC) +{ + emit_va(p, 43, vD, vA, vB, vC); +} + +/** vector select */ +void +ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC) +{ + emit_va(p, 42, vD, vA, vB, vC); +} + +/** vector splat byte */ +void +ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm) +{ + emit_vx(p, 42, vD, imm, vB); +} + +/** vector splat half word */ +void +ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm) +{ + emit_vx(p, 588, vD, imm, vB); +} + +/** vector splat word */ +void +ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm) +{ + emit_vx(p, 652, vD, imm, vB); +} diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h new file mode 100644 index 0000000000..ed14e943df --- /dev/null +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h @@ -0,0 +1,181 @@ +/************************************************************************** + * + * Copyright (C) 2008 Tungsten Graphics, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * PPC code generation. + * \author Brian Paul + */ + + +#ifndef RTASM_PPC_H +#define RTASM_PPC_H + + +#include "pipe/p_compiler.h" + + +#define PPC_INST_SIZE 4 /**< 4 bytes / instruction */ + +#define PPC_NUM_VEC_REGS 32 + + +struct ppc_function +{ + uint32_t *store; /**< instruction buffer */ + uint num_inst; + uint max_inst; + uint32_t vec_used; /** used/free vector registers bitmask */ + uint32_t reg_used; /** used/free general-purpose registers bitmask */ +}; + + + +extern void ppc_init_func(struct ppc_function *p, unsigned max_inst); +extern void ppc_release_func(struct ppc_function *p); + +extern int ppc_allocate_vec_register(struct ppc_function *p, int reg); +extern void ppc_release_vec_register(struct ppc_function *p, int reg); + + +/** + ** float vector arithmetic + **/ + +/** vector float add */ +extern void +ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB); + +/** vector float substract */ +extern void +ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float min */ +extern void +ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float max */ +extern void +ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float mult add */ +extern void +ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC); + +/** vector float compare greater than */ +extern void +ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float compare greater than or equal to */ +extern void +ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float compare equal */ +extern void +ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float 2^x */ +extern void +ppc_vexptefp(struct ppc_function *p, uint vD, uint vB); + +/** vector float log2(x) */ +extern void +ppc_vlogefp(struct ppc_function *p, uint vD, uint vB); + +/** vector float reciprocol */ +extern void +ppc_vrefp(struct ppc_function *p, uint vD, uint vB); + +/** vector float reciprocol sqrt estimate */ +extern void +ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB); + +/** vector float round to negative infinity */ +extern void +ppc_vrfim(struct ppc_function *p, uint vD, uint vB); + +/** vector float round to positive infinity */ +extern void +ppc_vrfip(struct ppc_function *p, uint vD, uint vB); + +/** vector float round to nearest int */ +extern void +ppc_vrfin(struct ppc_function *p, uint vD, uint vB); + +/** vector float round to int toward zero */ +extern void +ppc_vrfiz(struct ppc_function *p, uint vD, uint vB); + + + +/** + ** bitwise operations + **/ + + +/** vector and */ +extern void +ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector and complement */ +extern void +ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector or */ +extern void +ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector nor */ +extern void +ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector xor */ +extern void +ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB); + + +/** + ** Vector shuffle / select / splat / etc + **/ + +/** vector permute */ +extern void +ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC); + +/** vector select */ +extern void +ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC); + +/** vector splat byte */ +extern void +ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm); + +/** vector splat half word */ +extern void +ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm); + +/** vector splat word */ +extern void +ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm); + + +#endif /* RTASM_PPC_H */ diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c index a04cc6c4ff..491141f190 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c @@ -27,12 +27,16 @@ * Real-time assembly generation interface for Cell B.E. SPEs. * * \author Ian Romanick <idr@us.ibm.com> + * \author Brian Paul */ + +#include <stdio.h> #include "pipe/p_compiler.h" #include "util/u_memory.h" #include "rtasm_ppc_spe.h" + #ifdef GALLIUM_CELL /** * SPE instruction types @@ -143,8 +147,25 @@ union spe_inst_RI18 { /*@}*/ +static void +indent(const struct spe_function *p) +{ + int i; + for (i = 0; i < p->indent; i++) { + putchar(' '); + } +} + + +static const char * +rem_prefix(const char *longname) +{ + return longname + 4; +} + + static void emit_RR(struct spe_function *p, unsigned op, unsigned rT, - unsigned rA, unsigned rB) + unsigned rA, unsigned rB, const char *name) { union spe_inst_RR inst; inst.inst.op = op; @@ -153,11 +174,15 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rT = rT; p->store[p->num_inst++] = inst.bits; assert(p->num_inst <= p->max_inst); + if (p->print) { + indent(p); + printf("%s\t$%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB); + } } static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT, - unsigned rA, unsigned rB, unsigned rC) + unsigned rA, unsigned rB, unsigned rC, const char *name) { union spe_inst_RRR inst; inst.inst.op = op; @@ -167,11 +192,15 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rC = rC; p->store[p->num_inst++] = inst.bits; assert(p->num_inst <= p->max_inst); + if (p->print) { + indent(p); + printf("%s\t$%d, $%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB, rC); + } } static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT, - unsigned rA, int imm) + unsigned rA, int imm, const char *name) { union spe_inst_RI7 inst; inst.inst.op = op; @@ -180,12 +209,16 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rT = rT; p->store[p->num_inst++] = inst.bits; assert(p->num_inst <= p->max_inst); + if (p->print) { + indent(p); + printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm); + } } static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT, - unsigned rA, int imm) + unsigned rA, int imm, const char *name) { union spe_inst_RI8 inst; inst.inst.op = op; @@ -194,12 +227,16 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rT = rT; p->store[p->num_inst++] = inst.bits; assert(p->num_inst <= p->max_inst); + if (p->print) { + indent(p); + printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm); + } } static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT, - unsigned rA, int imm) + unsigned rA, int imm, const char *name) { union spe_inst_RI10 inst; inst.inst.op = op; @@ -208,11 +245,19 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rT = rT; p->store[p->num_inst++] = inst.bits; assert(p->num_inst <= p->max_inst); + if (p->print) { + indent(p); + if (strcmp(name, "spe_lqd") == 0 || + strcmp(name, "spe_stqd") == 0) + printf("%s\t$%d, 0x%x($%d)\n", rem_prefix(name), rT, imm, rA); + else + printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm); + } } static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT, - int imm) + int imm, const char *name) { union spe_inst_RI16 inst; inst.inst.op = op; @@ -220,11 +265,15 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rT = rT; p->store[p->num_inst++] = inst.bits; assert(p->num_inst <= p->max_inst); + if (p->print) { + indent(p); + printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm); + } } static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT, - int imm) + int imm, const char *name) { union spe_inst_RI18 inst; inst.inst.op = op; @@ -232,6 +281,10 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rT = rT; p->store[p->num_inst++] = inst.bits; assert(p->num_inst <= p->max_inst); + if (p->print) { + indent(p); + printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm); + } } @@ -240,61 +293,61 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT, #define EMIT_(_name, _op) \ void _name (struct spe_function *p, unsigned rT) \ { \ - emit_RR(p, _op, rT, 0, 0); \ + emit_RR(p, _op, rT, 0, 0, __FUNCTION__); \ } #define EMIT_R(_name, _op) \ void _name (struct spe_function *p, unsigned rT, unsigned rA) \ { \ - emit_RR(p, _op, rT, rA, 0); \ + emit_RR(p, _op, rT, rA, 0, __FUNCTION__); \ } #define EMIT_RR(_name, _op) \ void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \ { \ - emit_RR(p, _op, rT, rA, rB); \ + emit_RR(p, _op, rT, rA, rB, __FUNCTION__); \ } #define EMIT_RRR(_name, _op) \ void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \ { \ - emit_RRR(p, _op, rT, rA, rB, rC); \ + emit_RRR(p, _op, rT, rA, rB, rC, __FUNCTION__); \ } #define EMIT_RI7(_name, _op) \ void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \ { \ - emit_RI7(p, _op, rT, rA, imm); \ + emit_RI7(p, _op, rT, rA, imm, __FUNCTION__); \ } #define EMIT_RI8(_name, _op, bias) \ void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \ { \ - emit_RI8(p, _op, rT, rA, bias - imm); \ + emit_RI8(p, _op, rT, rA, bias - imm, __FUNCTION__); \ } #define EMIT_RI10(_name, _op) \ void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \ { \ - emit_RI10(p, _op, rT, rA, imm); \ + emit_RI10(p, _op, rT, rA, imm, __FUNCTION__); \ } #define EMIT_RI16(_name, _op) \ void _name (struct spe_function *p, unsigned rT, int imm) \ { \ - emit_RI16(p, _op, rT, imm); \ + emit_RI16(p, _op, rT, imm, __FUNCTION__); \ } #define EMIT_RI18(_name, _op) \ void _name (struct spe_function *p, unsigned rT, int imm) \ { \ - emit_RI18(p, _op, rT, imm); \ + emit_RI18(p, _op, rT, imm, __FUNCTION__); \ } #define EMIT_I16(_name, _op) \ void _name (struct spe_function *p, int imm) \ { \ - emit_RI16(p, _op, 0, imm); \ + emit_RI16(p, _op, 0, imm, __FUNCTION__); \ } #include "rtasm_ppc_spe.h" @@ -314,6 +367,9 @@ void spe_init_func(struct spe_function *p, unsigned code_size) */ p->regs[0] = ~7; p->regs[1] = (1U << (80 - 64)) - 1; + + p->print = false; + p->indent = 0; } @@ -327,8 +383,15 @@ void spe_release_func(struct spe_function *p) } +/** Return current code size in bytes. */ +unsigned spe_code_size(const struct spe_function *p) +{ + return p->num_inst * SPE_INST_SIZE; +} + + /** - * Alloate a SPE register. + * Allocate a SPE register. * \return register index or -1 if none left. */ int spe_allocate_available_register(struct spe_function *p) @@ -382,6 +445,32 @@ void spe_release_register(struct spe_function *p, int reg) } +void +spe_print_code(struct spe_function *p, boolean enable) +{ + p->print = enable; +} + + +void +spe_indent(struct spe_function *p, int spaces) +{ + p->indent += spaces; +} + + +extern void +spe_comment(struct spe_function *p, int rel_indent, const char *s) +{ + if (p->print) { + p->indent += rel_indent; + indent(p); + p->indent -= rel_indent; + printf("# %s\n", s); + } +} + + /** * For branch instructions: * \param d if 1, disable interupts if branch is taken @@ -392,51 +481,51 @@ void spe_release_register(struct spe_function *p, int reg) /** Branch Indirect to address in rA */ void spe_bi(struct spe_function *p, unsigned rA, int d, int e) { - emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4)); + emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4), __FUNCTION__); } /** Interupt Return */ void spe_iret(struct spe_function *p, unsigned rA, int d, int e) { - emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4)); + emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4), __FUNCTION__); } /** Branch indirect and set link on external data */ void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { - emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4)); + emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4), __FUNCTION__); } /** Branch indirect and set link. Save PC in rT, jump to rA. */ void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { - emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4)); + emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4), __FUNCTION__); } /** Branch indirect if zero word. If rT.word[0]==0, jump to rA. */ void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { - emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4)); + emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4), __FUNCTION__); } /** Branch indirect if non-zero word. If rT.word[0]!=0, jump to rA. */ void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { - emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4)); + emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4), __FUNCTION__); } /** Branch indirect if zero halfword. If rT.halfword[1]==0, jump to rA. */ void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { - emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4)); + emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4), __FUNCTION__); } /** Branch indirect if non-zero halfword. If rT.halfword[1]!=0, jump to rA. */ void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { - emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4)); + emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4), __FUNCTION__); } @@ -505,7 +594,28 @@ spe_load_int(struct spe_function *p, unsigned rT, int i) } else { spe_ilhu(p, rT, i >> 16); - spe_iohl(p, rT, i & 0xffff); + if (i & 0xffff) + spe_iohl(p, rT, i & 0xffff); + } +} + +void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui) +{ + /* If the whole value is in the lower 18 bits, use ila, which + * doesn't sign-extend. Otherwise, if the two halfwords of + * the constant are identical, use ilh. Otherwise, we have + * to use ilhu followed by iohl. + */ + if ((ui & 0xfffc0000) == ui) { + spe_ila(p, rT, ui); + } + else if ((ui >> 16) == (ui & 0xffff)) { + spe_ilh(p, rT, ui & 0xffff); + } + else { + spe_ilhu(p, rT, ui >> 16); + if (ui & 0xffff) + spe_iohl(p, rT, ui & 0xffff); } } @@ -513,22 +623,29 @@ spe_load_int(struct spe_function *p, unsigned rT, int i) void spe_splat(struct spe_function *p, unsigned rT, unsigned rA) { - spe_ila(p, rT, 66051); + /* Duplicate bytes 0, 1, 2, and 3 across the whole register */ + spe_ila(p, rT, 0x00010203); spe_shufb(p, rT, rA, rA, rT); } void -spe_complement(struct spe_function *p, unsigned rT) +spe_complement(struct spe_function *p, unsigned rT, unsigned rA) { - spe_nor(p, rT, rT, rT); + spe_nor(p, rT, rA, rA); } void spe_move(struct spe_function *p, unsigned rT, unsigned rA) { - spe_ori(p, rT, rA, 0); + /* Use different instructions depending on the instruction address + * to take advantage of the dual pipelines. + */ + if (p->num_inst & 1) + spe_shlqbyi(p, rT, rA, 0); /* odd pipe */ + else + spe_ori(p, rT, rA, 0); /* even pipe */ } @@ -539,4 +656,70 @@ spe_zero(struct spe_function *p, unsigned rT) } +void +spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word) +{ + assert(word >= 0); + assert(word <= 3); + + if (word == 0) { + int tmp1 = rT; + spe_ila(p, tmp1, 66051); + spe_shufb(p, rT, rA, rA, tmp1); + } + else { + /* XXX review this, we may not need the rotqbyi instruction */ + int tmp1 = rT; + int tmp2 = spe_allocate_available_register(p); + + spe_ila(p, tmp1, 66051); + spe_rotqbyi(p, tmp2, rA, 4 * word); + spe_shufb(p, rT, tmp2, tmp2, tmp1); + + spe_release_register(p, tmp2); + } +} + +/** + * For each 32-bit float element of rA and rB, choose the smaller of the + * two, compositing them into the rT register. + * + * The Float Compare Greater Than (fcgt) instruction will put 1s into + * compare_reg where rA > rB, and 0s where rA <= rB. + * + * Then the Select Bits (selb) instruction will take bits from rA where + * compare_reg is 0, and from rB where compare_reg is 1; i.e., from rA + * where rA <= rB and from rB where rB > rA, which is exactly the + * "min" operation. + * + * The compare_reg could in many cases be the same as rT, unless + * rT == rA || rt == rB. But since this is common in constructions + * like "x = min(x, a)", we always allocate a new register to be safe. + */ +void +spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) +{ + unsigned int compare_reg = spe_allocate_available_register(p); + spe_fcgt(p, compare_reg, rA, rB); + spe_selb(p, rT, rA, rB, compare_reg); + spe_release_register(p, compare_reg); +} + +/** + * For each 32-bit float element of rA and rB, choose the greater of the + * two, compositing them into the rT register. + * + * The logic is similar to that of spe_float_min() above; the only + * difference is that the registers on spe_selb() have been reversed, + * so that the larger of the two is selected instead of the smaller. + */ +void +spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) +{ + unsigned int compare_reg = spe_allocate_available_register(p); + spe_fcgt(p, compare_reg, rA, rB); + spe_selb(p, rT, rB, rA, compare_reg); + spe_release_register(p, compare_reg); +} + #endif /* GALLIUM_CELL */ diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h index d95e5aace3..4165a971a2 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h @@ -28,6 +28,7 @@ * For details, see /opt/cell/sdk/docs/arch/SPU_ISA_v1.2_27Jan2007_pub.pdf * * \author Ian Romanick <idr@us.ibm.com> + * \author Brian Paul */ #ifndef RTASM_PPC_SPE_H @@ -63,15 +64,25 @@ struct spe_function * spe_release_register */ uint64_t regs[SPE_NUM_REGS / 64]; + + boolean print; /**< print/dump instructions as they're emitted? */ + int indent; /**< number of spaces to indent */ }; + extern void spe_init_func(struct spe_function *p, unsigned code_size); extern void spe_release_func(struct spe_function *p); +extern unsigned spe_code_size(const struct spe_function *p); extern int spe_allocate_available_register(struct spe_function *p); extern int spe_allocate_register(struct spe_function *p, int reg); extern void spe_release_register(struct spe_function *p, int reg); +extern void spe_print_code(struct spe_function *p, boolean enable); +extern void spe_indent(struct spe_function *p, int spaces); +extern void spe_comment(struct spe_function *p, int rel_indent, const char *s); + + #endif /* RTASM_PPC_SPE_H */ #ifndef EMIT_ @@ -292,13 +303,17 @@ spe_load_float(struct spe_function *p, unsigned rT, float x); extern void spe_load_int(struct spe_function *p, unsigned rT, int i); +/** Load/splat immediate unsigned int into rT. */ +extern void +spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui); + /** Replicate word 0 of rA across rT. */ extern void spe_splat(struct spe_function *p, unsigned rT, unsigned rA); -/** Complement/invert all bits in rT. */ +/** rT = complement_all_bits(rA). */ extern void -spe_complement(struct spe_function *p, unsigned rT); +spe_complement(struct spe_function *p, unsigned rT, unsigned rA); /** rT = rA. */ extern void @@ -308,6 +323,18 @@ spe_move(struct spe_function *p, unsigned rT, unsigned rA); extern void spe_zero(struct spe_function *p, unsigned rT); +/** rT = splat(rA, word) */ +extern void +spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word); + +/** rT = float min(rA, rB) */ +extern void +spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB); + +/** rT = float max(rA, rB) */ +extern void +spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB); + /* Floating-point instructions */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c index 74614d3688..38fcaf8829 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_build.c +++ b/src/gallium/auxiliary/tgsi/tgsi_build.c @@ -793,10 +793,14 @@ tgsi_default_instruction_ext_nv( void ) return instruction_ext_nv; } -union token_u32 + +/** test for inequality of 32-bit values pointed to by a and b */ +static INLINE boolean +compare32(const void *a, const void *b) { - unsigned u32; -}; + return *((uint32_t *) a) != *((uint32_t *) b); +} + unsigned tgsi_compare_instruction_ext_nv( @@ -805,7 +809,7 @@ tgsi_compare_instruction_ext_nv( { a.Padding = b.Padding = 0; a.Extended = b.Extended = 0; - return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32; + return compare32(&a, &b); } struct tgsi_instruction_ext_nv @@ -864,7 +868,7 @@ tgsi_compare_instruction_ext_label( { a.Padding = b.Padding = 0; a.Extended = b.Extended = 0; - return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32; + return compare32(&a, &b); } struct tgsi_instruction_ext_label @@ -905,7 +909,7 @@ tgsi_compare_instruction_ext_texture( { a.Padding = b.Padding = 0; a.Extended = b.Extended = 0; - return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32; + return compare32(&a, &b); } struct tgsi_instruction_ext_texture @@ -1027,7 +1031,7 @@ tgsi_compare_src_register_ext_swz( { a.Padding = b.Padding = 0; a.Extended = b.Extended = 0; - return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32; + return compare32(&a, &b); } struct tgsi_src_register_ext_swz @@ -1095,7 +1099,7 @@ tgsi_compare_src_register_ext_mod( { a.Padding = b.Padding = 0; a.Extended = b.Extended = 0; - return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32; + return compare32(&a, &b); } struct tgsi_src_register_ext_mod @@ -1241,7 +1245,7 @@ tgsi_compare_dst_register_ext_concode( { a.Padding = b.Padding = 0; a.Extended = b.Extended = 0; - return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32; + return compare32(&a, &b); } struct tgsi_dst_register_ext_concode @@ -1299,7 +1303,7 @@ tgsi_compare_dst_register_ext_modulate( { a.Padding = b.Padding = 0; a.Extended = b.Extended = 0; - return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32; + return compare32(&a, &b); } struct tgsi_dst_register_ext_modulate diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c index df002939c6..1a5294eabc 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c @@ -1674,6 +1674,7 @@ exec_declaration( break; default: + eval = NULL; assert( 0 ); } diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c index 3757486ba9..2cd56e413a 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_parse.c +++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c @@ -88,16 +88,33 @@ tgsi_parse_end_of_tokens( 1 + ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize; } + +/** + * This function is used to avoid and work-around type punning/aliasing + * warnings. The warnings seem harmless on x86 but on PPC they cause + * real failures. + */ +static INLINE void +copy_token(void *dst, const void *src) +{ + memcpy(dst, src, 4); +} + + +/** + * Get next 4-byte token, return it at address specified by 'token' + */ static void next_token( struct tgsi_parse_context *ctx, void *token ) { assert( !tgsi_parse_end_of_tokens( ctx ) ); - - *(struct tgsi_token *) token = ctx->Tokens[ctx->Position++]; + copy_token(token, &ctx->Tokens[ctx->Position]); + ctx->Position++; } + void tgsi_parse_token( struct tgsi_parse_context *ctx ) @@ -116,7 +133,7 @@ tgsi_parse_token( struct tgsi_full_declaration *decl = &ctx->FullToken.FullDeclaration; *decl = tgsi_default_full_declaration(); - decl->Declaration = *(struct tgsi_declaration *) &token; + copy_token(&decl->Declaration, &token); next_token( ctx, &decl->DeclarationRange ); @@ -132,8 +149,7 @@ tgsi_parse_token( struct tgsi_full_immediate *imm = &ctx->FullToken.FullImmediate; *imm = tgsi_default_full_immediate(); - imm->Immediate = *(struct tgsi_immediate *) &token; - + copy_token(&imm->Immediate, &token); assert( !imm->Immediate.Extended ); switch (imm->Immediate.DataType) { @@ -158,8 +174,7 @@ tgsi_parse_token( unsigned extended; *inst = tgsi_default_full_instruction(); - inst->Instruction = *(struct tgsi_instruction *) &token; - + copy_token(&inst->Instruction, &token); extended = inst->Instruction.Extended; while( extended ) { @@ -169,18 +184,15 @@ tgsi_parse_token( switch( token.Type ) { case TGSI_INSTRUCTION_EXT_TYPE_NV: - inst->InstructionExtNv = - *(struct tgsi_instruction_ext_nv *) &token; + copy_token(&inst->InstructionExtNv, &token); break; case TGSI_INSTRUCTION_EXT_TYPE_LABEL: - inst->InstructionExtLabel = - *(struct tgsi_instruction_ext_label *) &token; + copy_token(&inst->InstructionExtLabel, &token); break; case TGSI_INSTRUCTION_EXT_TYPE_TEXTURE: - inst->InstructionExtTexture = - *(struct tgsi_instruction_ext_texture *) &token; + copy_token(&inst->InstructionExtTexture, &token); break; default: @@ -212,13 +224,13 @@ tgsi_parse_token( switch( token.Type ) { case TGSI_DST_REGISTER_EXT_TYPE_CONDCODE: - inst->FullDstRegisters[i].DstRegisterExtConcode = - *(struct tgsi_dst_register_ext_concode *) &token; + copy_token(&inst->FullDstRegisters[i].DstRegisterExtConcode, + &token); break; case TGSI_DST_REGISTER_EXT_TYPE_MODULATE: - inst->FullDstRegisters[i].DstRegisterExtModulate = - *(struct tgsi_dst_register_ext_modulate *) &token; + copy_token(&inst->FullDstRegisters[i].DstRegisterExtModulate, + &token); break; default: @@ -245,13 +257,13 @@ tgsi_parse_token( switch( token.Type ) { case TGSI_SRC_REGISTER_EXT_TYPE_SWZ: - inst->FullSrcRegisters[i].SrcRegisterExtSwz = - *(struct tgsi_src_register_ext_swz *) &token; + copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtSwz, + &token); break; case TGSI_SRC_REGISTER_EXT_TYPE_MOD: - inst->FullSrcRegisters[i].SrcRegisterExtMod = - *(struct tgsi_src_register_ext_mod *) &token; + copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtMod, + &token); break; default: diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c index 9adf72944e..d28201ac8d 100644 --- a/src/gallium/auxiliary/util/u_blit.c +++ b/src/gallium/auxiliary/util/u_blit.c @@ -104,6 +104,7 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso) ctx->rasterizer.cull_mode = PIPE_WINDING_NONE; ctx->rasterizer.bypass_clipping = 1; /*ctx->rasterizer.bypass_vs = 1;*/ + ctx->rasterizer.gl_rasterization_rules = 1; /* samplers */ memset(&ctx->sampler, 0, sizeof(ctx->sampler)); diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c index b19a649bbc..9d305ad763 100644 --- a/src/gallium/auxiliary/util/u_gen_mipmap.c +++ b/src/gallium/auxiliary/util/u_gen_mipmap.c @@ -725,6 +725,7 @@ util_create_gen_mipmap(struct pipe_context *pipe, ctx->rasterizer.cull_mode = PIPE_WINDING_NONE; ctx->rasterizer.bypass_clipping = 1; /*ctx->rasterizer.bypass_vs = 1;*/ + ctx->rasterizer.gl_rasterization_rules = 1; /* sampler state */ memset(&ctx->sampler, 0, sizeof(ctx->sampler)); diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index cb0631baf5..f0ff96eb47 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -104,11 +104,11 @@ #define CELL_BUFFER_STATUS_FREE 10 #define CELL_BUFFER_STATUS_USED 20 - -#define CELL_DEBUG_CHECKER (1 << 0) -#define CELL_DEBUG_SYNC (1 << 1) - - +#define CELL_DEBUG_CHECKER (1 << 0) +#define CELL_DEBUG_ASM (1 << 1) +#define CELL_DEBUG_SYNC (1 << 2) +#define CELL_DEBUG_FRAGMENT_OPS (1 << 3) +#define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4) /** Max instructions for doing per-fragment operations */ #define SPU_MAX_FRAGMENT_OPS_INSTS 64 diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c index 71f1a3049d..62e213ea35 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.c +++ b/src/gallium/drivers/cell/ppu/cell_context.c @@ -85,13 +85,14 @@ cell_draw_create(struct cell_context *cell) } -#ifdef DEBUG static const struct debug_named_value cell_debug_flags[] = { {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */ + {"asm", CELL_DEBUG_ASM}, /**< dump SPU asm code */ {"sync", CELL_DEBUG_SYNC}, /**< SPUs do synchronous DMA */ + {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/ + {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/ {NULL, 0} }; -#endif struct pipe_context * diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c index 6ffe94eb14..1bc803d590 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -51,9 +51,13 @@ #include "cell_gen_fp.h" -/** Set to 1 to enable debug/disassembly printfs */ -#define DISASSEM 01 +#define MAX_TEMPS 16 +#define MAX_IMMED 8 +#define CHAN_X 0 +#define CHAN_Y 1 +#define CHAN_Z 2 +#define CHAN_W 3 /** * Context needed during code generation. @@ -63,13 +67,21 @@ struct codegen int inputs_reg; /**< 1st function parameter */ int outputs_reg; /**< 2nd function parameter */ int constants_reg; /**< 3rd function parameter */ - int temp_regs[8][4]; /**< maps TGSI temps to SPE registers */ + int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */ + int imm_regs[MAX_IMMED][4]; /**< maps TGSI immediates to SPE registers */ + + int num_imm; /**< number of immediates */ int one_reg; /**< register containing {1.0, 1.0, 1.0, 1.0} */ /** Per-instruction temps / intermediate temps */ int num_itemps; - int itemps[3]; + int itemps[10]; + + /** Current IF/ELSE/ENDIF nesting level */ + int if_nesting; + /** Index of execution mask register */ + int exec_mask_reg; struct spe_function *f; boolean error; @@ -112,19 +124,47 @@ get_const_one_reg(struct codegen *gen) { if (gen->one_reg <= 0) { gen->one_reg = spe_allocate_available_register(gen->f); - } - /* one = {1.0, 1.0, 1.0, 1.0} */ - spe_load_float(gen->f, gen->one_reg, 1.0f); -#if DISASSEM - printf("il\tr%d, 1.0f\n", gen->one_reg); -#endif + spe_indent(gen->f, 4); + spe_comment(gen->f, -4, "INIT CONSTANT 1.0:"); + + /* one = {1.0, 1.0, 1.0, 1.0} */ + spe_load_float(gen->f, gen->one_reg, 1.0f); + + spe_indent(gen->f, -4); + } return gen->one_reg; } /** + * Return index of the pixel execution mask. + * The register is allocated an initialized upon the first call. + * + * The pixel execution mask controls which pixels in a quad are + * modified, according to surrounding conditionals, loops, etc. + */ +static int +get_exec_mask_reg(struct codegen *gen) +{ + if (gen->exec_mask_reg <= 0) { + gen->exec_mask_reg = spe_allocate_available_register(gen->f); + + spe_indent(gen->f, 4); + spe_comment(gen->f, -4, "INIT EXEC MASK = ~0:"); + + /* exec_mask = {~0, ~0, ~0, ~0} */ + spe_load_int(gen->f, gen->exec_mask_reg, ~0); + + spe_indent(gen->f, -4); + } + + return gen->exec_mask_reg; +} + + +/** * Return the index of the SPU temporary containing the named TGSI * source register. If the TGSI register is a TGSI_FILE_TEMPORARY we * just return the corresponding SPE register. If the TGIS register @@ -136,12 +176,15 @@ get_src_reg(struct codegen *gen, int channel, const struct tgsi_full_src_register *src) { - int reg; + int reg = -1; + int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel); + boolean reg_is_itemp = FALSE; + uint sign_op; - /* XXX need to examine src swizzle info here. - * That will involve changing the channel var... - */ + assert(swizzle >= 0); + assert(swizzle <= 3); + channel = swizzle; switch (src->SrcRegister.File) { case TGSI_FILE_TEMPORARY: @@ -152,21 +195,57 @@ get_src_reg(struct codegen *gen, /* offset is measured in quadwords, not bytes */ int offset = src->SrcRegister.Index * 4 + channel; reg = get_itemp(gen); + reg_is_itemp = TRUE; /* Load: reg = memory[(machine_reg) + offset] */ spe_lqd(gen->f, reg, gen->inputs_reg, offset); -#if DISASSEM - printf("lqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset); -#endif } break; case TGSI_FILE_IMMEDIATE: - /* xxx fall-through for now / fix */ + reg = gen->imm_regs[src->SrcRegister.Index][channel]; + break; case TGSI_FILE_CONSTANT: /* xxx fall-through for now / fix */ default: assert(0); } + /* + * Handle absolute value, negate or set-negative of src register. + */ + sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel); + if (sign_op != TGSI_UTIL_SIGN_KEEP) { + /* + * All sign ops are done by manipulating bit 31, the IEEE float sign bit. + */ + const int bit31mask_reg = get_itemp(gen); + int result_reg; + + if (reg_is_itemp) { + /* re-use 'reg' for the result */ + result_reg = reg; + } + else { + /* alloc a new reg for the result */ + result_reg = get_itemp(gen); + } + + /* mask with bit 31 set, the rest cleared */ + spe_load_int(gen->f, bit31mask_reg, (1 << 31)); + + if (sign_op == TGSI_UTIL_SIGN_CLEAR) { + spe_andc(gen->f, result_reg, reg, bit31mask_reg); + } + else if (sign_op == TGSI_UTIL_SIGN_SET) { + spe_and(gen->f, result_reg, reg, bit31mask_reg); + } + else { + assert(sign_op == TGSI_UTIL_SIGN_TOGGLE); + spe_xor(gen->f, result_reg, reg, bit31mask_reg); + } + + reg = result_reg; + } + return reg; } @@ -183,11 +262,14 @@ get_dst_reg(struct codegen *gen, int channel, const struct tgsi_full_dst_register *dest) { - int reg; + int reg = -1; switch (dest->DstRegister.File) { case TGSI_FILE_TEMPORARY: - reg = gen->temp_regs[dest->DstRegister.Index][channel]; + if (gen->if_nesting > 0) + reg = get_itemp(gen); + else + reg = gen->temp_regs[dest->DstRegister.Index][channel]; break; case TGSI_FILE_OUTPUT: reg = get_itemp(gen); @@ -213,17 +295,40 @@ store_dest_reg(struct codegen *gen, { switch (dest->DstRegister.File) { case TGSI_FILE_TEMPORARY: - /* no-op */ + if (gen->if_nesting > 0) { + int d_reg = gen->temp_regs[dest->DstRegister.Index][channel]; + int exec_reg = get_exec_mask_reg(gen); + /* Mix d with new value according to exec mask: + * d[i] = mask_reg[i] ? value_reg : d_reg + */ + spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg); + } + else { + /* we're not inside a condition or loop: do nothing special */ + } break; case TGSI_FILE_OUTPUT: { /* offset is measured in quadwords, not bytes */ int offset = dest->DstRegister.Index * 4 + channel; - /* Store: memory[(machine_reg) + offset] = reg */ - spe_stqd(gen->f, value_reg, gen->outputs_reg, offset); -#if DISASSEM - printf("stqd\tr%d, r%d + %d\n", value_reg, gen->outputs_reg, offset); -#endif + if (gen->if_nesting > 0) { + int exec_reg = get_exec_mask_reg(gen); + int curval_reg = get_itemp(gen); + /* First read the current value from memory: + * Load: curval = memory[(machine_reg) + offset] + */ + spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset); + /* Mix curval with newvalue according to exec mask: + * d[i] = mask_reg[i] ? value_reg : d_reg + */ + spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg); + /* Store: memory[(machine_reg) + offset] = curval */ + spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset); + } + else { + /* Store: memory[(machine_reg) + offset] = reg */ + spe_stqd(gen->f, value_reg, gen->outputs_reg, offset); + } } break; default: @@ -236,15 +341,13 @@ static boolean emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch; + spe_comment(gen->f, -4, "MOV:"); for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); /* XXX we don't always need to actually emit a mov instruction here */ spe_move(gen->f, dst_reg, src_reg); -#if DISASSEM - printf("mov\tr%d, r%d\n", dst_reg, src_reg); -#endif store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]); free_itemps(gen); } @@ -253,6 +356,7 @@ emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst) } + /** * Emit addition instructions. Recall that a single TGSI_OPCODE_ADD * becomes (up to) four SPU "fa" instructions because we're doing SOA @@ -262,6 +366,7 @@ static boolean emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch; + spe_comment(gen->f, -4, "ADD:"); /* Loop over Red/Green/Blue/Alpha channels */ for (ch = 0; ch < 4; ch++) { /* If the dest R, G, B or A writemask is enabled... */ @@ -273,9 +378,6 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst) /* Emit actual SPE instruction: d = s1 + s2 */ spe_fa(gen->f, d_reg, s1_reg, s2_reg); -#if DISASSEM - printf("fa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); -#endif /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); @@ -286,6 +388,82 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst) return true; } +/** + * Emit subtract. See emit_ADD for comments. + */ +static boolean +emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "SUB:"); + /* Loop over Red/Green/Blue/Alpha channels */ + for (ch = 0; ch < 4; ch++) { + /* If the dest R, G, B or A writemask is enabled... */ + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + /* get indexes of the two src, one dest SPE registers */ + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* Emit actual SPE instruction: d = s1 - s2 */ + spe_fs(gen->f, d_reg, s1_reg, s2_reg); + + /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + /* Free any intermediate temps we allocated */ + free_itemps(gen); + } + } + return true; +} + +/** + * Emit multiply add. See emit_ADD for comments. + */ +static boolean +emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "MAD:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* d = s1 * s2 + s3 */ + spe_fma(gen->f, d_reg, s1_reg, s2_reg, s3_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + + +/** + * Emit linear interpolate. See emit_ADD for comments. + */ +static boolean +emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "LERP:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* d = s3 + s1(s2 - s3) */ + spe_fs(gen->f, d_reg, s2_reg, s3_reg); + spe_fma(gen->f, d_reg, d_reg, s1_reg, s3_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} /** * Emit multiply. See emit_ADD for comments. @@ -294,6 +472,7 @@ static boolean emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch; + spe_comment(gen->f, -4, "MUL:"); for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); @@ -301,9 +480,6 @@ emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst) int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); /* d = s1 * s2 */ spe_fm(gen->f, d_reg, s1_reg, s2_reg); -#if DISASSEM - printf("fm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); -#endif store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); free_itemps(gen); } @@ -311,6 +487,151 @@ emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst) return true; } +/** + * Emit reciprocal. See emit_ADD for comments. + */ +static boolean +emit_RCP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "RCP:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* d = 1/s1 */ + spe_frest(gen->f, d_reg, s1_reg); + spe_fi(gen->f, d_reg, s1_reg, d_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + +/** + * Emit reciprocal sqrt. See emit_ADD for comments. + */ +static boolean +emit_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "RSQ:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* d = 1/s1 */ + spe_frsqest(gen->f, d_reg, s1_reg); + spe_fi(gen->f, d_reg, s1_reg, d_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + +/** + * Emit absolute value. See emit_ADD for comments. + */ +static boolean +emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "ABS:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + const int bit31mask_reg = get_itemp(gen); + + /* mask with bit 31 set, the rest cleared */ + spe_load_int(gen->f, bit31mask_reg, (1 << 31)); + + /* d = sign bit cleared in s1 */ + spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + +/** + * Emit 3 component dot product. See emit_ADD for comments. + */ +static boolean +emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "DP3:"); + + int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, CHAN_X, &inst->FullDstRegisters[0]); + /* d = x * x */ + spe_fm(gen->f, d_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + /* d = y * y + d */ + spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg); + + s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + /* d = z * z + d */ + spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + } + } + + free_itemps(gen); + return true; +} + +/** + * Emit 4 component dot product. See emit_ADD for comments. + */ +static boolean +emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "DP3:"); + + int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, CHAN_X, &inst->FullDstRegisters[0]); + /* d = x * x */ + spe_fm(gen->f, d_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + /* d = y * y + d */ + spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg); + + s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + /* d = z * z + d */ + spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg); + + s1_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]); + /* d = w * w + d */ + spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + } + } + + free_itemps(gen); + return true; +} /** * Emit set-if-greater-than. @@ -323,6 +644,8 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch; + spe_comment(gen->f, -4, "SGT:"); + for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); @@ -331,16 +654,10 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst) /* d = (s1 > s2) */ spe_fcgt(gen->f, d_reg, s1_reg, s2_reg); -#if DISASSEM - printf("fcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); -#endif /* convert d from 0x0/0xffffffff to 0.0/1.0 */ /* d = d & one_reg */ spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); -#if DISASSEM - printf("and\tr%d, r%d, r%d\n", d_reg, d_reg, get_const_one_reg(gen)); -#endif store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); free_itemps(gen); @@ -350,6 +667,421 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst) return true; } +/** + * Emit set-if_less-then. See emit_SGT for comments. + */ +static boolean +emit_SLT(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "SLT:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 < s2) */ + spe_fcgt(gen->f, d_reg, s2_reg, s1_reg); + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = d & one_reg */ + spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit set-if_greater-then-or-equal. See emit_SGT for comments. + */ +static boolean +emit_SGE(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "SGE:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 >= s2) */ + spe_fcgt(gen->f, d_reg, s2_reg, s1_reg); + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = ~d & one_reg */ + spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit set-if_less-then-or-equal. See emit_SGT for comments. + */ +static boolean +emit_SLE(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "SLE:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 <= s2) */ + spe_fcgt(gen->f, d_reg, s1_reg, s2_reg); + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = ~d & one_reg */ + spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit set-if_equal. See emit_SGT for comments. + */ +static boolean +emit_SEQ(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "SEQ:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 == s2) */ + spe_fceq(gen->f, d_reg, s1_reg, s2_reg); + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = d & one_reg */ + spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit set-if_not_equal. See emit_SGT for comments. + */ +static boolean +emit_SNE(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "SNE:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 != s2) */ + spe_fceq(gen->f, d_reg, s1_reg, s2_reg); + spe_nor(gen->f, d_reg, d_reg, d_reg); + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = d & one_reg */ + spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit compare. See emit_SGT for comments. + */ +static boolean +emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "CMP:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + int zero_reg = get_itemp(gen); + + spe_xor(gen->f, zero_reg, zero_reg, zero_reg); + + /* d = (s1 < 0) ? s2 : s3 */ + spe_fcgt(gen->f, d_reg, zero_reg, s1_reg); + spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit floor. + * If negative int subtract one + * Convert float to signed int + * Convert signed int to float + */ +static boolean +emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "FLR:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + int tmp_reg = get_itemp(gen); + + /* If negative, subtract 1.0 */ + spe_xor(gen->f, tmp_reg, tmp_reg, tmp_reg); + spe_fcgt(gen->f, d_reg, tmp_reg, s1_reg); + spe_selb(gen->f, tmp_reg, tmp_reg, get_const_one_reg(gen), d_reg); + spe_fs(gen->f, d_reg, s1_reg, tmp_reg); + + /* Convert float to int */ + spe_cflts(gen->f, d_reg, d_reg, 0); + + /* Convert int to float */ + spe_csflt(gen->f, d_reg, d_reg, 0); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit frac. + * Input - FLR(Input) + */ +static boolean +emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "FLR:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + int tmp_reg = get_itemp(gen); + + /* If negative, subtract 1.0 */ + spe_xor(gen->f, tmp_reg, tmp_reg, tmp_reg); + spe_fcgt(gen->f, d_reg, tmp_reg, s1_reg); + spe_selb(gen->f, tmp_reg, tmp_reg, get_const_one_reg(gen), d_reg); + spe_fs(gen->f, d_reg, s1_reg, tmp_reg); + + /* Convert float to int */ + spe_cflts(gen->f, d_reg, d_reg, 0); + + /* Convert int to float */ + spe_csflt(gen->f, d_reg, d_reg, 0); + + /* d = s1 - FLR(s1) */ + spe_fs(gen->f, d_reg, s1_reg, d_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + + +/** + * Emit max. See emit_SGT for comments. + */ +static boolean +emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "MAX:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 > s2) ? s1 : s2 */ + spe_fcgt(gen->f, d_reg, s1_reg, s2_reg); + spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit max. See emit_SGT for comments. + */ +static boolean +emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "MIN:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s2 > s1) ? s1 : s2 */ + spe_fcgt(gen->f, d_reg, s2_reg, s1_reg); + spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +static boolean +emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int channel = 0; + const int exec_reg = get_exec_mask_reg(gen); + + spe_comment(gen->f, -4, "IF:"); + + /* update execution mask with the predicate register */ + int tmp_reg = get_itemp(gen); + int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]); + + /* tmp = (s1_reg == 0) */ + spe_ceqi(gen->f, tmp_reg, s1_reg, 0); + /* tmp = !tmp */ + spe_complement(gen->f, tmp_reg, tmp_reg); + /* exec_mask = exec_mask & tmp */ + spe_and(gen->f, exec_reg, exec_reg, tmp_reg); + + gen->if_nesting++; + + free_itemps(gen); + + return true; +} + + +static boolean +emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int exec_reg = get_exec_mask_reg(gen); + + spe_comment(gen->f, -4, "ELSE:"); + + /* exec_mask = !exec_mask */ + spe_complement(gen->f, exec_reg, exec_reg); + + return true; +} + + +static boolean +emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int exec_reg = get_exec_mask_reg(gen); + + spe_comment(gen->f, -4, "ENDIF:"); + + /* XXX todo: pop execution mask */ + + spe_load_int(gen->f, exec_reg, ~0x0); + + gen->if_nesting--; + return true; +} + + +static boolean +emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst, + boolean ddx) +{ + int ch; + + spe_comment(gen->f, -4, ddx ? "DDX:" : "DDY:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + int t1_reg = get_itemp(gen); + int t2_reg = get_itemp(gen); + + spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */ + if (ddx) { + spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */ + } + else { + spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */ + } + spe_fs(gen->f, d_reg, t2_reg, t1_reg); + + free_itemps(gen); + } + } + + return true; +} + + + /** * Emit END instruction. @@ -361,11 +1093,9 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst) static boolean emit_END(struct codegen *gen) { + spe_comment(gen->f, -4, "END:"); /* return from function call */ spe_bi(gen->f, SPE_REG_RA, 0, 0); -#if DISASSEM - printf("bi\trRA\n"); -#endif return true; } @@ -384,14 +1114,64 @@ emit_instruction(struct codegen *gen, return emit_MUL(gen, inst); case TGSI_OPCODE_ADD: return emit_ADD(gen, inst); + case TGSI_OPCODE_SUB: + return emit_SUB(gen, inst); + case TGSI_OPCODE_MAD: + return emit_MAD(gen, inst); + case TGSI_OPCODE_LERP: + return emit_LERP(gen, inst); + case TGSI_OPCODE_DP3: + return emit_DP3(gen, inst); + case TGSI_OPCODE_DP4: + return emit_DP4(gen, inst); + case TGSI_OPCODE_RCP: + return emit_RCP(gen, inst); + case TGSI_OPCODE_RSQ: + return emit_RSQ(gen, inst); + case TGSI_OPCODE_ABS: + return emit_ABS(gen, inst); case TGSI_OPCODE_SGT: return emit_SGT(gen, inst); + case TGSI_OPCODE_SLT: + return emit_SLT(gen, inst); + case TGSI_OPCODE_SGE: + return emit_SGE(gen, inst); + case TGSI_OPCODE_SLE: + return emit_SLE(gen, inst); + case TGSI_OPCODE_SEQ: + return emit_SEQ(gen, inst); + case TGSI_OPCODE_SNE: + return emit_SNE(gen, inst); + case TGSI_OPCODE_CMP: + return emit_CMP(gen, inst); + case TGSI_OPCODE_MAX: + return emit_MAX(gen, inst); + case TGSI_OPCODE_MIN: + return emit_MIN(gen, inst); + case TGSI_OPCODE_FLR: + return emit_FLR(gen, inst); + case TGSI_OPCODE_FRC: + return emit_FRC(gen, inst); case TGSI_OPCODE_END: return emit_END(gen); + case TGSI_OPCODE_IF: + return emit_IF(gen, inst); + case TGSI_OPCODE_ELSE: + return emit_ELSE(gen, inst); + case TGSI_OPCODE_ENDIF: + return emit_ENDIF(gen, inst); + + case TGSI_OPCODE_DDX: + return emit_DDX_DDY(gen, inst, true); + case TGSI_OPCODE_DDY: + return emit_DDX_DDY(gen, inst, false); + /* XXX lots more cases to do... */ default: + fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n", + inst->Instruction.Opcode); return false; } @@ -401,45 +1181,88 @@ emit_instruction(struct codegen *gen, /** + * Emit code for a TGSI immediate value (vector of four floats). + * This involves register allocation and initialization. + * XXX the initialization should be done by a "prepare" stage, not + * per quad execution! + */ +static boolean +emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed) +{ + int ch; + + assert(gen->num_imm < MAX_TEMPS); + + spe_comment(gen->f, -4, "IMMEDIATE:"); + + for (ch = 0; ch < 4; ch++) { + float val = immed->u.ImmediateFloat32[ch].Float; + int reg = spe_allocate_available_register(gen->f); + + if (reg < 0) + return false; + + /* update immediate map */ + gen->imm_regs[gen->num_imm][ch] = reg; + + /* emit initializer instruction */ + spe_load_float(gen->f, reg, val); + } + + gen->num_imm++; + + return true; +} + + + +/** * Emit "code" for a TGSI declaration. * We only care about TGSI TEMPORARY register declarations at this time. * For each TGSI TEMPORARY we allocate four SPE registers. */ -static void -emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl) +static boolean +emit_declaration(struct cell_context *cell, + struct codegen *gen, const struct tgsi_full_declaration *decl) { int i, ch; switch (decl->Declaration.File) { case TGSI_FILE_TEMPORARY: -#if DISASSEM - printf("Declare temp reg %d .. %d\n", - decl->DeclarationRange.First, - decl->DeclarationRange.Last); -#endif + if (cell->debug_flags & CELL_DEBUG_ASM) { + printf("Declare temp reg %d .. %d\n", + decl->DeclarationRange.First, + decl->DeclarationRange.Last); + } + for (i = decl->DeclarationRange.First; i <= decl->DeclarationRange.Last; i++) { + assert(i < MAX_TEMPS); for (ch = 0; ch < 4; ch++) { gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f); + if (gen->temp_regs[i][ch] < 0) + return false; /* out of regs */ } /* XXX if we run out of SPE registers, we need to spill * to SPU memory. someday... */ -#if DISASSEM - printf(" SPE regs: %d %d %d %d\n", - gen->temp_regs[i][0], - gen->temp_regs[i][1], - gen->temp_regs[i][2], - gen->temp_regs[i][3]); -#endif + if (cell->debug_flags & CELL_DEBUG_ASM) { + printf(" SPE regs: %d %d %d %d\n", + gen->temp_regs[i][0], + gen->temp_regs[i][1], + gen->temp_regs[i][2], + gen->temp_regs[i][3]); + } } break; default: ; /* ignore */ } + + return true; } @@ -472,10 +1295,12 @@ cell_gen_fragment_program(struct cell_context *cell, spe_allocate_register(f, gen.outputs_reg); spe_allocate_register(f, gen.constants_reg); -#if DISASSEM - printf("Begin %s\n", __FUNCTION__); - tgsi_dump(tokens, 0); -#endif + if (cell->debug_flags & CELL_DEBUG_ASM) { + spe_print_code(f, true); + spe_indent(f, 8); + printf("Begin %s\n", __FUNCTION__); + tgsi_dump(tokens, 0); + } tgsi_parse_init(&parse, tokens); @@ -484,25 +1309,22 @@ cell_gen_fragment_program(struct cell_context *cell, switch (parse.FullToken.Token.Type) { case TGSI_TOKEN_TYPE_IMMEDIATE: -#if 0 - if (!note_immediate(&gen, &parse.FullToken.FullImmediate )) - goto fail; -#endif + if (!emit_immediate(&gen, &parse.FullToken.FullImmediate)) + gen.error = true; break; case TGSI_TOKEN_TYPE_DECLARATION: - emit_declaration(&gen, &parse.FullToken.FullDeclaration); + if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration)) + gen.error = true; break; case TGSI_TOKEN_TYPE_INSTRUCTION: - if (!emit_instruction(&gen, &parse.FullToken.FullInstruction )) { + if (!emit_instruction(&gen, &parse.FullToken.FullInstruction)) gen.error = true; - } break; default: assert(0); - } } @@ -512,10 +1334,10 @@ cell_gen_fragment_program(struct cell_context *cell, return emit_END(&gen); } -#if DISASSEM - printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst); - printf("End %s\n", __FUNCTION__); -#endif + if (cell->debug_flags & CELL_DEBUG_ASM) { + printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst); + printf("End %s\n", __FUNCTION__); + } tgsi_parse_free( &parse ); diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 06219d4e98..1837b4c79b 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -60,6 +60,9 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa, struct spe_function *f, int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg) { + /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_ + * quantities. This only makes a difference for 32-bit Z values though. + */ ASSERT(dsa->depth.enabled); switch (dsa->depth.func) { @@ -79,28 +82,28 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa, case PIPE_FUNC_GREATER: /* zmask = (ifragZ > ref) */ - spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); + spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); /* mask = (mask & zmask) */ spe_and(f, mask_reg, mask_reg, zmask_reg); break; case PIPE_FUNC_LESS: /* zmask = (ref > ifragZ) */ - spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); + spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); /* mask = (mask & zmask) */ spe_and(f, mask_reg, mask_reg, zmask_reg); break; case PIPE_FUNC_LEQUAL: /* zmask = (ifragZ > ref) */ - spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); + spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); /* mask = (mask & ~zmask) */ spe_andc(f, mask_reg, mask_reg, zmask_reg); break; case PIPE_FUNC_GEQUAL: /* zmask = (ref > ifragZ) */ - spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); + spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); /* mask = (mask & ~zmask) */ spe_andc(f, mask_reg, mask_reg, zmask_reg); break; @@ -229,7 +232,27 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, spe_release_register(f, amask_reg); } +/* This pair of functions is used inline to allocate and deallocate + * optional constant registers. Once a constant is discovered to be + * needed, we will likely need it again, so we don't want to deallocate + * it and have to allocate and load it again unnecessarily. + */ +static inline void +setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value) +{ + if (*is_already_set) return; + *r = spe_allocate_available_register(f); + spe_load_float(f, *r, value); + *is_already_set = true; +} +static inline void +release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r) +{ + if (!*is_already_set) return; + spe_release_register(f, r); + *is_already_set = false; +} /** * Generate SPE code to implement the given blend mode for a quad of pixels. @@ -242,6 +265,7 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, */ static void gen_blend(const struct pipe_blend_state *blend, + const struct pipe_blend_color *blend_color, struct spe_function *f, enum pipe_format color_format, int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg, @@ -262,10 +286,17 @@ gen_blend(const struct pipe_blend_state *blend, int fbB_reg = spe_allocate_available_register(f); int fbA_reg = spe_allocate_available_register(f); - int one_reg = spe_allocate_available_register(f); int tmp_reg = spe_allocate_available_register(f); - boolean one_reg_set = false; /* avoid setting one_reg more than once */ + /* Optional constant registers we might or might not end up using; + * if we do use them, make sure we only allocate them once by + * keeping a flag on each one. + */ + boolean one_reg_set = false; + unsigned int one_reg; + boolean constR_reg_set = false, constG_reg_set = false, + constB_reg_set = false, constA_reg_set = false; + unsigned int constR_reg, constG_reg, constB_reg, constA_reg; ASSERT(blend->blend_enable); @@ -346,127 +377,449 @@ gen_blend(const struct pipe_blend_state *blend, spe_release_register(f, mask_reg); } - /* - * Compute Src RGB terms + * Compute Src RGB terms. We're actually looking for the value + * of (the appropriate RGB factors) * (the incoming source RGB color), + * because in some cases (like PIPE_BLENDFACTOR_ONE and + * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math. */ switch (blend->rgb_src_factor) { case PIPE_BLENDFACTOR_ONE: + /* factors = (1,1,1), so term = (R,G,B) */ spe_move(f, term1R_reg, fragR_reg); spe_move(f, term1G_reg, fragG_reg); spe_move(f, term1B_reg, fragB_reg); break; case PIPE_BLENDFACTOR_ZERO: - spe_zero(f, term1R_reg); - spe_zero(f, term1G_reg); - spe_zero(f, term1B_reg); + /* factors = (0,0,0), so term = (0,0,0) */ + spe_load_float(f, term1R_reg, 0.0f); + spe_load_float(f, term1G_reg, 0.0f); + spe_load_float(f, term1B_reg, 0.0f); break; case PIPE_BLENDFACTOR_SRC_COLOR: + /* factors = (R,G,B), so term = (R*R, G*G, B*B) */ spe_fm(f, term1R_reg, fragR_reg, fragR_reg); spe_fm(f, term1G_reg, fragG_reg, fragG_reg); spe_fm(f, term1B_reg, fragB_reg, fragB_reg); break; case PIPE_BLENDFACTOR_SRC_ALPHA: + /* factors = (A,A,A), so term = (R*A, G*A, B*A) */ spe_fm(f, term1R_reg, fragR_reg, fragA_reg); spe_fm(f, term1G_reg, fragG_reg, fragA_reg); spe_fm(f, term1B_reg, fragB_reg, fragA_reg); break; - /* XXX more cases */ + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) + * or in other words term = (R-R*R, G-G*G, B-B*B) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */ + spe_fm(f, term1R_reg, fragR_reg, fbR_reg); + spe_fm(f, term1G_reg, fragG_reg, fbG_reg); + spe_fm(f, term1B_reg, fragB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb)) + * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A)) + * or term = (R-R*A,G-G*A,B-B*A) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */ + spe_fm(f, term1R_reg, fragR_reg, fbA_reg); + spe_fm(f, term1G_reg, fragG_reg, fbA_reg); + spe_fm(f, term1B_reg, fragB_reg, fbA_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) + * or term = (R-R*Afb,G-G*Afb,b-B*Afb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]); + /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */ + spe_fm(f, term1R_reg, fragR_reg, constR_reg); + spe_fm(f, term1G_reg, fragG_reg, constG_reg); + spe_fm(f, term1B_reg, fragB_reg, constB_reg); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + /* we'll need the optional constant alpha register */ + setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]); + /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */ + spe_fm(f, term1R_reg, fragR_reg, constA_reg); + spe_fm(f, term1G_reg, fragG_reg, constA_reg); + spe_fm(f, term1B_reg, fragB_reg, constA_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]); + /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) + * or term = (R-R*Rc, G-G*Gc, B-B*Bc) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]); + /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac)) + * or term = (R-R*Ac,G-G*Ac,B-B*Ac) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + /* We'll need the optional {1,1,1,1} register */ + setup_const_register(f, &one_reg_set, &one_reg, 1.0f); + /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so + * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb)) + * We could expand the term (as a*min(b,c) == min(a*b,a*c) + * as long as a is positive), but then we'd have to do three + * spe_float_min() functions instead of one, so this is simpler. + */ + /* tmp = 1 - Afb */ + spe_fs(f, tmp_reg, one_reg, fbA_reg); + /* tmp = min(A,tmp) */ + spe_float_min(f, tmp_reg, fragA_reg, tmp_reg); + /* term = R*tmp */ + spe_fm(f, term1R_reg, fragR_reg, tmp_reg); + spe_fm(f, term1G_reg, fragG_reg, tmp_reg); + spe_fm(f, term1B_reg, fragB_reg, tmp_reg); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: ASSERT(0); } /* - * Compute Src Alpha term + * Compute Src Alpha term. Like the above, we're looking for + * the full term A*factor, not just the factor itself, because + * in many cases we can avoid doing unnecessary multiplies. */ switch (blend->alpha_src_factor) { + case PIPE_BLENDFACTOR_ZERO: + /* factor = 0, so term = 0 */ + spe_load_float(f, term1A_reg, 0.0f); + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */ case PIPE_BLENDFACTOR_ONE: + /* factor = 1, so term = A */ spe_move(f, term1A_reg, fragA_reg); break; + case PIPE_BLENDFACTOR_SRC_COLOR: + /* factor = A, so term = A*A */ spe_fm(f, term1A_reg, fragA_reg, fragA_reg); break; case PIPE_BLENDFACTOR_SRC_ALPHA: spe_fm(f, term1A_reg, fragA_reg, fragA_reg); break; - /* XXX more cases */ + + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factor = 1-A, so term = A*(1-A) = A-A*A */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_DST_COLOR: + /* factor = Afb, so term = A*Afb */ + spe_fm(f, term1A_reg, fragA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg); + break; + + case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]); + /* factor = Ac, so term = A*Ac */ + spe_fm(f, term1A_reg, fragA_reg, constA_reg); + break; + + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]); + /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: default: ASSERT(0); } /* - * Compute Dest RGB terms + * Compute Dest RGB term. Like the above, we're looking for + * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because + * in many cases we can avoid doing unnecessary multiplies. */ switch (blend->rgb_dst_factor) { case PIPE_BLENDFACTOR_ONE: + /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */ spe_move(f, term2R_reg, fbR_reg); spe_move(f, term2G_reg, fbG_reg); spe_move(f, term2B_reg, fbB_reg); break; case PIPE_BLENDFACTOR_ZERO: - spe_zero(f, term2R_reg); - spe_zero(f, term2G_reg); - spe_zero(f, term2B_reg); + /* factor s= (0,0,0), so term = (0,0,0) */ + spe_load_float(f, term2R_reg, 0.0f); + spe_load_float(f, term2G_reg, 0.0f); + spe_load_float(f, term2B_reg, 0.0f); break; case PIPE_BLENDFACTOR_SRC_COLOR: + /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */ spe_fm(f, term2R_reg, fbR_reg, fragR_reg); spe_fm(f, term2G_reg, fbG_reg, fragG_reg); spe_fm(f, term2B_reg, fbB_reg, fragB_reg); break; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B)) + * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg); + spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg); + spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg); + break; case PIPE_BLENDFACTOR_SRC_ALPHA: + /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */ spe_fm(f, term2R_reg, fbR_reg, fragA_reg); spe_fm(f, term2G_reg, fbG_reg, fragA_reg); spe_fm(f, term2B_reg, fbB_reg, fragA_reg); break; case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - /* one = {1.0, 1.0, 1.0, 1.0} */ - if (!one_reg_set) { - spe_load_float(f, one_reg, 1.0f); - one_reg_set = true; - } - /* tmp = one - fragA */ - spe_fs(f, tmp_reg, one_reg, fragA_reg); - /* term = fb * tmp */ - spe_fm(f, term2R_reg, fbR_reg, tmp_reg); - spe_fm(f, term2G_reg, fbG_reg, tmp_reg); - spe_fm(f, term2B_reg, fbB_reg, tmp_reg); - break; - /* XXX more cases */ + /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */ + spe_fm(f, term2R_reg, fbR_reg, fbR_reg); + spe_fm(f, term2G_reg, fbG_reg, fbG_reg); + spe_fm(f, term2B_reg, fbB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb)) + * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: + /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */ + spe_fm(f, term2R_reg, fbR_reg, fbA_reg); + spe_fm(f, term2G_reg, fbG_reg, fbA_reg); + spe_fm(f, term2B_reg, fbB_reg, fbA_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb)) + * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]); + /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */ + spe_fm(f, term2R_reg, fbR_reg, constR_reg); + spe_fm(f, term2G_reg, fbG_reg, constG_reg); + spe_fm(f, term2B_reg, fbB_reg, constB_reg); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + /* we'll need the optional constant alpha register */ + setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]); + /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */ + spe_fm(f, term2R_reg, fbR_reg, constA_reg); + spe_fm(f, term2G_reg, fbG_reg, constA_reg); + spe_fm(f, term2B_reg, fbB_reg, constA_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]); + /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc)) + * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]); + /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac)) + * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */ + ASSERT(0); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: ASSERT(0); } /* - * Compute Dest Alpha term + * Compute Dest Alpha term. Like the above, we're looking for + * the full term Afb*factor, not just the factor itself, because + * in many cases we can avoid doing unnecessary multiplies. */ switch (blend->alpha_dst_factor) { case PIPE_BLENDFACTOR_ONE: + /* factor = 1, so term = Afb */ spe_move(f, term2A_reg, fbA_reg); break; case PIPE_BLENDFACTOR_ZERO: - spe_zero(f, term2A_reg); + /* factor = 0, so term = 0 */ + spe_load_float(f, term2A_reg, 0.0f); break; - case PIPE_BLENDFACTOR_SRC_ALPHA: + + case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_SRC_COLOR: + /* factor = A, so term = Afb*A */ spe_fm(f, term2A_reg, fbA_reg, fragA_reg); break; - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - /* one = {1.0, 1.0, 1.0, 1.0} */ - if (!one_reg_set) { - spe_load_float(f, one_reg, 1.0f); - one_reg_set = true; - } - /* tmp = one - fragA */ - spe_fs(f, tmp_reg, one_reg, fragA_reg); - /* termA = fbA * tmp */ - spe_fm(f, term2A_reg, fbA_reg, tmp_reg); + + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_DST_COLOR: + /* factor = Afb, so term = Afb*Afb */ + spe_fm(f, term2A_reg, fbA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]); + /* factor = Ac, so term = Afb*Ac */ + spe_fm(f, term2A_reg, fbA_reg, constA_reg); + break; + + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]); + /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg); break; - /* XXX more cases */ + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */ + ASSERT(0); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: default: ASSERT(0); } /* - * Combine Src/Dest RGB terms + * Combine Src/Dest RGB terms as per the blend equation. */ switch (blend->rgb_func) { case PIPE_BLEND_ADD: @@ -479,7 +832,21 @@ gen_blend(const struct pipe_blend_state *blend, spe_fs(f, fragG_reg, term1G_reg, term2G_reg); spe_fs(f, fragB_reg, term1B_reg, term2B_reg); break; - /* XXX more cases */ + case PIPE_BLEND_REVERSE_SUBTRACT: + spe_fs(f, fragR_reg, term2R_reg, term1R_reg); + spe_fs(f, fragG_reg, term2G_reg, term1G_reg); + spe_fs(f, fragB_reg, term2B_reg, term1B_reg); + break; + case PIPE_BLEND_MIN: + spe_float_min(f, fragR_reg, term1R_reg, term2R_reg); + spe_float_min(f, fragG_reg, term1G_reg, term2G_reg); + spe_float_min(f, fragB_reg, term1B_reg, term2B_reg); + break; + case PIPE_BLEND_MAX: + spe_float_max(f, fragR_reg, term1R_reg, term2R_reg); + spe_float_max(f, fragG_reg, term1G_reg, term2G_reg); + spe_float_max(f, fragB_reg, term1B_reg, term2B_reg); + break; default: ASSERT(0); } @@ -494,7 +861,15 @@ gen_blend(const struct pipe_blend_state *blend, case PIPE_BLEND_SUBTRACT: spe_fs(f, fragA_reg, term1A_reg, term2A_reg); break; - /* XXX more cases */ + case PIPE_BLEND_REVERSE_SUBTRACT: + spe_fs(f, fragA_reg, term2A_reg, term1A_reg); + break; + case PIPE_BLEND_MIN: + spe_float_min(f, fragA_reg, term1A_reg, term2A_reg); + break; + case PIPE_BLEND_MAX: + spe_float_max(f, fragA_reg, term1A_reg, term2A_reg); + break; default: ASSERT(0); } @@ -514,8 +889,14 @@ gen_blend(const struct pipe_blend_state *blend, spe_release_register(f, fbB_reg); spe_release_register(f, fbA_reg); - spe_release_register(f, one_reg); spe_release_register(f, tmp_reg); + + /* Free any optional registers that actually got used */ + release_const_register(f, &one_reg_set, one_reg); + release_const_register(f, &constR_reg_set, constR_reg); + release_const_register(f, &constG_reg_set, constG_reg); + release_const_register(f, &constB_reg_set, constB_reg); + release_const_register(f, &constA_reg_set, constA_reg); } @@ -524,8 +905,69 @@ gen_logicop(const struct pipe_blend_state *blend, struct spe_function *f, int fragRGBA_reg, int fbRGBA_reg) { - /* XXX to-do */ - /* operate on 32-bit packed pixels, not float colors */ + /* We've got four 32-bit RGBA packed pixels in each of + * fragRGBA_reg and fbRGBA_reg, not sets of floating-point + * reds, greens, blues, and alphas. + * */ + ASSERT(blend->logicop_enable); + + switch(blend->logicop_func) { + case PIPE_LOGICOP_CLEAR: /* 0 */ + spe_zero(f, fragRGBA_reg); + break; + case PIPE_LOGICOP_NOR: /* ~(s | d) */ + spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */ + /* andc R, A, B computes R = A & ~B */ + spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_COPY_INVERTED: /* ~s */ + spe_complement(f, fragRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */ + /* andc R, A, B computes R = A & ~B */ + spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_INVERT: /* ~d */ + /* Note that (A nor A) == ~(A|A) == ~A */ + spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_XOR: /* s ^ d */ + spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_NAND: /* ~(s & d) */ + spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_AND: /* s & d */ + spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */ + spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + spe_complement(f, fragRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_NOOP: /* d */ + spe_move(f, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */ + /* orc R, A, B computes R = A | ~B */ + spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_COPY: /* s */ + break; + case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */ + /* orc R, A, B computes R = A | ~B */ + spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_OR: /* s | d */ + spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_SET: /* 1 */ + spe_load_int(f, fragRGBA_reg, 0xffffffff); + break; + default: + ASSERT(0); + } } @@ -534,14 +976,84 @@ gen_colormask(uint colormask, struct spe_function *f, int fragRGBA_reg, int fbRGBA_reg) { - /* XXX to-do */ - /* operate on 32-bit packed pixels, not float colors */ -} + /* We've got four 32-bit RGBA packed pixels in each of + * fragRGBA_reg and fbRGBA_reg, not sets of floating-point + * reds, greens, blues, and alphas. + * */ + + /* The color mask operation can prevent any set of color + * components in the incoming fragment from being written to the frame + * buffer; we do this by replacing the masked components of the + * fragment with the frame buffer values. + * + * There are only 16 possibilities, with a unique mask for + * each of the possibilities. (Technically, there are only 15 + * possibilities, since we shouldn't be called for the one mask + * that does nothing, but the complete implementation is here + * anyway to avoid confusion.) + * + * We implement this via a constant static array which we'll index + * into to get the correct mask. + * + * We're dependent on the mask values being low-order bits, + * with particular values for each bit; so we start with a + * few assertions, which will fail if any of the values were + * to change. + */ + ASSERT(PIPE_MASK_R == 0x1); + ASSERT(PIPE_MASK_G == 0x2); + ASSERT(PIPE_MASK_B == 0x4); + ASSERT(PIPE_MASK_A == 0x8); + /* Here's the list of all possible colormasks, indexed by the + * value of the combined mask specifier. + */ + static const unsigned int colormasks[16] = { + 0x00000000, /* 0: all colors masked */ + 0xff000000, /* 1: PIPE_MASK_R */ + 0x00ff0000, /* 2: PIPE_MASK_G */ + 0xffff0000, /* 3: PIPE_MASK_R | PIPE_MASK_G */ + 0x0000ff00, /* 4: PIPE_MASK_B */ + 0xff00ff00, /* 5: PIPE_MASK_R | PIPE_MASK_B */ + 0x00ffff00, /* 6: PIPE_MASK_G | PIPE_MASK_B */ + 0xffffff00, /* 7: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B */ + 0x000000ff, /* 8: PIPE_MASK_A */ + 0xff0000ff, /* 9: PIPE_MASK_R | PIPE_MASK_A */ + 0x00ff00ff, /* 10: PIPE_MASK_G | PIPE_MASK_A */ + 0xffff00ff, /* 11: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_A */ + 0x0000ffff, /* 12: PIPE_MASK_B | PIPE_MASK_A */ + 0xff00ffff, /* 13: PIPE_MASK_R | PIPE_MASK_B | PIPE_MASK_A */ + 0x00ffffff, /* 14: PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A */ + 0xffffffff /* 15: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A */ + }; + + /* Get a temporary register to hold the mask */ + int colormask_reg = spe_allocate_available_register(f); + + /* Look up the desired mask directly and load it into the mask register. + * This will load the same mask into each of the four words in the + * mask register. + */ + spe_load_uint(f, colormask_reg, colormasks[colormask]); + + /* Use the mask register to select between the fragment color + * values and the frame buffer color values. Wherever the + * mask has a 0 bit, the current frame buffer color should override + * the fragment color. Wherever the mask has a 1 bit, the + * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM) + * instruction will select bits from its first operand rA wherever the + * the mask bits rM are 0, and from its second operand rB wherever the + * mask bits rM are 1. That means that the frame buffer color is the + * first operand, and the fragment color the second. + */ + spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg); + /* Release the temporary register and we're done */ + spe_release_register(f, colormask_reg); +} /** - * Generate code to pack a quad of float colors into a four 32-bit integers. + * Generate code to pack a quad of float colors into four 32-bit integers. * * \param f SPE function to append instruction onto. * \param color_format the dest color packing format @@ -557,13 +1069,16 @@ gen_pack_colors(struct spe_function *f, int r_reg, int g_reg, int b_reg, int a_reg, int rgba_reg) { + int rg_reg = spe_allocate_available_register(f); + int ba_reg = spe_allocate_available_register(f); + /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */ spe_cfltu(f, r_reg, r_reg, 32); spe_cfltu(f, g_reg, g_reg, 32); spe_cfltu(f, b_reg, b_reg, 32); spe_cfltu(f, a_reg, a_reg, 32); - /* Shift the most significant bytes to least the significant positions. + /* Shift the most significant bytes to the least significant positions. * I.e.: reg = reg >> 24 */ spe_rotmi(f, r_reg, r_reg, -24); @@ -595,9 +1110,12 @@ gen_pack_colors(struct spe_function *f, * OR-ing all those together gives us four packed colors: * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699} */ - spe_or(f, rgba_reg, r_reg, g_reg); - spe_or(f, rgba_reg, rgba_reg, b_reg); - spe_or(f, rgba_reg, rgba_reg, a_reg); + spe_or(f, rg_reg, r_reg, g_reg); + spe_or(f, ba_reg, a_reg, b_reg); + spe_or(f, rgba_reg, rg_reg, ba_reg); + + spe_release_register(f, rg_reg); + spe_release_register(f, ba_reg); } @@ -629,6 +1147,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) const struct pipe_depth_stencil_alpha_state *dsa = &cell->depth_stencil->base; const struct pipe_blend_state *blend = &cell->blend->base; + const struct pipe_blend_color *blend_color = &cell->blend_color; const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format; /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */ @@ -652,6 +1171,13 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) int fbZS_reg; /**< framebuffer's combined z/stencil values for quad */ spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); + + if (cell->debug_flags & CELL_DEBUG_ASM) { + spe_print_code(f, true); + spe_indent(f, 8); + spe_comment(f, -4, "Begin per-fragment ops"); + } + spe_allocate_register(f, x_reg); spe_allocate_register(f, y_reg); spe_allocate_register(f, color_tile_reg); @@ -710,34 +1236,50 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_release_register(f, mask_reg); /* OK, fbZ_reg has four 24-bit Z values now */ } + else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || + zs_format == PIPE_FORMAT_Z24X8_UNORM) { + spe_rotmi(f, fbZ_reg, fbZS_reg, -8); /* fbZ = fbZS >> 8 */ + /* OK, fbZ_reg has four 24-bit Z values now */ + } + else if (zs_format == PIPE_FORMAT_Z32_UNORM) { + spe_move(f, fbZ_reg, fbZS_reg); + /* OK, fbZ_reg has four 32-bit Z values now */ + } + else if (zs_format == PIPE_FORMAT_Z16_UNORM) { + spe_move(f, fbZ_reg, fbZS_reg); + /* OK, fbZ_reg has four 16-bit Z values now */ + } else { - /* XXX handle other z/stencil formats */ - ASSERT(0); + ASSERT(0); /* invalid format */ } - /* Convert fragZ values from float[4] to uint[4] */ + /* Convert fragZ values from float[4] to 16, 24 or 32-bit uint[4] */ if (zs_format == PIPE_FORMAT_S8Z24_UNORM || zs_format == PIPE_FORMAT_X8Z24_UNORM || zs_format == PIPE_FORMAT_Z24S8_UNORM || zs_format == PIPE_FORMAT_Z24X8_UNORM) { - /* 24-bit Z values */ - int scale_reg = spe_allocate_available_register(f); - - /* scale_reg[0,1,2,3] = float(2^24-1) */ - spe_load_float(f, scale_reg, (float) 0xffffff); - - /* XXX these two instructions might be combined */ - spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 0); /* fragZ = (int) fragZ */ - - spe_release_register(f, scale_reg); + /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + /* fragZ = fragZ >> 8 */ + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); } - else { - /* XXX handle 16-bit Z format */ - ASSERT(0); + else if (zs_format == PIPE_FORMAT_Z32_UNORM) { + /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); } + else if (zs_format == PIPE_FORMAT_Z16_UNORM) { + /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + /* fragZ = fragZ >> 16 */ + spe_rotmi(f, fragZ_reg, fragZ_reg, -16); + } + } + else { + /* no Z test, but set Z to zero so we don't OR-in garbage below */ + spe_load_uint(f, fbZ_reg, 0); /* XXX set to zero for now */ } + if (dsa->stencil[0].enabled) { /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */ if (zs_format == PIPE_FORMAT_S8Z24_UNORM || @@ -751,7 +1293,10 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) ASSERT(0); } } - + else { + /* no stencil test, but set to zero so we don't OR-in garbage below */ + spe_load_uint(f, fbS_reg, 0); /* XXX set to zero for now */ + } if (dsa->stencil[0].enabled) { /* XXX this may involve depth testing too */ @@ -779,22 +1324,22 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ } - else if (zs_format == PIPE_FORMAT_S8Z24_UNORM || - zs_format == PIPE_FORMAT_X8Z24_UNORM) { - /* XXX to do */ - ASSERT(0); + else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || + zs_format == PIPE_FORMAT_Z24X8_UNORM) { + spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ + } + else if (zs_format == PIPE_FORMAT_Z32_UNORM) { + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ } else if (zs_format == PIPE_FORMAT_Z16_UNORM) { - /* XXX to do */ - ASSERT(0); + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ } else if (zs_format == PIPE_FORMAT_S8_UNORM) { - /* XXX to do */ - ASSERT(0); + ASSERT(0); /* XXX to do */ } else { - /* bad zs_format */ - ASSERT(0); + ASSERT(0); /* bad zs_format */ } /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */ @@ -816,7 +1361,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) if (blend->blend_enable) { - gen_blend(blend, f, color_format, + gen_blend(blend, blend_color, f, color_format, fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg); } @@ -837,7 +1382,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) gen_logicop(blend, f, rgba_reg, fbRGBA_reg); } - if (blend->colormask != 0xf) { + if (blend->colormask != PIPE_MASK_RGBA) { gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg); } @@ -866,5 +1411,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_release_register(f, fbRGBA_reg); spe_release_register(f, fbZS_reg); spe_release_register(f, quad_offset_reg); -} + if (cell->debug_flags & CELL_DEBUG_ASM) { + spe_comment(f, -4, "End per-fragment ops"); + } +} diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c index 475c6ef0ce..ea820aca74 100644 --- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c +++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c @@ -72,7 +72,9 @@ cell_delete_blend_state(struct pipe_context *pipe, void *blend) { struct cell_blend_state *cb = (struct cell_blend_state *) blend; +#if 0 spe_release_func(& cb->code); +#endif FREE(cb); } @@ -128,7 +130,9 @@ cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *depth) struct cell_depth_stencil_alpha_state *cdsa = (struct cell_depth_stencil_alpha_state *) depth; +#if 0 spe_release_func(& cdsa->code); +#endif FREE(cdsa); } diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c index 139b3719b6..47ba6fa290 100644 --- a/src/gallium/drivers/cell/ppu/cell_screen.c +++ b/src/gallium/drivers/cell/ppu/cell_screen.c @@ -58,9 +58,9 @@ cell_get_param(struct pipe_screen *screen, int param) case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: return CELL_MAX_SAMPLERS; case PIPE_CAP_NPOT_TEXTURES: - return 0; + return 1; case PIPE_CAP_TWO_SIDED_STENCIL: - return 0; + return 1; case PIPE_CAP_GLSL: return 1; case PIPE_CAP_S3TC: @@ -68,13 +68,13 @@ cell_get_param(struct pipe_screen *screen, int param) case PIPE_CAP_ANISOTROPIC_FILTER: return 0; case PIPE_CAP_POINT_SPRITE: - return 0; + return 1; case PIPE_CAP_MAX_RENDER_TARGETS: return 1; case PIPE_CAP_OCCLUSION_QUERY: - return 0; + return 1; case PIPE_CAP_TEXTURE_SHADOW_MAP: - return 0; + return 10; case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: return 12; /* max 2Kx2K */ case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: @@ -82,7 +82,7 @@ cell_get_param(struct pipe_screen *screen, int param) case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: return 12; /* max 2Kx2K */ default: - return 0; + return 10; } } @@ -108,7 +108,7 @@ cell_get_paramf(struct pipe_screen *screen, int param) return 16.0; /* arbitrary */ default: - return 0; + return 10; } } diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 2da3097983..8a389cd6aa 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -100,14 +100,19 @@ cell_emit_state(struct cell_context *cell) = cell_batch_alloc(cell, sizeof(*fops)); struct spe_function spe_code; + /* Prepare the buffer that will hold the generated code. */ + spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); + /* generate new code */ cell_gen_fragment_function(cell, &spe_code); + /* put the new code into the batch buffer */ fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS; memcpy(&fops->code, spe_code.store, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); fops->dsa = cell->depth_stencil->base; fops->blend = cell->blend->base; + /* free codegen buffer */ spe_release_func(&spe_code); } diff --git a/src/gallium/drivers/cell/spu/.gitignore b/src/gallium/drivers/cell/spu/.gitignore new file mode 100644 index 0000000000..2be9a2d324 --- /dev/null +++ b/src/gallium/drivers/cell/spu/.gitignore @@ -0,0 +1 @@ +g3d_spu diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 78260c4259..b4d30228f7 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -50,7 +50,31 @@ helpful headers: /opt/cell/sdk/usr/include/libmisc.h */ +/* Set to 0 to disable all extraneous debugging code */ +#define DEBUG 1 + +#if DEBUG boolean Debug = FALSE; +boolean force_fragment_ops_fallback = TRUE; + +/* These debug macros use the unusual construction ", ##__VA_ARGS__" + * which expands to the expected comma + args if variadic arguments + * are supplied, but swallows the comma if there are no variadic + * arguments (which avoids syntax errors that would otherwise occur). + */ +#define DEBUG_PRINTF(format,...) \ + if (Debug) \ + printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__) +#define D_PRINTF(flag, format,...) \ + if (spu.init.debug_flags & (flag)) \ + printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__) + +#else + +#define DEBUG_PRINTF(...) +#define D_PRINTF(...) + +#endif struct spu_global spu; @@ -133,9 +157,7 @@ really_clear_tiles(uint surfaceIndex) static void cmd_clear_surface(const struct cell_command_clear_surface *clear) { - if (Debug) - printf("SPU %u: CLEAR SURF %u to 0x%08x\n", spu.init.id, - clear->surface, clear->value); + DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value); if (clear->surface == 0) { spu.fb.color_clear_value = clear->value; @@ -203,17 +225,14 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear) #endif /* CLEAR_OPT */ - if (Debug) - printf("SPU %u: CLEAR SURF done\n", spu.init.id); + DEBUG_PRINTF("CLEAR SURF done\n"); } static void cmd_release_verts(const struct cell_command_release_verts *release) { - if (Debug) - printf("SPU %u: RELEASE VERTS %u\n", - spu.init.id, release->vertex_buf); + DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf); ASSERT(release->vertex_buf != ~0U); release_buffer(release->vertex_buf); } @@ -228,16 +247,38 @@ cmd_release_verts(const struct cell_command_release_verts *release) static void cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) { - if (Debug) - printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id); + static int warned = 0; + + DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n"); /* Copy SPU code from batch buffer to spu buffer */ memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); /* Copy state info (for fallback case only) */ memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); - /* Point function pointer at new code */ - spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code; + /* Parity twist! For now, always use the fallback code by default, + * only switching to codegen when specifically requested. This + * allows us to develop freely without risking taking down the + * branch. + * + * Later, the parity of this check will be reversed, so that + * codegen is *always* used, unless we specifically indicate that + * we don't want it. + * + * Eventually, the option will be removed completely, because in + * final code we'll always use codegen and won't even provide the + * raw state records that the fallback code requires. + */ + if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) { + spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code; + } + else { + /* otherwise, the default fallback code remains in place */ + if (!warned) { + fprintf(stderr, "Cell Warning: using fallback per-fragment code\n"); + warned = 1; + } + } spu.read_depth = spu.depth_stencil_alpha.depth.enabled; spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled; @@ -247,8 +288,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) static void cmd_state_fragment_program(const struct cell_command_fragment_program *fp) { - if (Debug) - printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id); + DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n"); /* Copy SPU code from batch buffer to spu buffer */ memcpy(spu.fragment_program_code, fp->code, SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4); @@ -262,9 +302,7 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp) static void cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) { - if (Debug) - printf("SPU %u: FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n", - spu.init.id, + DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n", cmd->width, cmd->height, cmd->color_start, @@ -309,9 +347,7 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) static void cmd_state_sampler(const struct cell_command_sampler *sampler) { - if (Debug) - printf("SPU %u: SAMPLER [%u]\n", - spu.init.id, sampler->unit); + DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit); spu.sampler[sampler->unit] = sampler->state; if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) @@ -328,11 +364,9 @@ cmd_state_texture(const struct cell_command_texture *texture) const uint width = texture->width; const uint height = texture->height; - if (Debug) { - printf("SPU %u: TEXTURE [%u] at %p size %u x %u\n", spu.init.id, + DEBUG_PRINTF("TEXTURE [%u] at %p size %u x %u\n", texture->unit, texture->start, texture->width, texture->height); - } spu.texture[unit].start = texture->start; spu.texture[unit].width = width; @@ -351,10 +385,7 @@ cmd_state_texture(const struct cell_command_texture *texture) static void cmd_state_vertex_info(const struct vertex_info *vinfo) { - if (Debug) { - printf("SPU %u: VERTEX_INFO num_attribs=%u\n", spu.init.id, - vinfo->num_attribs); - } + DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs); ASSERT(vinfo->num_attribs >= 1); ASSERT(vinfo->num_attribs <= 8); memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo)); @@ -393,8 +424,7 @@ cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code) static void cmd_finish(void) { - if (Debug) - printf("SPU %u: FINISH\n", spu.init.id); + DEBUG_PRINTF("FINISH\n"); really_clear_tiles(0); /* wait for all outstanding DMAs to finish */ mfc_write_tag_mask(~0); @@ -419,9 +449,8 @@ cmd_batch(uint opcode) const unsigned usize = size / sizeof(buffer[0]); uint pos; - if (Debug) - printf("SPU %u: BATCH buffer %u, len %u, from %p\n", - spu.init.id, buf, size, spu.init.buffers[buf]); + DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n", + buf, size, spu.init.buffers[buf]); ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH); @@ -440,8 +469,7 @@ cmd_batch(uint opcode) wait_on_mask(1 << TAG_BATCH_BUFFER); /* Tell PPU we're done copying the buffer to local store */ - if (Debug) - printf("SPU %u: release batch buf %u\n", spu.init.id, buf); + DEBUG_PRINTF("release batch buf %u\n", buf); release_buffer(buf); /* @@ -571,8 +599,7 @@ cmd_batch(uint opcode) } } - if (Debug) - printf("SPU %u: BATCH complete\n", spu.init.id); + DEBUG_PRINTF("BATCH complete\n"); } @@ -585,8 +612,7 @@ main_loop(void) struct cell_command cmd; int exitFlag = 0; - if (Debug) - printf("SPU %u: Enter main loop\n", spu.init.id); + DEBUG_PRINTF("Enter main loop\n"); ASSERT((sizeof(struct cell_command) & 0xf) == 0); ASSERT_ALIGN16(&cmd); @@ -595,14 +621,12 @@ main_loop(void) unsigned opcode; int tag = 0; - if (Debug) - printf("SPU %u: Wait for cmd...\n", spu.init.id); + DEBUG_PRINTF("Wait for cmd...\n"); /* read/wait from mailbox */ opcode = (unsigned int) spu_read_in_mbox(); - if (Debug) - printf("SPU %u: got cmd 0x%x\n", spu.init.id, opcode); + DEBUG_PRINTF("got cmd 0x%x\n", opcode); /* command payload */ mfc_get(&cmd, /* dest */ @@ -619,8 +643,7 @@ main_loop(void) switch (opcode & CELL_CMD_OPCODE_MASK) { case CELL_CMD_EXIT: - if (Debug) - printf("SPU %u: EXIT\n", spu.init.id); + DEBUG_PRINTF("EXIT\n"); exitFlag = 1; break; case CELL_CMD_VS_EXECUTE: @@ -632,13 +655,12 @@ main_loop(void) cmd_batch(opcode); break; default: - printf("Bad opcode!\n"); + printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK); } } - if (Debug) - printf("SPU %u: Exit main loop\n", spu.init.id); + DEBUG_PRINTF("Exit main loop\n"); spu_dcache_report(); } @@ -653,7 +675,8 @@ one_time_init(void) invalidate_tex_cache(); /* Install default/fallback fragment processing function. - * This will normally be overriden by a code-gen'd function. + * This will normally be overriden by a code-gen'd function + * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set. */ spu.fragment_ops = spu_fallback_fragment_ops; } @@ -682,11 +705,13 @@ main(main_param_t speid, main_param_t argp) ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4); ASSERT(sizeof(struct cell_command_render) % 8 == 0); + ASSERT(((unsigned long) &spu.fragment_ops_code) % 32 == 0); + ASSERT(((unsigned long) &spu.fragment_program_code) % 32 == 0); one_time_init(); - if (Debug) - printf("SPU: main() speid=%lu\n", (unsigned long) speid); + DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid); + D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n"); mfc_get(&spu.init, /* dest */ (unsigned int) argp, /* src */ @@ -698,7 +723,7 @@ main(main_param_t speid, main_param_t argp) #if 0 if (spu.init.id==0) - spu_test_misc(); + spu_test_misc(spu.init.id); #endif main_loop(); diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 2c7b625840..72e540fcff 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -143,13 +143,13 @@ struct spu_global ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; - /** Current fragment ops machine code */ - uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS]; + /** Current fragment ops machine code, at 32-byte boundary */ + uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN32_ATTRIB; /** Current fragment ops function */ spu_fragment_ops_func fragment_ops; - /** Current fragment program machine code */ - uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS]; + /** Current fragment program machine code, at 32-byte boundary */ + uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN32_ATTRIB; /** Current fragment ops function */ spu_fragment_program_func fragment_program; diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c index 03dd547845..f107764fb2 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -60,9 +60,12 @@ spu_fallback_fragment_ops(uint x, uint y, vector unsigned int mask) { vector float frag_aos[4]; - unsigned int c0, c1, c2, c3; + unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */ + unsigned int fragc0, fragc1, fragc2, fragc3; /* fragment colors */ - /* do alpha test */ + /* + * Do alpha test + */ if (spu.depth_stencil_alpha.alpha.enabled) { vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref); vector unsigned int amask; @@ -102,7 +105,10 @@ spu_fallback_fragment_ops(uint x, uint y, mask = spu_and(mask, amask); } - /* Z and/or stencil testing... */ + + /* + * Z and/or stencil testing... + */ if (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled) { @@ -178,6 +184,32 @@ spu_fallback_fragment_ops(uint x, uint y, } } + + /* + * If we'll need the current framebuffer/tile colors for blending + * or logicop or colormask, fetch them now. + */ + if (spu.blend.blend_enable || + spu.blend.logicop_enable || + spu.blend.colormask != 0xf) { + +#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */ + fbc0 = colorTile->ui[y][x*2+0]; + fbc1 = colorTile->ui[y][x*2+1]; + fbc2 = colorTile->ui[y][x*2+2]; + fbc3 = colorTile->ui[y][x*2+3]; +#else + fbc0 = colorTile->ui[y+0][x+0]; + fbc1 = colorTile->ui[y+0][x+1]; + fbc2 = colorTile->ui[y+1][x+0]; + fbc3 = colorTile->ui[y+1][x+1]; +#endif + } + + + /* + * Do blending + */ if (spu.blend.blend_enable) { /* blending terms, misc regs */ vector float term1r, term1g, term1b, term1a; @@ -186,39 +218,26 @@ spu_fallback_fragment_ops(uint x, uint y, vector float fbRGBA[4]; /* current framebuffer colors */ - /* get colors from framebuffer/tile */ + /* convert framebuffer colors from packed int to vector float */ { - vector float fc[4]; - uint c0, c1, c2, c3; - -#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */ - c0 = colorTile->ui[y][x*2+0]; - c1 = colorTile->ui[y][x*2+1]; - c2 = colorTile->ui[y][x*2+2]; - c3 = colorTile->ui[y][x*2+3]; -#else - c0 = colorTile->ui[y+0][x+0]; - c1 = colorTile->ui[y+0][x+1]; - c2 = colorTile->ui[y+1][x+0]; - c3 = colorTile->ui[y+1][x+1]; -#endif + vector float temp[4]; /* float colors in AOS form */ switch (spu.fb.color_format) { case PIPE_FORMAT_B8G8R8A8_UNORM: - fc[0] = spu_unpack_B8G8R8A8(c0); - fc[1] = spu_unpack_B8G8R8A8(c1); - fc[2] = spu_unpack_B8G8R8A8(c2); - fc[3] = spu_unpack_B8G8R8A8(c3); + temp[0] = spu_unpack_B8G8R8A8(fbc0); + temp[1] = spu_unpack_B8G8R8A8(fbc1); + temp[2] = spu_unpack_B8G8R8A8(fbc2); + temp[3] = spu_unpack_B8G8R8A8(fbc3); break; case PIPE_FORMAT_A8R8G8B8_UNORM: - fc[0] = spu_unpack_A8R8G8B8(c0); - fc[1] = spu_unpack_A8R8G8B8(c1); - fc[2] = spu_unpack_A8R8G8B8(c2); - fc[3] = spu_unpack_A8R8G8B8(c3); + temp[0] = spu_unpack_A8R8G8B8(fbc0); + temp[1] = spu_unpack_A8R8G8B8(fbc1); + temp[2] = spu_unpack_A8R8G8B8(fbc2); + temp[3] = spu_unpack_A8R8G8B8(fbc3); break; default: ASSERT(0); } - _transpose_matrix4x4(fbRGBA, fc); + _transpose_matrix4x4(fbRGBA, temp); /* fbRGBA = transpose(temp) */ } /* @@ -384,21 +403,20 @@ spu_fallback_fragment_ops(uint x, uint y, #endif /* - * Pack float colors into 32-bit RGBA words. + * Pack fragment float colors into 32-bit RGBA words. */ switch (spu.fb.color_format) { case PIPE_FORMAT_A8R8G8B8_UNORM: - c0 = spu_pack_A8R8G8B8(frag_aos[0]); - c1 = spu_pack_A8R8G8B8(frag_aos[1]); - c2 = spu_pack_A8R8G8B8(frag_aos[2]); - c3 = spu_pack_A8R8G8B8(frag_aos[3]); + fragc0 = spu_pack_A8R8G8B8(frag_aos[0]); + fragc1 = spu_pack_A8R8G8B8(frag_aos[1]); + fragc2 = spu_pack_A8R8G8B8(frag_aos[2]); + fragc3 = spu_pack_A8R8G8B8(frag_aos[3]); break; - case PIPE_FORMAT_B8G8R8A8_UNORM: - c0 = spu_pack_B8G8R8A8(frag_aos[0]); - c1 = spu_pack_B8G8R8A8(frag_aos[1]); - c2 = spu_pack_B8G8R8A8(frag_aos[2]); - c3 = spu_pack_B8G8R8A8(frag_aos[3]); + fragc0 = spu_pack_B8G8R8A8(frag_aos[0]); + fragc1 = spu_pack_B8G8R8A8(frag_aos[1]); + fragc2 = spu_pack_B8G8R8A8(frag_aos[2]); + fragc3 = spu_pack_B8G8R8A8(frag_aos[3]); break; default: fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n"); @@ -407,20 +425,57 @@ spu_fallback_fragment_ops(uint x, uint y, /* - * Color masking + * Do color masking */ if (spu.blend.colormask != 0xf) { - /* XXX to do */ - /* apply color mask to 32-bit packed colors */ + uint cmask = 0x0; /* each byte corresponds to a color channel */ + + /* Form bitmask depending on color buffer format and colormask bits */ + switch (spu.fb.color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + if (spu.blend.colormask & (1<<0)) + cmask |= 0x00ff0000; /* red */ + if (spu.blend.colormask & (1<<1)) + cmask |= 0x0000ff00; /* green */ + if (spu.blend.colormask & (1<<2)) + cmask |= 0x000000ff; /* blue */ + if (spu.blend.colormask & (1<<3)) + cmask |= 0xff000000; /* alpha */ + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + if (spu.blend.colormask & (1<<0)) + cmask |= 0x0000ff00; /* red */ + if (spu.blend.colormask & (1<<1)) + cmask |= 0x00ff0000; /* green */ + if (spu.blend.colormask & (1<<2)) + cmask |= 0xff000000; /* blue */ + if (spu.blend.colormask & (1<<3)) + cmask |= 0x000000ff; /* alpha */ + break; + default: + ASSERT(0); + } + + /* + * Apply color mask to the 32-bit packed colors. + * if (cmask[i]) + * frag color[i] = frag color[i]; + * else + * frag color[i] = framebuffer color[i]; + */ + fragc0 = (fragc0 & cmask) | (fbc0 & ~cmask); + fragc1 = (fragc1 & cmask) | (fbc1 & ~cmask); + fragc2 = (fragc2 & cmask) | (fbc2 & ~cmask); + fragc3 = (fragc3 & cmask) | (fbc3 & ~cmask); } /* - * Logic Ops + * Do logic ops */ if (spu.blend.logicop_enable) { /* XXX to do */ - /* apply logicop to 32-bit packed colors */ + /* apply logicop to 32-bit packed colors (fragcx and fbcx) */ } @@ -431,45 +486,46 @@ spu_fallback_fragment_ops(uint x, uint y, spu.cur_ctile_status = TILE_STATUS_DIRTY; } else { + /* write no fragments */ return; } /* - * Write new quad colors to the framebuffer/tile. + * Write new fragment/quad colors to the framebuffer/tile. * Only write pixels where the corresponding mask word is set. */ #if LINEAR_QUAD_LAYOUT /* * Quad layout: * +--+--+--+--+ - * |p0|p1|p2|p3| + * |p0|p1|p2|p3|... * +--+--+--+--+ */ if (spu_extract(mask, 0)) - colorTile->ui[y][x*2] = c0; + colorTile->ui[y][x*2] = fragc0; if (spu_extract(mask, 1)) - colorTile->ui[y][x*2+1] = c1; + colorTile->ui[y][x*2+1] = fragc1; if (spu_extract(mask, 2)) - colorTile->ui[y][x*2+2] = c2; + colorTile->ui[y][x*2+2] = fragc2; if (spu_extract(mask, 3)) - colorTile->ui[y][x*2+3] = c3; + colorTile->ui[y][x*2+3] = fragc3; #else /* * Quad layout: * +--+--+ - * |p0|p1| + * |p0|p1|... * +--+--+ - * |p2|p3| + * |p2|p3|... * +--+--+ */ if (spu_extract(mask, 0)) - colorTile->ui[y+0][x+0] = c0; + colorTile->ui[y+0][x+0] = fragc0; if (spu_extract(mask, 1)) - colorTile->ui[y+0][x+1] = c1; + colorTile->ui[y+0][x+1] = fragc1; if (spu_extract(mask, 2)) - colorTile->ui[y+1][x+0] = c2; + colorTile->ui[y+1][x+0] = fragc2; if (spu_extract(mask, 3)) - colorTile->ui[y+1][x+1] = c3; + colorTile->ui[y+1][x+1] = fragc3; #endif } diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index 8b93878192..0a8fb56a62 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -241,6 +241,19 @@ eval_coeff(uint slot, float x, float y, vector float result[4]) } +/** + * As above, but return 4 vectors in SOA format. + * XXX this will all be re-written someday. + */ +static INLINE void +eval_coeff_soa(uint slot, float x, float y, vector float result[4]) +{ + eval_coeff(slot, x, y, result); + _transpose_matrix4x4(result, result); +} + + + static INLINE vector float eval_z(float x, float y) { @@ -267,14 +280,17 @@ emit_quad( int x, int y, mask_t mask ) if (spu_extract(spu_orx(mask), 0)) { const int ix = x - setup.cliprect_minx; const int iy = y - setup.cliprect_miny; - vector float colors[4]; spu.cur_ctile_status = TILE_STATUS_DIRTY; spu.cur_ztile_status = TILE_STATUS_DIRTY; if (spu.texture[0].start) { - /* texture mapping */ + /* + * Temporary texture mapping path + * This will go away when fragment programs support TEX inst. + */ const uint unit = 0; + vector float colors[4]; vector float texcoords[4]; eval_coeff(2, (float) x, (float) y, texcoords); @@ -311,70 +327,60 @@ emit_quad( int x, int y, mask_t mask ) colors[3] = spu_mul(colors[3], colors1[3]); } + { + /* Convert fragment data from AoS to SoA format. + * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA) + * This is temporary! + */ + vector float soa_frag[4]; + _transpose_matrix4x4(soa_frag, colors); + + vector float fragZ = eval_z((float) x, (float) y); + + /* Do all per-fragment/quad operations here, including: + * alpha test, z test, stencil test, blend and framebuffer writing. + */ + spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile, + fragZ, + soa_frag[0], soa_frag[1], + soa_frag[2], soa_frag[3], + mask); + } + } else { - /* simple shading */ -#if 0 - eval_coeff(1, (float) x, (float) y, colors); + /* + * Run fragment shader, execute per-fragment ops, update fb/tile. + */ + vector float inputs[4*4], outputs[2*4]; + vector float fragZ = eval_z((float) x, (float) y); + /* setup inputs */ +#if 0 + eval_coeff_soa(1, (float) x, (float) y, inputs); #else - /* XXX new fragment program code */ - - if (spu.fragment_program) { - vector float inputs[4*4], outputs[2*4]; - - /* setup inputs */ - eval_coeff(1, (float) x, (float) y, inputs); - - /* Execute the current fragment program */ - spu.fragment_program(inputs, outputs, spu.constants); - - /* Copy outputs */ - colors[0] = outputs[0*4+0]; - colors[1] = outputs[0*4+1]; - colors[2] = outputs[0*4+2]; - colors[3] = outputs[0*4+3]; - - if (0 && spu.init.id==0 && y == 48) { - printf("colors[0] = %f %f %f %f\n", - spu_extract(colors[0], 0), - spu_extract(colors[0], 1), - spu_extract(colors[0], 2), - spu_extract(colors[0], 3)); - printf("colors[1] = %f %f %f %f\n", - spu_extract(colors[1], 0), - spu_extract(colors[1], 1), - spu_extract(colors[1], 2), - spu_extract(colors[1], 3)); - } - + uint i; + for (i = 0; i < spu.vertex_info.num_attribs; i++) { + eval_coeff_soa(i+1, (float) x, (float) y, inputs + i * 4); } #endif - } + ASSERT(spu.fragment_program); + ASSERT(spu.fragment_ops); + /* Execute the current fragment program */ + spu.fragment_program(inputs, outputs, spu.constants); - { - /* Convert fragment data from AoS to SoA format. - * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA) - * This is temporary! - */ - vector float soa_frag[4]; - _transpose_matrix4x4(soa_frag, colors); - - float4 fragZ; - - fragZ.v = eval_z((float) x, (float) y); - - /* Do all per-fragment/quad operations here, including: - * alpha test, z test, stencil test, blend and framebuffer writing. + /* Execute per-fragment/quad operations, including: + * alpha test, z test, stencil test, blend and framebuffer writing. */ spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile, - fragZ.v, - soa_frag[0], soa_frag[1], - soa_frag[2], soa_frag[3], + fragZ, + outputs[0*4+0], + outputs[0*4+1], + outputs[0*4+2], + outputs[0*4+3], mask); } - } } diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c index 701ee4c72f..4fea71c314 100644 --- a/src/gallium/drivers/softpipe/sp_fs_exec.c +++ b/src/gallium/drivers/softpipe/sp_fs_exec.c @@ -39,11 +39,20 @@ #include "tgsi/tgsi_exec.h" #include "tgsi/tgsi_parse.h" -struct sp_exec_fragment_shader { +struct sp_exec_fragment_shader +{ struct sp_fragment_shader base; + const struct tgsi_token *machine_tokens; }; +/** cast wrapper */ +static INLINE struct sp_exec_fragment_shader * +sp_exec_fragment_shader(const struct sp_fragment_shader *base) +{ + return (struct sp_exec_fragment_shader *) base; +} + /** * Compute quad X,Y,Z,W for the four fragments in a quad. @@ -86,10 +95,20 @@ exec_prepare( const struct sp_fragment_shader *base, struct tgsi_exec_machine *machine, struct tgsi_sampler *samplers ) { - tgsi_exec_machine_bind_shader( machine, - base->shader.tokens, - PIPE_MAX_SAMPLERS, - samplers ); + struct sp_exec_fragment_shader *spefs = + sp_exec_fragment_shader(base); + + /* + * Bind tokens/shader to the interpreter's machine state. + * Avoid redundant binding. + */ + if (spefs->machine_tokens != base->shader.tokens) { + tgsi_exec_machine_bind_shader( machine, + base->shader.tokens, + PIPE_MAX_SAMPLERS, + samplers ); + spefs->machine_tokens = base->shader.tokens; + } } diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h index 4d64c74a4a..1e702c7fa7 100644 --- a/src/gallium/include/pipe/p_compiler.h +++ b/src/gallium/include/pipe/p_compiler.h @@ -144,10 +144,12 @@ typedef unsigned char boolean; #define ALIGN16_DECL(TYPE, NAME, SIZE) TYPE NAME##___aligned[SIZE] __attribute__(( aligned( 16 ) )) #define ALIGN16_ASSIGN(NAME) NAME##___aligned #define ALIGN16_ATTRIB __attribute__(( aligned( 16 ) )) +#define ALIGN32_ATTRIB __attribute__(( aligned( 32 ) )) #else #define ALIGN16_DECL(TYPE, NAME, SIZE) TYPE NAME##___unaligned[SIZE + 1] #define ALIGN16_ASSIGN(NAME) align16(NAME##___unaligned) #define ALIGN16_ATTRIB +#define ALIGN32_ATTRIB #endif diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h index b15affef7a..3bedc75294 100644 --- a/src/gallium/include/pipe/p_screen.h +++ b/src/gallium/include/pipe/p_screen.h @@ -26,6 +26,8 @@ **************************************************************************/ /** + * @file + * * Screen, Adapter or GPU * * These are driver functions/facilities that are context independent. diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h index a562e3a6b1..78c20de3e2 100644 --- a/src/gallium/include/pipe/p_shader_tokens.h +++ b/src/gallium/include/pipe/p_shader_tokens.h @@ -1,4 +1,31 @@ -#if !defined TGSI_TOKEN_H +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TGSI_TOKEN_H #define TGSI_TOKEN_H #ifdef __cplusplus @@ -396,7 +423,7 @@ struct tgsi_immediate_float32 #define TGSI_SAT_ZERO_ONE 1 /* clamp to [0,1] */ #define TGSI_SAT_MINUS_PLUS_ONE 2 /* clamp to [-1,1] */ -/* +/** * Opcode is the operation code to execute. A given operation defines the * semantics how the source registers (if any) are interpreted and what is * written to the destination registers (if any) as a result of execution. @@ -481,7 +508,7 @@ struct tgsi_instruction_ext #define TGSI_SWIZZLE_Z 2 #define TGSI_SWIZZLE_W 3 -/* +/** * Precision controls the precision at which the operation should be executed. * * CondDstUpdate enables condition code register writes. When this field is @@ -548,7 +575,7 @@ struct tgsi_instruction_ext_predicate unsigned Extended : 1; /* BOOL */ }; -/* +/** * File specifies the register array to access. * * Index specifies the element number of a register in the register file. @@ -580,7 +607,7 @@ struct tgsi_src_register unsigned Extended : 1; /* BOOL */ }; -/* +/** * If tgsi_src_register::Extended is TRUE, tgsi_src_register_ext follows. * * Then, if tgsi_src_register::Indirect is TRUE, another tgsi_src_register @@ -599,7 +626,7 @@ struct tgsi_src_register_ext unsigned Extended : 1; /* BOOL */ }; -/* +/** * If tgsi_src_register_ext::Type is TGSI_SRC_REGISTER_EXT_TYPE_SWZ, * it should be cast to tgsi_src_register_ext_swz. * @@ -617,7 +644,7 @@ struct tgsi_src_register_ext #define TGSI_EXTSWIZZLE_ZERO 4 #define TGSI_EXTSWIZZLE_ONE 5 -/* +/** * ExtSwizzleX, ExtSwizzleY, ExtSwizzleZ and ExtSwizzleW swizzle the source * register in an extended manner. * diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h index da783389da..342f17260a 100644 --- a/src/gallium/include/pipe/p_state.h +++ b/src/gallium/include/pipe/p_state.h @@ -27,6 +27,8 @@ /** + * @file + * * Abstract graphics pipe state objects. * * Basic notes: diff --git a/src/gallium/include/pipe/p_thread.h b/src/gallium/include/pipe/p_thread.h index e01d5a602b..8af3cd958b 100644 --- a/src/gallium/include/pipe/p_thread.h +++ b/src/gallium/include/pipe/p_thread.h @@ -25,6 +25,8 @@ /** + * @file + * * Thread, mutex, condition var and thread-specific data functions. */ diff --git a/src/gallium/winsys/drm/intel/dri/intel_screen.c b/src/gallium/winsys/drm/intel/dri/intel_screen.c index 3a486481f5..78b9a6db05 100644 --- a/src/gallium/winsys/drm/intel/dri/intel_screen.c +++ b/src/gallium/winsys/drm/intel/dri/intel_screen.c @@ -113,7 +113,118 @@ static PFNGLXCREATECONTEXTMODES create_context_modes = NULL; extern const struct dri_extension card_extensions[]; +static GLboolean +intel_get_param(__DRIscreenPrivate *psp, int param, int *value) +{ + int ret; + struct drm_i915_getparam gp; + + gp.param = param; + gp.value = value; + + ret = drmCommandWriteRead(psp->fd, DRM_I915_GETPARAM, &gp, sizeof(gp)); + if (ret) { + fprintf(stderr, "drm_i915_getparam: %d\n", ret); + return GL_FALSE; + } + + return GL_TRUE; +} + +static void +intelSetTexOffset(__DRIcontext *pDRICtx, int texname, + unsigned long long offset, int depth, uint pitch) +{ + abort(); +#if 0 + struct intel_context *intel = (struct intel_context*) + ((__DRIcontextPrivate*)pDRICtx->private)->driverPrivate; + struct gl_texture_object *tObj = _mesa_lookup_texture(&intel->ctx, texname); + struct st_texture_object *stObj = st_texture_object(tObj); + + if (!stObj) + return; + + if (stObj->pt) + st->pipe->texture_release(intel->st->pipe, &stObj->pt); + + stObj->imageOverride = GL_TRUE; + stObj->depthOverride = depth; + stObj->pitchOverride = pitch; + + if (offset) + stObj->textureOffset = offset; +#endif +} + + +#if 0 +static void +intelHandleDrawableConfig(__DRIdrawablePrivate *dPriv, + __DRIcontextPrivate *pcp, + __DRIDrawableConfigEvent *event) +{ + (void) dPriv; + (void) pcp; + (void) event; +} +#endif + +#if 0 +static void +intelHandleBufferAttach(__DRIdrawablePrivate *dPriv, + __DRIcontextPrivate *pcp, + __DRIBufferAttachEvent *ba) +{ + struct intel_screen *intelScreen = intel_screen(dPriv->driScreenPriv); + + switch (ba->buffer.attachment) { + case DRI_DRAWABLE_BUFFER_FRONT_LEFT: + intelScreen->front.width = dPriv->w; + intelScreen->front.height = dPriv->h; + intelScreen->front.cpp = ba->buffer.cpp; + intelScreen->front.pitch = ba->buffer.pitch; + driGenBuffers(intelScreen->base.staticPool, "front", 1, &intelScreen->front.buffer, 0, 0, 0); + driBOSetReferenced(intelScreen->front.buffer, ba->buffer.handle); + break; + + case DRI_DRAWABLE_BUFFER_BACK_LEFT: + case DRI_DRAWABLE_BUFFER_DEPTH: + case DRI_DRAWABLE_BUFFER_STENCIL: + case DRI_DRAWABLE_BUFFER_ACCUM: + /* anything ?? */ + break; + + default: + fprintf(stderr, "unhandled buffer attach event, attachment type %d\n", + ba->buffer.attachment); + return; + } +} +#endif + +static const __DRItexOffsetExtension intelTexOffsetExtension = { + { __DRI_TEX_OFFSET }, + intelSetTexOffset, +}; + +#if 0 +static const __DRItexBufferExtension intelTexBufferExtension = { + { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION }, + intelSetTexBuffer, +}; +#endif +static const __DRIextension *intelScreenExtensions[] = { + &driReadDrawableExtension, + &driCopySubBufferExtension.base, + &driSwapControlExtension.base, + &driFrameTrackingExtension.base, + &driMediaStreamCounterExtension.base, + &intelTexOffsetExtension.base, +// &intelTexBufferExtension.base, + NULL +}; static void @@ -232,7 +343,8 @@ intelCreatePools(__DRIscreenPrivate * sPriv) intelScreen->havePools = GL_TRUE; - intelUpdateScreenRotation(sPriv, intelScreen->sarea); + if (intelScreen->sarea) + intelUpdateScreenRotation(sPriv, intelScreen->sarea); return GL_TRUE; } @@ -265,11 +377,6 @@ intelInitDriver(__DRIscreenPrivate * sPriv) struct intel_screen *intelScreen; I830DRIPtr gDRIPriv = (I830DRIPtr) sPriv->pDevPriv; - PFNGLXSCRENABLEEXTENSIONPROC glx_enable_extension = - (PFNGLXSCRENABLEEXTENSIONPROC) (*dri_interface-> - getProcAddress("glxEnableExtension")); - void *const psc = sPriv->psc->screenConfigs; - if (sPriv->devPrivSize != sizeof(I830DRIRec)) { fprintf(stderr, "\nERROR! sizeof(I830DRIRec) does not match passed size from device driver\n"); @@ -286,28 +393,19 @@ intelInitDriver(__DRIscreenPrivate * sPriv) __driConfigOptions, __driNConfigOptions); sPriv->private = (void *) intelScreen; - intelScreen->sarea = (drmI830Sarea *) (((GLubyte *) sPriv->pSAREA) + - gDRIPriv->sarea_priv_offset); - intelScreen->deviceID = gDRIPriv->deviceID; - intelScreen->front.cpp = gDRIPriv->cpp; - intelScreen->drmMinor = sPriv->drmMinor; + gDRIPriv->sarea_priv_offset); - assert(gDRIPriv->bitsPerPixel == 16 || - gDRIPriv->bitsPerPixel == 32); + intelScreen->deviceID = gDRIPriv->deviceID; + intelScreen->front.cpp = gDRIPriv->cpp; + intelScreen->drmMinor = sPriv->drm_version.minor; intelUpdateScreenRotation(sPriv, intelScreen->sarea); if (0) intelPrintDRIInfo(intelScreen, sPriv, gDRIPriv); - if (glx_enable_extension != NULL) { - (*glx_enable_extension) (psc, "GLX_SGI_swap_control"); - (*glx_enable_extension) (psc, "GLX_SGI_video_sync"); - (*glx_enable_extension) (psc, "GLX_MESA_swap_control"); - (*glx_enable_extension) (psc, "GLX_MESA_swap_frame_usage"); - (*glx_enable_extension) (psc, "GLX_SGI_make_current_read"); - } + sPriv->extensions = intelScreenExtensions; intelScreen->base.base.flush_frontbuffer = intel_flush_frontbuffer; intelScreen->base.base.get_name = intel_get_name; @@ -406,65 +504,19 @@ intelGetSwapInfo(__DRIdrawablePrivate * dPriv, __DRIswapInfo * sInfo) return 0; } - -static void -intelSetTexOffset(__DRIcontext *pDRICtx, int texname, - unsigned long long offset, int depth, uint pitch) -{ - abort(); -#if 0 - struct intel_context *intel = (struct intel_context*) - ((__DRIcontextPrivate*)pDRICtx->private)->driverPrivate; - struct gl_texture_object *tObj = _mesa_lookup_texture(&intel->ctx, texname); - struct st_texture_object *stObj = st_texture_object(tObj); - - if (!stObj) - return; - - if (stObj->pt) - st->pipe->texture_release(intel->st->pipe, &stObj->pt); - - stObj->imageOverride = GL_TRUE; - stObj->depthOverride = depth; - stObj->pitchOverride = pitch; - - if (offset) - stObj->textureOffset = offset; -#endif -} - - -static const struct __DriverAPIRec intelAPI = { - .InitDriver = intelInitDriver, - .DestroyScreen = intelDestroyScreen, - .CreateContext = intelCreateContext, - .DestroyContext = intelDestroyContext, - .CreateBuffer = intelCreateBuffer, - .DestroyBuffer = intelDestroyBuffer, - .SwapBuffers = intelSwapBuffers, - .MakeCurrent = intelMakeCurrent, - .UnbindContext = intelUnbindContext, - .GetSwapInfo = intelGetSwapInfo, - .GetMSC = driGetMSC32, - .WaitForMSC = driWaitForMSC32, - .WaitForSBC = NULL, - .SwapBuffersMSC = NULL, - .CopySubBuffer = intelCopySubBuffer, - .setTexOffset = intelSetTexOffset, -}; - - -static __GLcontextModes * -intelFillInModes(unsigned pixel_bits, unsigned depth_bits, - unsigned stencil_bits, boolean have_back_buffer) +static __DRIconfig ** +intelFillInModes(__DRIscreenPrivate *psp, + unsigned pixel_bits, unsigned depth_bits, + unsigned stencil_bits, GLboolean have_back_buffer) { - __GLcontextModes *modes; + __DRIconfig **configs; __GLcontextModes *m; unsigned num_modes; unsigned depth_buffer_factor; unsigned back_buffer_factor; GLenum fb_format; GLenum fb_type; + int i; /* GLX_SWAP_COPY_OML is only supported because the Intel driver doesn't * support pageflipping at all. @@ -508,100 +560,144 @@ intelFillInModes(unsigned pixel_bits, unsigned depth_bits, fb_type = GL_UNSIGNED_INT_8_8_8_8_REV; } - modes = - (*dri_interface->createContextModes) (num_modes, - sizeof(__GLcontextModes)); - m = modes; - if (!driFillInModes(&m, fb_format, fb_type, - depth_bits_array, stencil_bits_array, - depth_buffer_factor, back_buffer_modes, - back_buffer_factor, msaa_samples_array, 1, GLX_TRUE_COLOR)) { - fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__, - __LINE__); - return NULL; - } - if (!driFillInModes(&m, fb_format, fb_type, - depth_bits_array, stencil_bits_array, - depth_buffer_factor, back_buffer_modes, - back_buffer_factor, msaa_samples_array, 1, GLX_DIRECT_COLOR)) { - fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__, + configs = driCreateConfigs(fb_format, fb_type, + depth_bits_array, stencil_bits_array, + depth_buffer_factor, back_buffer_modes, + back_buffer_factor, msaa_samples_array, 1); + if (configs == NULL) { + fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__, __LINE__); return NULL; } /* Mark the visual as slow if there are "fake" stencil bits. */ - for (m = modes; m != NULL; m = m->next) { + for (i = 0; configs[i]; i++) { + m = &configs[i]->modes; if ((m->stencilBits != 0) && (m->stencilBits != stencil_bits)) { m->visualRating = GLX_SLOW_CONFIG; } } - return modes; + return configs; } - /** - * This is the bootstrap function for the driver. libGL supplies all of the - * requisite information about the system, and the driver initializes itself. - * This routine also fills in the linked list pointed to by \c driver_modes - * with the \c __GLcontextModes that the driver can support for windows or - * pbuffers. + * This is the driver specific part of the createNewScreen entry point. + * + * \todo maybe fold this into intelInitDriver * - * \return A pointer to a \c __DRIscreenPrivate on success, or \c NULL on - * failure. + * \return the __GLcontextModes supported by this driver */ -PUBLIC void * -__driCreateNewScreen_20050727(__DRInativeDisplay * dpy, int scrn, - __DRIscreen * psc, - const __GLcontextModes * modes, - const __DRIversion * ddx_version, - const __DRIversion * dri_version, - const __DRIversion * drm_version, - const __DRIframebuffer * frame_buffer, - drmAddress pSAREA, int fd, - int internal_api_version, - const __DRIinterfaceMethods * interface, - __GLcontextModes ** driver_modes) +static const __DRIconfig **intelInitScreen(__DRIscreenPrivate *psp) { - __DRIscreenPrivate *psp; - static const __DRIversion ddx_expected = { 1, 7, 0 }; +#ifdef I915 + static const __DRIversion ddx_expected = { 1, 5, 0 }; +#else + static const __DRIversion ddx_expected = { 1, 6, 0 }; +#endif static const __DRIversion dri_expected = { 4, 0, 0 }; - static const __DRIversion drm_expected = { 1, 7, 0 }; - - dri_interface = interface; + static const __DRIversion drm_expected = { 1, 5, 0 }; + I830DRIPtr dri_priv = (I830DRIPtr) psp->pDevPriv; if (!driCheckDriDdxDrmVersions2("i915", - dri_version, &dri_expected, - ddx_version, &ddx_expected, - drm_version, &drm_expected)) { + &psp->dri_version, &dri_expected, + &psp->ddx_version, &ddx_expected, + &psp->drm_version, &drm_expected)) { return NULL; } - psp = __driUtilCreateNewScreen(dpy, scrn, psc, NULL, - ddx_version, dri_version, drm_version, - frame_buffer, pSAREA, fd, - internal_api_version, &intelAPI); - - if (psp != NULL) { - I830DRIPtr dri_priv = (I830DRIPtr) psp->pDevPriv; - *driver_modes = intelFillInModes(dri_priv->cpp * 8, - (dri_priv->cpp == 2) ? 16 : 24, - (dri_priv->cpp == 2) ? 0 : 8, 1); - - /* Calling driInitExtensions here, with a NULL context pointer, - * does not actually enable the extensions. It just makes sure - * that all the dispatch offsets for all the extensions that - * *might* be enables are known. This is needed because the - * dispatch offsets need to be known when _mesa_context_create - * is called, but we can't enable the extensions until we have a - * context pointer. - * - * Hello chicken. Hello egg. How are you two today? - */ - driInitExtensions(NULL, card_extensions, GL_FALSE); + /* Calling driInitExtensions here, with a NULL context pointer, + * does not actually enable the extensions. It just makes sure + * that all the dispatch offsets for all the extensions that + * *might* be enables are known. This is needed because the + * dispatch offsets need to be known when _mesa_context_create is + * called, but we can't enable the extensions until we have a + * context pointer. + * + * Hello chicken. Hello egg. How are you two today? + */ + driInitExtensions( NULL, card_extensions, GL_FALSE ); + //intelInitExtensions(NULL, GL_TRUE); + + if (!intelInitDriver(psp)) + return NULL; + + psp->extensions = intelScreenExtensions; + + return (const __DRIconfig **) + intelFillInModes(psp, dri_priv->cpp * 8, + (dri_priv->cpp == 2) ? 16 : 24, + (dri_priv->cpp == 2) ? 0 : 8, 1); +} + +/** + * This is the driver specific part of the createNewScreen entry point. + * + * \return the __GLcontextModes supported by this driver + */ +static const +__DRIconfig **intelInitScreen2(__DRIscreenPrivate *psp) +{ + struct intel_screen *intelScreen; + + /* Calling driInitExtensions here, with a NULL context pointer, + * does not actually enable the extensions. It just makes sure + * that all the dispatch offsets for all the extensions that + * *might* be enables are known. This is needed because the + * dispatch offsets need to be known when _mesa_context_create is + * called, but we can't enable the extensions until we have a + * context pointer. + * + * Hello chicken. Hello egg. How are you two today? + */ + //intelInitExtensions(NULL, GL_TRUE); + + /* Allocate the private area */ + intelScreen = CALLOC_STRUCT(intel_screen); + if (!intelScreen) { + fprintf(stderr, "\nERROR! Allocating private area failed\n"); + return GL_FALSE; } + /* parse information in __driConfigOptions */ + driParseOptionInfo(&intelScreen->optionCache, + __driConfigOptions, __driNConfigOptions); + + psp->private = (void *) intelScreen; + + intelScreen->drmMinor = psp->drm_version.minor; - return (void *) psp; + /* Determine chipset ID? */ + if (!intel_get_param(psp, I915_PARAM_CHIPSET_ID, + &intelScreen->deviceID)) + return GL_FALSE; + + psp->extensions = intelScreenExtensions; + + intel_be_init_device(&intelScreen->base, psp->fd, intelScreen->deviceID); + intelScreen->base.base.flush_frontbuffer = intel_flush_frontbuffer; + intelScreen->base.base.get_name = intel_get_name; + + return driConcatConfigs(intelFillInModes(psp, 16, 16, 0, 1), + intelFillInModes(psp, 32, 24, 8, 1)); } +const struct __DriverAPIRec driDriverAPI = { + .InitScreen = intelInitScreen, + .DestroyScreen = intelDestroyScreen, + .CreateContext = intelCreateContext, + .DestroyContext = intelDestroyContext, + .CreateBuffer = intelCreateBuffer, + .DestroyBuffer = intelDestroyBuffer, + .SwapBuffers = intelSwapBuffers, + .MakeCurrent = intelMakeCurrent, + .UnbindContext = intelUnbindContext, + .GetSwapInfo = intelGetSwapInfo, + .GetDrawableMSC = driDrawableGetMSC32, + .WaitForMSC = driWaitForMSC32, + .CopySubBuffer = intelCopySubBuffer, + + //.InitScreen2 = intelInitScreen2, + //.HandleDrawableConfig = intelHandleDrawableConfig, + //.HandleBufferAttach = intelHandleBufferAttach, +}; diff --git a/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c b/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c index 8a18bfd9a4..34ad7eebe1 100644 --- a/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c +++ b/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c @@ -94,7 +94,6 @@ intelDisplaySurface(__DRIdrawablePrivate *dPriv, int i; ASSERT(surf->buffer); - ASSERT(surf->cpp == cpp); DBG(SWAP, "screen pitch %d src surface pitch %d\n", pitch, surf->stride); diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c index 3334af175b..acb5ad8f71 100644 --- a/src/gallium/winsys/xlib/xm_winsys.c +++ b/src/gallium/winsys/xlib/xm_winsys.c @@ -352,7 +352,7 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf) /* offset in pixels */ offset *= TILE_SIZE * TILE_SIZE; - if (XSHM_ENABLED(xm_buf)) { + if (0 && XSHM_ENABLED(xm_buf)) { ximage->data = (char *) xm_buf->data + 4 * offset; /* make copy of tile data */ memcpy(tmpTile, (uint *) ximage->data, sizeof(tmpTile)); @@ -365,7 +365,7 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf) #endif } else { - /* twiddel from ximage buffer to temp tile */ + /* twiddle from ximage buffer to temp tile */ twiddle_tile((uint *) xm_buf->data + offset, tmpTile); /* display temp tile data */ ximage->data = (char *) tmpTile; |