author | Ben Skeggs <skeggsb@gmail.com> | 2008-11-10 15:53:51 +1100 |
---|---|---|
committer | Ben Skeggs <skeggsb@gmail.com> | 2008-11-10 15:53:51 +1100 |
commit | 32e6be6362e44609d36c2fb20a4c858f57c908fb (patch) | |
tree | 4ed99e93ef5f4a8bb51653917c911e04e42f5235 /src/gallium | |
parent | 92674bc8889e10e580c630cf85c106fa6eb34d7b (diff) | |
parent | 399da3a337932c6074a69ac73e711138271308eb (diff) | |
Merge remote branch 'origin/gallium-0.2' into gallium-0.2
Diffstat (limited to 'src/gallium')
48 files changed, 1806 insertions(+), 584 deletions(-)
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c index 87ec6ae20c..3c175f31d8 100644 --- a/src/gallium/auxiliary/draw/draw_pt.c +++ b/src/gallium/auxiliary/draw/draw_pt.c @@ -33,6 +33,8 @@ #include "draw/draw_context.h" #include "draw/draw_private.h" #include "draw/draw_pt.h" +#include "draw/draw_vs.h" +#include "tgsi/tgsi_dump.h" static unsigned trim( unsigned count, unsigned first, unsigned incr ) { @@ -176,6 +178,92 @@ void draw_pt_destroy( struct draw_context *draw ) } +/** + * Debug- print the first 'count' vertices. + */ +static void +draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count) +{ + uint i; + + debug_printf("Draw arrays(prim = %u, start = %u, count = %u)\n", + prim, start, count); + + for (i = 0; i < count; i++) { + uint ii, j; + + if (draw->pt.user.elts) { + /* indexed arrays */ + switch (draw->pt.user.eltSize) { + case 1: + { + const ubyte *elem = (const ubyte *) draw->pt.user.elts; + ii = elem[start + i]; + } + break; + case 2: + { + const ushort *elem = (const ushort *) draw->pt.user.elts; + ii = elem[start + i]; + } + break; + case 4: + { + const uint *elem = (const uint *) draw->pt.user.elts; + ii = elem[start + i]; + } + break; + default: + assert(0); + } + debug_printf("Element[%u + %u] -> Vertex %u:\n", start, i, ii); + } + else { + /* non-indexed arrays */ + ii = start + i; + debug_printf("Vertex %u:\n", ii); + } + + for (j = 0; j < draw->pt.nr_vertex_elements; j++) { + uint buf = draw->pt.vertex_element[j].vertex_buffer_index; + ubyte *ptr = (ubyte *) draw->pt.user.vbuffer[buf]; + ptr += draw->pt.vertex_buffer[buf].pitch * ii; + ptr += draw->pt.vertex_element[j].src_offset; + + debug_printf(" Attr %u: ", j); + switch (draw->pt.vertex_element[j].src_format) { + case PIPE_FORMAT_R32_FLOAT: + { + float *v = (float *) ptr; + debug_printf("%f @ %p\n", v[0], (void *) v); + } + break; + case PIPE_FORMAT_R32G32_FLOAT: + { + float *v = (float *) ptr; + debug_printf("%f %f @ %p\n", v[0], v[1], (void *) v); + } + break; + case PIPE_FORMAT_R32G32B32_FLOAT: + { + float *v = (float *) ptr; + debug_printf("%f %f %f @ %p\n", v[0], v[1], v[2], (void *) v); + } + break; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + { + float *v = (float *) ptr; + debug_printf("%f %f %f %f @ %p\n", v[0], v[1], v[2], v[3], + (void *) v); + } + break; + default: + debug_printf("other format (fix me)\n"); + ; + } + } + } +} /** @@ -195,6 +283,31 @@ draw_arrays(struct draw_context *draw, unsigned prim, draw->reduced_prim = reduced_prim; } + if (0) + draw_print_arrays(draw, prim, start, MIN2(count, 20)); + +#if 0 + { + int i; + debug_printf("draw_arrays(prim=%u start=%u count=%u):\n", + prim, start, count); + tgsi_dump(draw->vs.vertex_shader->state.tokens, 0); + debug_printf("Elements:\n"); + for (i = 0; i < draw->pt.nr_vertex_elements; i++) { + debug_printf(" format=%s comps=%u\n", + pf_name(draw->pt.vertex_element[i].src_format), + draw->pt.vertex_element[i].nr_components); + } + debug_printf("Buffers:\n"); + for (i = 0; i < draw->pt.nr_vertex_buffers; i++) { + debug_printf(" pitch=%u offset=%u ptr=%p\n", + draw->pt.vertex_buffer[i].pitch, + draw->pt.vertex_buffer[i].buffer_offset, + draw->pt.user.vbuffer[i]); + } + } +#endif + /* drawing done here: */ draw_pt_arrays(draw, prim, start, count); } diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 87232865e2..6141ba9cbf 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -1632,6 +1632,17 @@ 
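The draw_print_arrays() debug helper added to draw_pt.c above locates each attribute as buffer base + pitch * vertex index + element src_offset. A minimal standalone sketch of that addressing rule, using hypothetical stand-in structs rather than the real pipe_vertex_element/pipe_vertex_buffer types:

```c
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for pipe_vertex_element / pipe_vertex_buffer. */
struct velem { unsigned src_offset; unsigned vertex_buffer_index; };
struct vbuf  { unsigned pitch; const uint8_t *data; };

/* Address of attribute 'e' for vertex 'ii':
 * buffer base + pitch * vertex index + element offset. */
static const void *
attr_ptr(const struct velem *e, const struct vbuf *buffers, unsigned ii)
{
   const struct vbuf *vb = &buffers[e->vertex_buffer_index];
   return vb->data + (size_t) vb->pitch * ii + e->src_offset;
}

int main(void)
{
   float verts[2][3] = { { 0, 1, 2 }, { 3, 4, 5 } };  /* two xyz positions */
   struct vbuf  vb = { sizeof(verts[0]), (const uint8_t *) verts };
   struct velem ve = { 0, 0 };
   const float *p = attr_ptr(&ve, &vb, 1);
   printf("%f %f %f\n", p[0], p[1], p[2]);            /* 3.0 4.0 5.0 */
   return 0;
}
```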
static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_inst return TRUE; } +static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg tmp0 = aos_get_xmm_reg(cp); + + sse2_cvttps2dq(cp->func, tmp0, arg0); + sse2_cvtdq2ps(cp->func, tmp0, tmp0); + + store_dest(cp, &op->FullDstRegisters[0], tmp0); + return TRUE; +} static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { @@ -1770,6 +1781,9 @@ emit_instruction( struct aos_compilation *cp, case TGSI_OPCODE_SIN: return emit_SIN(cp, inst); + case TGSI_OPCODE_TRUNC: + return emit_TRUNC(cp, inst); + case TGSI_OPCODE_END: return TRUE; @@ -2176,7 +2190,8 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, if (!vaos->buffer) goto fail; - debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers); + if (0) + debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers); #if 0 tgsi_dump(vs->state.tokens, 0); diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c index 13d4fcfdbf..80c3606657 100644 --- a/src/gallium/auxiliary/draw/draw_vs_exec.c +++ b/src/gallium/auxiliary/draw/draw_vs_exec.c @@ -123,6 +123,12 @@ vs_exec_run_linear( struct draw_vertex_shader *shader, input = (const float (*)[4])((const char *)input + input_stride); } + tgsi_set_exec_mask(machine, + 1, + max_vertices > 1, + max_vertices > 2, + max_vertices > 3); + /* run interpreter */ tgsi_exec_machine_run( machine ); diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c index 8eff6d4fda..8b75136144 100644 --- a/src/gallium/auxiliary/draw/draw_vs_ppc.c +++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c @@ -54,31 +54,16 @@ typedef void (PIPE_CDECL *codegen_function) (float (*inputs)[4][4], float (*outputs)[4][4], float (*temps)[4][4], - float (*immeds)[4][4], + float (*immeds)[4], float (*consts)[4], const float *builtins); -#if 0 - const struct tgsi_exec_vector *input, - struct tgsi_exec_vector *output, - float (*constant)[4], /* 3 */ - struct tgsi_exec_vector *temporary, /* 4 */ - float (*immediates)[4], /* 5 */ - const float (*aos_input)[4], /* 6 */ - uint num_inputs, /* 7 */ - uint input_stride, /* 8 */ - float (*aos_output)[4], /* 9 */ - uint num_outputs, /* 10 */ - uint output_stride ); /* 11 */ -#endif struct draw_ppc_vertex_shader { struct draw_vertex_shader base; struct ppc_function ppc_program; codegen_function func; - - struct tgsi_exec_machine *machine; }; @@ -86,11 +71,12 @@ static void vs_ppc_prepare( struct draw_vertex_shader *base, struct draw_context *draw ) { + /* nothing */ } - -/* Simplified vertex shader interface for the pt paths. Given the +/** + * Simplified vertex shader interface for the pt paths. Given the * complexity of code-generating all the above operations together, * it's time to try doing all the other stuff separately. 
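The emit_TRUNC() helper added to draw_vs_aos.c above implements TGSI_OPCODE_TRUNC with an SSE2 float-to-int-to-float round trip (cvttps2dq truncates toward zero, cvtdq2ps converts back). A scalar sketch of the same idea, ignoring values outside the int32 range:

```c
#include <stdio.h>

/* Truncation toward zero via a float -> int32 -> float round trip,
 * which is what cvttps2dq followed by cvtdq2ps does per SIMD lane.
 * Inputs outside the int32 range are not handled here. */
static float trunc_via_int(float x)
{
   return (float) (int) x;
}

int main(void)
{
   printf("%f %f\n", trunc_via_int(2.7f), trunc_via_int(-2.7f));  /* 2.0 -2.0 */
   return 0;
}
```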
*/ @@ -104,7 +90,6 @@ vs_ppc_run_linear( struct draw_vertex_shader *base, unsigned output_stride ) { struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base; - struct tgsi_exec_machine *machine = shader->machine; unsigned int i; #define MAX_VERTICES 4 @@ -137,27 +122,11 @@ vs_ppc_run_linear( struct draw_vertex_shader *base, /* run compiled shader */ -#if 0 - shader->func(machine->Inputs, - machine->Outputs, - (float (*)[4])constants, - machine->Temps, - (float (*)[4])shader->base.immediates, - input, - base->info.num_inputs, - input_stride, - output, - base->info.num_outputs, - output_stride ); -#else shader->func(inputs_soa, outputs_soa, temps_soa, - (float (*)[4][4]) shader->base.immediates, + (float (*)[4]) shader->base.immediates, (float (*)[4]) constants, ppc_builtin_constants); - /*output[0][0] = input[0][0] * 0.5;*/ -#endif - /* convert (up to) four output verts from SoA back to AoS format */ for (attr = 0; attr < base->info.num_outputs; attr++) { float *vOut = (float *) output; @@ -183,8 +152,6 @@ vs_ppc_run_linear( struct draw_vertex_shader *base, } - - static void vs_ppc_delete( struct draw_vertex_shader *base ) { @@ -201,7 +168,7 @@ vs_ppc_delete( struct draw_vertex_shader *base ) struct draw_vertex_shader * draw_create_vs_ppc(struct draw_context *draw, - const struct pipe_shader_state *templ) + const struct pipe_shader_state *templ) { struct draw_ppc_vertex_shader *vs; @@ -227,16 +194,14 @@ draw_create_vs_ppc(struct draw_context *draw, vs->base.run_linear = vs_ppc_run_linear; vs->base.delete = vs_ppc_delete; - vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 * 4 * + vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 * sizeof(float), 16); - vs->machine = &draw->vs.machine; - - ppc_init_func( &vs->ppc_program, 2000 ); /* XXX fix limit */ + ppc_init_func( &vs->ppc_program ); if (!tgsi_emit_ppc( (struct tgsi_token *) vs->base.state.tokens, &vs->ppc_program, - (float (*)[4])vs->base.immediates, + (float (*)[4]) vs->base.immediates, TRUE )) goto fail; diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c index b11ae31662..77ba5152f9 100644 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ b/src/gallium/auxiliary/draw/draw_vs_sse.c @@ -37,7 +37,7 @@ #include "draw_vs.h" -#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE) +#if defined(PIPE_ARCH_X86) #include "pipe/p_shader_tokens.h" @@ -99,9 +99,23 @@ vs_sse_run_linear( struct draw_vertex_shader *base, struct tgsi_exec_machine *machine = shader->machine; unsigned int i; + /* By default, execute all channels. XXX move this inside the loop + * below when we support shader conditionals/loops. + */ + tgsi_set_exec_mask(machine, 1, 1, 1, 1); + for (i = 0; i < count; i += MAX_TGSI_VERTICES) { unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i); + if (max_vertices < 4) { + /* disable the unused execution channels */ + tgsi_set_exec_mask(machine, + 1, + max_vertices > 1, + max_vertices > 2, + 0); + } + /* run compiled shader */ shader->func(machine->Inputs, diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp index 3a2f2878a3..93a9748bdb 100644 --- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp +++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp @@ -179,8 +179,7 @@ struct gallivm_cpu_engine * gallivm_global_cpu_engine() typedef void (*vertex_shader_runner)(void *ainputs, void *dests, - float (*aconsts)[4], - void *temps); + float (*aconsts)[4]); #define MAX_TGSI_VERTICES 4 /*! 
@@ -223,8 +222,7 @@ int gallivm_cpu_vs_exec(struct gallivm_prog *prog, /* run shader */ runner(machine->Inputs, machine->Outputs, - (float (*)[4]) constants, - machine->Temps); + (float (*)[4]) constants); /* Unswizzle all output results */ diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp index 4fc075cf6d..e1e5cabcf5 100644 --- a/src/gallium/auxiliary/gallivm/storagesoa.cpp +++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp @@ -48,13 +48,11 @@ using namespace llvm; StorageSoa::StorageSoa(llvm::BasicBlock *block, llvm::Value *input, llvm::Value *output, - llvm::Value *consts, - llvm::Value *temps) + llvm::Value *consts) : m_block(block), m_input(input), m_output(output), m_consts(consts), - m_temps(temps), m_immediates(0), m_idx(0) { @@ -169,7 +167,7 @@ std::vector<llvm::Value*> StorageSoa::constElement(llvm::IRBuilder<>* m_builder, { llvm::Value* res; std::vector<llvm::Value*> res2(4); - llvm::Value *xChannel, *yChannel, *zChannel, *wChannel; + llvm::Value *xChannel; xChannel = elementPointer(m_consts, idx, 0); @@ -195,14 +193,15 @@ std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx) return res; } -std::vector<llvm::Value*> StorageSoa::tempElement(llvm::Value *idx) +std::vector<llvm::Value*> StorageSoa::tempElement(llvm::IRBuilder<>* m_builder, int idx) { std::vector<llvm::Value*> res(4); + llvm::Value *temp = m_temps[idx]; - res[0] = element(m_temps, idx, 0); - res[1] = element(m_temps, idx, 1); - res[2] = element(m_temps, idx, 2); - res[3] = element(m_temps, idx, 3); + res[0] = element(temp, constantInt(0), 0); + res[1] = element(temp, constantInt(0), 1); + res[2] = element(temp, constantInt(0), 2); + res[3] = element(temp, constantInt(0), 3); return res; } @@ -326,7 +325,7 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in val = outputElement(realIndex); break; case TGSI_FILE_TEMPORARY: - val = tempElement(realIndex); + val = tempElement(m_builder, idx); break; case TGSI_FILE_CONSTANT: val = constElement(m_builder, realIndex); @@ -355,19 +354,39 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in return res; } +llvm::Value * StorageSoa::allocaTemp(llvm::IRBuilder<>* m_builder) +{ + VectorType *vector = VectorType::get(Type::FloatTy, 4); + ArrayType *vecArray = ArrayType::get(vector, 4); + AllocaInst *alloca = new AllocaInst(vecArray, "temp", + m_builder->GetInsertBlock()); + + return alloca; +} + + void StorageSoa::store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val, - int mask) + int mask, llvm::IRBuilder<>* m_builder) { llvm::Value *out = 0; + llvm::Value *realIndex = 0; switch(type) { case TGSI_FILE_OUTPUT: out = m_output; + realIndex = constantInt(idx); break; case TGSI_FILE_TEMPORARY: - out = m_temps; + // if that temp doesn't already exist, alloca it + if (m_temps.find(idx) == m_temps.end()) + m_temps[idx] = allocaTemp(m_builder); + + out = m_temps[idx]; + + realIndex = constantInt(0); break; case TGSI_FILE_INPUT: out = m_input; + realIndex = constantInt(idx); break; case TGSI_FILE_ADDRESS: { llvm::Value *addr = m_addresses[idx]; @@ -385,7 +404,6 @@ void StorageSoa::store(enum tgsi_file_type type, int idx, const std::vector<llvm assert(0); break; } - llvm::Value *realIndex = constantInt(idx); if ((mask & TGSI_WRITEMASK_X)) { llvm::Value *xChannel = elementPointer(out, realIndex, 0); new StoreInst(val[0], xChannel, false, m_block); diff --git a/src/gallium/auxiliary/gallivm/storagesoa.h 
b/src/gallium/auxiliary/gallivm/storagesoa.h index f21ca6ec43..56886f85e7 100644 --- a/src/gallium/auxiliary/gallivm/storagesoa.h +++ b/src/gallium/auxiliary/gallivm/storagesoa.h @@ -52,14 +52,13 @@ public: StorageSoa(llvm::BasicBlock *block, llvm::Value *input, llvm::Value *output, - llvm::Value *consts, - llvm::Value *temps); + llvm::Value *consts); std::vector<llvm::Value*> load(enum tgsi_file_type type, int idx, int swizzle, llvm::IRBuilder<>* m_builder, llvm::Value *indIdx =0); void store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val, - int mask); + int mask, llvm::IRBuilder<>* m_builder); void addImmediate(float *vec); void declareImmediates(); @@ -84,7 +83,7 @@ private: llvm::Value* unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx, int cc); std::vector<llvm::Value*> constElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx); std::vector<llvm::Value*> outputElement(llvm::Value *indIdx); - std::vector<llvm::Value*> tempElement(llvm::Value *indIdx); + std::vector<llvm::Value*> tempElement(llvm::IRBuilder<>* m_builder, int idx); std::vector<llvm::Value*> immediateElement(llvm::Value *indIdx); private: llvm::BasicBlock *m_block; @@ -92,12 +91,13 @@ private: llvm::Value *m_input; llvm::Value *m_output; llvm::Value *m_consts; - llvm::Value *m_temps; + std::map<int, llvm::Value*> m_temps; llvm::GlobalVariable *m_immediates; std::map<int, llvm::Value*> m_addresses; std::vector<std::vector<float> > m_immediatesToFlush; + llvm::Value * allocaTemp(llvm::IRBuilder<>* m_builder); mutable std::map<int, llvm::ConstantInt*> m_constInts; mutable char m_name[32]; diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp index 1191a6cae9..c11b88af9e 100644 --- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp +++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp @@ -53,7 +53,6 @@ static inline FunctionType *vertexShaderFunctionType() // [4 x <4 x float>] inputs, // [4 x <4 x float>] output, // [4 x [1 x float]] consts, - // [4 x <4 x float>] temps std::vector<const Type*> funcArgs; VectorType *vectorType = VectorType::get(Type::FloatTy, 4); @@ -67,7 +66,6 @@ static inline FunctionType *vertexShaderFunctionType() funcArgs.push_back(vectorArrayPtr);//inputs funcArgs.push_back(vectorArrayPtr);//output funcArgs.push_back(constsArrayPtr);//consts - funcArgs.push_back(vectorArrayPtr);//temps FunctionType *functionType = FunctionType::get( /*Result=*/Type::VoidTy, @@ -246,7 +244,6 @@ translate_instruction(llvm::Module *module, val = storage->constElement(src->SrcRegister.Index, indIdx); } else if (src->SrcRegister.File == TGSI_FILE_INPUT) { val = storage->inputElement(src->SrcRegister.Index, indIdx); - // FIXME we should not be generating elements for temporaries, this creates useless memory writes } else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) { val = storage->tempElement(src->SrcRegister.Index); } else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) { @@ -677,7 +674,6 @@ translate_instruction(llvm::Module *module, if (dst->DstRegister.File == TGSI_FILE_OUTPUT) { storage->setOutputElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask); - // FIXME we should not be generating elements for temporaries, this creates useless memory writes } else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) { storage->setTempElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask); } else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) { @@ -1027,7 +1023,8 @@ translate_instructionir(llvm::Module *module, for 
(int i = 0; i < inst->Instruction.NumDstRegs; ++i) { struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i]; storage->store((enum tgsi_file_type)dst->DstRegister.File, - dst->DstRegister.Index, out, dst->DstRegister.WriteMask); + dst->DstRegister.Index, out, dst->DstRegister.WriteMask, + instr->getIRBuilder() ); } } @@ -1122,8 +1119,6 @@ llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir, output->setName("outputs"); Value *consts = args++; consts->setName("consts"); - Value *temps = args++; - temps->setName("temps"); BasicBlock *label_entry = BasicBlock::Create("entry", shader, 0); @@ -1132,7 +1127,7 @@ llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir, fi = tgsi_default_full_instruction(); fd = tgsi_default_full_declaration(); - StorageSoa storage(label_entry, input, output, consts, temps); + StorageSoa storage(label_entry, input, output, consts); InstructionsSoa instr(mod, shader, label_entry, &storage); while(!tgsi_parse_end_of_tokens(&parse)) { diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h index 8505d333bd..19db8a6a91 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h +++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h @@ -177,12 +177,16 @@ pb_get_base_buffer( struct pb_buffer *buf, } +/** + * Don't call this directly. Use pb_reference instead. + */ static INLINE void pb_destroy(struct pb_buffer *buf) { assert(buf); if(!buf) return; + assert(buf->base.refcount == 0); buf->vtbl->destroy(buf); } @@ -193,11 +197,16 @@ static INLINE void pb_reference(struct pb_buffer **dst, struct pb_buffer *src) { - if (src) + if (src) { + assert(src->base.refcount); src->base.refcount++; + } - if (*dst && --(*dst)->base.refcount == 0) - pb_destroy( *dst ); + if (*dst) { + assert((*dst)->base.refcount); + if(--(*dst)->base.refcount == 0) + pb_destroy( *dst ); + } *dst = src; } diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c index 633ee70a75..e2594ea236 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c @@ -86,8 +86,7 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr, fenced_buf = fenced_buffer_create(fenced_mgr->fenced_list, buf); if(!fenced_buf) { - assert(buf->base.refcount == 1); - pb_destroy(buf); + pb_reference(&buf, NULL); } return fenced_buf; diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c index fe80ca30ee..a976d3041a 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c @@ -100,7 +100,7 @@ mm_buffer_destroy(struct pb_buffer *buf) assert(buf->base.refcount == 0); pipe_mutex_lock(mm->mutex); - mmFreeMem(mm_buf->block); + u_mmFreeMem(mm_buf->block); FREE(buf); pipe_mutex_unlock(mm->mutex); } @@ -175,14 +175,14 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr, mm_buf->mgr = mm; - mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0); + mm_buf->block = u_mmAllocMem(mm->heap, size, mm->align2, 0); if(!mm_buf->block) { debug_printf("warning: heap full\n"); #if 0 mmDumpMemInfo(mm->heap); #endif - mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0); + mm_buf->block = u_mmAllocMem(mm->heap, size, mm->align2, 0); if(!mm_buf->block) { FREE(mm_buf); pipe_mutex_unlock(mm->mutex); @@ -213,7 +213,7 @@ mm_bufmgr_destroy(struct pb_manager *mgr) pipe_mutex_lock(mm->mutex); - mmDestroy(mm->heap); + u_mmDestroy(mm->heap); pb_unmap(mm->buffer); 
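The tightened pb_reference()/pb_destroy() pair in pb_buffer.h above follows the usual take-new-reference-before-dropping-old pattern, so callers such as pb_bufmgr_fenced.c can release a buffer with pb_reference(&buf, NULL) instead of calling pb_destroy() directly. A minimal sketch of that pattern with a hypothetical my_buffer type:

```c
#include <assert.h>
#include <stdlib.h>

/* Hypothetical stand-in for struct pb_buffer. */
struct my_buffer { unsigned refcount; };

static void my_destroy(struct my_buffer *buf)
{
   assert(buf->refcount == 0);
   free(buf);
}

static void my_reference(struct my_buffer **dst, struct my_buffer *src)
{
   if (src) {
      assert(src->refcount);
      src->refcount++;              /* take the new reference first */
   }
   if (*dst) {
      assert((*dst)->refcount);
      if (--(*dst)->refcount == 0)  /* drop the old one, destroy at zero */
         my_destroy(*dst);
   }
   *dst = src;
}
```

Incrementing src before decrementing *dst keeps the self-assignment case (*dst == src) safe, and my_reference(&ptr, NULL) is the release idiom.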
pb_reference(&mm->buffer, NULL); @@ -254,7 +254,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer, if(!mm->map) goto failure; - mm->heap = mmInit(0, size); + mm->heap = u_mmInit(0, size); if (!mm->heap) goto failure; @@ -262,7 +262,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer, failure: if(mm->heap) - mmDestroy(mm->heap); + u_mmDestroy(mm->heap); if(mm->map) pb_unmap(mm->buffer); if(mm) diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c index 19087589a8..be7433baf8 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c +++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c @@ -38,12 +38,13 @@ #include "rtasm_execmem.h" -#if defined(__linux__) +#if defined(PIPE_OS_LINUX) + /* * Allocate a large block of memory which can hold code then dole it out * in pieces by means of the generic memory manager code. -*/ + */ #include <unistd.h> #include <sys/mman.h> @@ -62,7 +63,7 @@ static void init_heap(void) { if (!exec_heap) - exec_heap = mmInit( 0, EXEC_HEAP_SIZE ); + exec_heap = u_mmInit( 0, EXEC_HEAP_SIZE ); if (!exec_mem) exec_mem = (unsigned char *) mmap(0, EXEC_HEAP_SIZE, @@ -82,8 +83,8 @@ rtasm_exec_malloc(size_t size) init_heap(); if (exec_heap) { - size = (size + 31) & ~31; - block = mmAllocMem( exec_heap, size, 32, 0 ); + size = (size + 31) & ~31; /* next multiple of 32 bytes */ + block = u_mmAllocMem( exec_heap, size, 5, 0 ); /* 5 -> 32-byte alignment */ } if (block) @@ -103,17 +104,17 @@ rtasm_exec_free(void *addr) pipe_mutex_lock(exec_mutex); if (exec_heap) { - struct mem_block *block = mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem); + struct mem_block *block = u_mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem); if (block) - mmFreeMem(block); + u_mmFreeMem(block); } pipe_mutex_unlock(exec_mutex); } -#else +#else /* PIPE_OS_LINUX */ /* * Just use regular memory. @@ -133,4 +134,4 @@ rtasm_exec_free(void *addr) } -#endif +#endif /* PIPE_OS_LINUX */ diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c index 7dd8263749..6d11263be8 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c @@ -38,17 +38,18 @@ #include <stdio.h> #include "util/u_memory.h" #include "pipe/p_debug.h" +#include "rtasm_execmem.h" #include "rtasm_ppc.h" void -ppc_init_func(struct ppc_function *p, unsigned max_inst) +ppc_init_func(struct ppc_function *p) { uint i; - p->store = align_malloc(max_inst * PPC_INST_SIZE, 16); p->num_inst = 0; - p->max_inst = max_inst; + p->max_inst = 100; /* first guess at buffer size */ + p->store = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE); p->reg_used = 0x0; p->fp_used = 0x0; p->vec_used = 0x0; @@ -66,12 +67,19 @@ ppc_release_func(struct ppc_function *p) { assert(p->num_inst <= p->max_inst); if (p->store != NULL) { - align_free(p->store); + rtasm_exec_free(p->store); } p->store = NULL; } +uint +ppc_num_instructions(const struct ppc_function *p) +{ + return p->num_inst; +} + + void (*ppc_get_func(struct ppc_function *p))(void) { #if 0 @@ -202,6 +210,35 @@ ppc_release_vec_register(struct ppc_function *p, int reg) } +/** + * Append instruction to instruction buffer. Grow buffer if out of room. 
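The rounding in rtasm_exec_malloc() above, size = (size + 31) & ~31, is the usual align-up-to-a-power-of-two trick, and the 5 passed to u_mmAllocMem() is log2(32) for 32-byte alignment. A tiny check of the arithmetic with a hypothetical helper:

```c
#include <assert.h>
#include <stddef.h>

/* Hypothetical helper: round a size up to the next multiple of 32 bytes. */
static size_t align_up_32(size_t size)
{
   return (size + 31) & ~(size_t) 31;
}

int main(void)
{
   assert(align_up_32(1)  == 32);
   assert(align_up_32(32) == 32);
   assert(align_up_32(33) == 64);
   return 0;
}
```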
+ */ +static void +emit_instruction(struct ppc_function *p, uint32_t inst_bits) +{ + if (!p->store) + return; /* out of memory, drop the instruction */ + + if (p->num_inst == p->max_inst) { + /* allocate larger buffer */ + uint32_t *newbuf; + p->max_inst *= 2; /* 2x larger */ + newbuf = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE); + if (newbuf) { + memcpy(newbuf, p->store, p->num_inst * PPC_INST_SIZE); + } + rtasm_exec_free(p->store); + p->store = newbuf; + if (!p->store) { + /* out of memory */ + p->num_inst = 0; + return; + } + } + + p->store[p->num_inst++] = inst_bits; +} + union vx_inst { uint32_t bits; @@ -223,8 +260,7 @@ emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB) inst.inst.vA = vA; inst.inst.vB = vB; inst.inst.op2 = op2; - p->store[p->num_inst++] = inst.bits; - assert(p->num_inst <= p->max_inst); + emit_instruction(p, inst.bits); }; @@ -250,8 +286,7 @@ emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB) inst.inst.vB = vB; inst.inst.rC = 0; inst.inst.op2 = op2; - p->store[p->num_inst++] = inst.bits; - assert(p->num_inst <= p->max_inst); + emit_instruction(p, inst.bits); }; @@ -277,8 +312,7 @@ emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC) inst.inst.vB = vB; inst.inst.vC = vC; inst.inst.op2 = op2; - p->store[p->num_inst++] = inst.bits; - assert(p->num_inst <= p->max_inst); + emit_instruction(p, inst.bits); }; @@ -300,8 +334,7 @@ emit_i(struct ppc_function *p, uint op, uint li, uint aa, uint lk) inst.inst.li = li; inst.inst.aa = aa; inst.inst.lk = lk; - p->store[p->num_inst++] = inst.bits; - assert(p->num_inst <= p->max_inst); + emit_instruction(p, inst.bits); } @@ -330,8 +363,7 @@ emit_xl(struct ppc_function *p, uint op, uint bo, uint bi, uint bh, inst.inst.bh = bh; inst.inst.op2 = op2; inst.inst.lk = lk; - p->store[p->num_inst++] = inst.bits; - assert(p->num_inst <= p->max_inst); + emit_instruction(p, inst.bits); } static INLINE void @@ -373,8 +405,7 @@ emit_x(struct ppc_function *p, uint op, uint vrs, uint ra, uint rb, uint op2) inst.inst.rb = rb; inst.inst.op2 = op2; inst.inst.unused = 0x0; - p->store[p->num_inst++] = inst.bits; - assert(p->num_inst <= p->max_inst); + emit_instruction(p, inst.bits); } @@ -398,8 +429,7 @@ emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si) inst.inst.rt = rt; inst.inst.ra = ra; inst.inst.si = (unsigned) (si & 0xffff); - p->store[p->num_inst++] = inst.bits; - assert(p->num_inst <= p->max_inst); + emit_instruction(p, inst.bits); }; @@ -428,8 +458,7 @@ emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2, inst.inst.unused = 0x0; inst.inst.op2 = op2; inst.inst.rc = rc; - p->store[p->num_inst++] = inst.bits; - assert(p->num_inst <= p->max_inst); + emit_instruction(p, inst.bits); }; @@ -458,8 +487,7 @@ emit_xo(struct ppc_function *p, uint op, uint rt, uint ra, uint rb, uint oe, inst.inst.oe = oe; inst.inst.op2 = op2; inst.inst.rc = rc; - p->store[p->num_inst++] = inst.bits; - assert(p->num_inst <= p->max_inst); + emit_instruction(p, inst.bits); } @@ -505,6 +533,13 @@ ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC) emit_va(p, 46, vD, vA, vC, vB); /* note arg order */ } +/** vector float negative mult subtract: vD = vA - vB * vC */ +void +ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC) +{ + emit_va(p, 47, vD, vB, vA, vC); /* note arg order */ +} + /** vector float compare greater than */ void ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB) diff --git 
a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h index f938d8d759..afb4704c39 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h @@ -62,8 +62,9 @@ struct ppc_function -extern void ppc_init_func(struct ppc_function *p, unsigned max_inst); +extern void ppc_init_func(struct ppc_function *p); extern void ppc_release_func(struct ppc_function *p); +extern uint ppc_num_instructions(const struct ppc_function *p); extern void (*ppc_get_func( struct ppc_function *p ))( void ); extern void ppc_dump_func(const struct ppc_function *p); @@ -97,10 +98,14 @@ ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB); extern void ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB); -/** vector float mult add */ +/** vector float mult add: vD = vA * vB + vC */ extern void ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC); +/** vector float negative mult subtract: vD = vA - vB * vC */ +extern void +ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC); + /** vector float compare greater than */ extern void ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB); diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c index dea1aed032..f8568f690b 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c @@ -185,6 +185,34 @@ reg_name(int reg) } +static void +emit_instruction(struct spe_function *p, uint32_t inst_bits) +{ + if (!p->store) + return; /* out of memory, drop the instruction */ + + if (p->num_inst == p->max_inst) { + /* allocate larger buffer */ + uint32_t *newbuf; + p->max_inst *= 2; /* 2x larger */ + newbuf = align_malloc(p->max_inst * SPE_INST_SIZE, 16); + if (newbuf) { + memcpy(newbuf, p->store, p->num_inst * SPE_INST_SIZE); + } + align_free(p->store); + p->store = newbuf; + if (!p->store) { + /* out of memory */ + p->num_inst = 0; + return; + } + } + + p->store[p->num_inst++] = inst_bits; +} + + + static void emit_RR(struct spe_function *p, unsigned op, unsigned rT, unsigned rA, unsigned rB, const char *name) { @@ -193,8 +221,7 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rB = rB; inst.inst.rA = rA; inst.inst.rT = rT; - p->store[p->num_inst++] = inst.bits; - assert(p->num_inst <= p->max_inst); + emit_instruction(p, inst.bits); if (p->print) { indent(p); printf("%s\t%s, %s, %s\n", @@ -212,8 +239,7 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rB = rB; inst.inst.rA = rA; inst.inst.rC = rC; - p->store[p->num_inst++] = inst.bits; - assert(p->num_inst <= p->max_inst); + emit_instruction(p, inst.bits); if (p->print) { indent(p); printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT), @@ -230,8 +256,7 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT, inst.inst.i7 = imm; inst.inst.rA = rA; inst.inst.rT = rT; - p->store[p->num_inst++] = inst.bits; - assert(p->num_inst <= p->max_inst); + emit_instruction(p, inst.bits); if (p->print) { indent(p); printf("%s\t%s, %s, 0x%x\n", @@ -249,8 +274,7 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT, inst.inst.i8 = imm; inst.inst.rA = rA; inst.inst.rT = rT; - p->store[p->num_inst++] = inst.bits; - assert(p->num_inst <= p->max_inst); + emit_instruction(p, inst.bits); if (p->print) { indent(p); printf("%s\t%s, %s, 0x%x\n", @@ -268,8 +292,7 @@ static void emit_RI10(struct spe_function *p, unsigned op, 
unsigned rT, inst.inst.i10 = imm; inst.inst.rA = rA; inst.inst.rT = rT; - p->store[p->num_inst++] = inst.bits; - assert(p->num_inst <= p->max_inst); + emit_instruction(p, inst.bits); if (p->print) { indent(p); printf("%s\t%s, %s, 0x%x\n", @@ -295,8 +318,7 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT, inst.inst.op = op; inst.inst.i16 = imm; inst.inst.rT = rT; - p->store[p->num_inst++] = inst.bits; - assert(p->num_inst <= p->max_inst); + emit_instruction(p, inst.bits); if (p->print) { indent(p); printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm); @@ -311,8 +333,7 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT, inst.inst.op = op; inst.inst.i18 = imm; inst.inst.rT = rT; - p->store[p->num_inst++] = inst.bits; - assert(p->num_inst <= p->max_inst); + emit_instruction(p, inst.bits); if (p->print) { indent(p); printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm); @@ -394,15 +415,19 @@ void _name (struct spe_function *p, int imm) \ /** * Initialize an spe_function. - * \param code_size size of instruction buffer to allocate, in bytes. + * \param code_size initial size of instruction buffer to allocate, in bytes. + * If zero, use a default. */ void spe_init_func(struct spe_function *p, unsigned code_size) { unsigned int i; - p->store = align_malloc(code_size, 16); + if (!code_size) + code_size = 64; + p->num_inst = 0; p->max_inst = code_size / SPE_INST_SIZE; + p->store = align_malloc(code_size, 16); p->set_count = 0; memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0])); diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c index 1a5294eabc..1da04ab7e0 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c @@ -958,6 +958,10 @@ fetch_src_file_channel( switch( file ) { case TGSI_FILE_CONSTANT: assert(mach->Consts); + assert(index->i[0] >= 0); + assert(index->i[1] >= 0); + assert(index->i[2] >= 0); + assert(index->i[3] >= 0); chan->f[0] = mach->Consts[index->i[0]][swizzle]; chan->f[1] = mach->Consts[index->i[1]][swizzle]; chan->f[2] = mach->Consts[index->i[2]][swizzle]; @@ -1041,12 +1045,16 @@ fetch_source( if (reg->SrcRegister.Indirect) { union tgsi_exec_channel index2; union tgsi_exec_channel indir_index; + const uint execmask = mach->ExecMask; + uint i; + /* which address register (always zero now) */ index2.i[0] = index2.i[1] = index2.i[2] = index2.i[3] = reg->SrcRegisterInd.Index; + /* get current value of address register[swizzle] */ swizzle = tgsi_util_get_src_register_swizzle( ®->SrcRegisterInd, CHAN_X ); fetch_src_file_channel( mach, @@ -1055,10 +1063,19 @@ fetch_source( &index2, &indir_index ); + /* add value of address register to the offset */ index.i[0] += indir_index.i[0]; index.i[1] += indir_index.i[1]; index.i[2] += indir_index.i[2]; index.i[3] += indir_index.i[3]; + + /* for disabled execution channels, zero-out the index to + * avoid using a potential garbage value. 
+ */ + for (i = 0; i < QUAD_SIZE; i++) { + if ((execmask & (1 << i)) == 0) + index.i[i] = 0; + } } if( reg->SrcRegister.Dimension ) { @@ -1087,6 +1104,8 @@ fetch_source( if (reg->SrcRegisterDim.Indirect) { union tgsi_exec_channel index2; union tgsi_exec_channel indir_index; + const uint execmask = mach->ExecMask; + uint i; index2.i[0] = index2.i[1] = @@ -1105,6 +1124,14 @@ fetch_source( index.i[1] += indir_index.i[1]; index.i[2] += indir_index.i[2]; index.i[3] += indir_index.i[3]; + + /* for disabled execution channels, zero-out the index to + * avoid using a potential garbage value. + */ + for (i = 0; i < QUAD_SIZE; i++) { + if ((execmask & (1 << i)) == 0) + index.i[i] = 0; + } } } @@ -2007,7 +2034,21 @@ exec_instruction( case TGSI_OPCODE_DOT2ADD: /* TGSI_OPCODE_DP2A */ - assert (0); + FETCH( &r[0], 0, CHAN_X ); + FETCH( &r[1], 1, CHAN_X ); + micro_mul( &r[0], &r[0], &r[1] ); + + FETCH( &r[1], 0, CHAN_Y ); + FETCH( &r[2], 1, CHAN_Y ); + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FETCH( &r[2], 2, CHAN_X ); + micro_add( &r[0], &r[0], &r[2] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } break; case TGSI_OPCODE_INDEX: @@ -2436,7 +2477,66 @@ exec_instruction( break; case TGSI_OPCODE_NRM: - assert (0); + /* 3-component vector normalize */ + { + union tgsi_exec_channel tmp, dot; + + /* tmp = dp3(src0, src0): */ + FETCH( &r[0], 0, CHAN_X ); + micro_mul( &tmp, &r[0], &r[0] ); + + FETCH( &r[1], 0, CHAN_Y ); + micro_mul( &dot, &r[1], &r[1] ); + micro_add( &tmp, &tmp, &dot ); + + FETCH( &r[2], 0, CHAN_Z ); + micro_mul( &dot, &r[2], &r[2] ); + micro_add( &tmp, &tmp, &dot ); + + /* tmp = 1 / sqrt(tmp) */ + micro_sqrt( &tmp, &tmp ); + micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp ); + + /* note: w channel is undefined */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + /* chan = chan * tmp */ + micro_mul( &r[chan_index], &tmp, &r[chan_index] ); + STORE( &r[chan_index], 0, chan_index ); + } + } + break; + + case TGSI_OPCODE_NRM4: + /* 4-component vector normalize */ + { + union tgsi_exec_channel tmp, dot; + + /* tmp = dp4(src0, src0): */ + FETCH( &r[0], 0, CHAN_X ); + micro_mul( &tmp, &r[0], &r[0] ); + + FETCH( &r[1], 0, CHAN_Y ); + micro_mul( &dot, &r[1], &r[1] ); + micro_add( &tmp, &tmp, &dot ); + + FETCH( &r[2], 0, CHAN_Z ); + micro_mul( &dot, &r[2], &r[2] ); + micro_add( &tmp, &tmp, &dot ); + + FETCH( &r[3], 0, CHAN_W ); + micro_mul( &dot, &r[3], &r[3] ); + micro_add( &tmp, &tmp, &dot ); + + /* tmp = 1 / sqrt(tmp) */ + micro_sqrt( &tmp, &tmp ); + micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + /* chan = chan * tmp */ + micro_mul( &r[chan_index], &tmp, &r[chan_index] ); + STORE( &r[chan_index], 0, chan_index ); + } + } break; case TGSI_OPCODE_DIV: diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h index c4e649e69c..fc40a25e09 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.h +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h @@ -165,6 +165,10 @@ struct tgsi_exec_labels #define TGSI_EXEC_TEMP_HALF_I (TGSI_EXEC_NUM_TEMPS + 3) #define TGSI_EXEC_TEMP_HALF_C 1 +/* execution mask, each value is either 0 or ~0 */ +#define TGSI_EXEC_MASK_I (TGSI_EXEC_NUM_TEMPS + 3) +#define TGSI_EXEC_MASK_C 2 + #define TGSI_EXEC_TEMP_R0 (TGSI_EXEC_NUM_TEMPS + 4) #define TGSI_EXEC_TEMP_ADDR (TGSI_EXEC_NUM_TEMPS + 5) @@ -265,6 +269,27 @@ void tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach); +static INLINE void 
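For reference, the DOT2ADD (DP2A) and NRM cases filled in for tgsi_exec.c above compute, per enabled channel, dst = src0.x*src1.x + src0.y*src1.y + src2.x and a 3-component normalize respectively. A scalar sketch of those semantics (illustrative only, not the interpreter code):

```c
#include <math.h>

/* DP2A: dst = src0.x * src1.x + src0.y * src1.y + src2.x */
static float dp2a(const float s0[4], const float s1[4], const float s2[4])
{
   return s0[0] * s1[0] + s0[1] * s1[1] + s2[0];
}

/* NRM: 3-component normalize; the w channel is left undefined. */
static void nrm3(const float src[4], float dst[4])
{
   float inv = 1.0f / sqrtf(src[0] * src[0] + src[1] * src[1] + src[2] * src[2]);
   dst[0] = src[0] * inv;
   dst[1] = src[1] * inv;
   dst[2] = src[2] * inv;
}
```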
+tgsi_set_kill_mask(struct tgsi_exec_machine *mach, unsigned mask) +{ + mach->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] = + mask; +} + + +/** Set execution mask values prior to executing the shader */ +static INLINE void +tgsi_set_exec_mask(struct tgsi_exec_machine *mach, + boolean ch0, boolean ch1, boolean ch2, boolean ch3) +{ + int *mask = mach->Temps[TGSI_EXEC_MASK_I].xyzw[TGSI_EXEC_MASK_C].i; + mask[0] = ch0 ? ~0 : 0; + mask[1] = ch1 ? ~0 : 0; + mask[2] = ch2 ? ~0 : 0; + mask[3] = ch3 ? ~0 : 0; +} + + #if defined __cplusplus } /* extern "C" */ #endif diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c index 9ad7ecd7cf..a92b1902e3 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c +++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c @@ -40,6 +40,7 @@ #include "util/u_sse.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" +#include "tgsi_dump.h" #include "tgsi_exec.h" #include "tgsi_ppc.h" #include "rtasm/rtasm_ppc.h" @@ -72,11 +73,20 @@ const float ppc_builtin_constants[] ALIGN16_ATTRIB = { #define CHAN_Z 2 #define CHAN_W 3 -#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I -#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C -#define TEMP_R0 TGSI_EXEC_TEMP_R0 -#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR +/** + * How many TGSI temps should be implemented with real PPC vector registers + * rather than memory. + */ +#define MAX_PPC_TEMPS 4 + + +struct reg_chan_vec +{ + struct tgsi_full_src_register src; + uint chan; + uint vec; +}; /** @@ -92,12 +102,105 @@ struct gen_context int const_reg; /**< GP register pointing to constants buffer */ int builtins_reg; /**< GP register pointint to built-in constants */ + int offset_reg; /**< used to reduce redundant li instructions */ + int offset_value; + int one_vec; /**< vector register with {1.0, 1.0, 1.0, 1.0} */ int bit31_vec; /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */ + + /** + * Map TGSI temps to PPC vector temps. + * We have 32 PPC vector regs. Use 16 of them for storing 4 TGSI temps. + * XXX currently only do this for TGSI temps [0..MAX_PPC_TEMPS-1]. + */ + int temps_map[MAX_PPC_TEMPS][4]; + + /** + * Cache of src registers. + * This is used to avoid redundant load instructions. + */ + struct { + struct tgsi_full_src_register src; + uint chan; + uint vec; + } regs[12]; /* 3 src regs, 4 channels */ + uint num_regs; }; /** + * Initialize code generation context. + */ +static void +init_gen_context(struct gen_context *gen, struct ppc_function *func) +{ + uint i; + + memset(gen, 0, sizeof(*gen)); + gen->f = func; + gen->inputs_reg = ppc_reserve_register(func, 3); /* first function param */ + gen->outputs_reg = ppc_reserve_register(func, 4); /* second function param */ + gen->temps_reg = ppc_reserve_register(func, 5); /* ... */ + gen->immed_reg = ppc_reserve_register(func, 6); + gen->const_reg = ppc_reserve_register(func, 7); + gen->builtins_reg = ppc_reserve_register(func, 8); + gen->one_vec = -1; + gen->bit31_vec = -1; + gen->offset_reg = -1; + gen->offset_value = -9999999; + for (i = 0; i < MAX_PPC_TEMPS; i++) { + gen->temps_map[i][0] = ppc_allocate_vec_register(gen->f); + gen->temps_map[i][1] = ppc_allocate_vec_register(gen->f); + gen->temps_map[i][2] = ppc_allocate_vec_register(gen->f); + gen->temps_map[i][3] = ppc_allocate_vec_register(gen->f); + } +} + + +/** + * All PPC vector load/store instructions form an effective address + * by adding the contents of two registers. 
For example: + * lvx v2,r8,r9 # v2 = memory[r8 + r9] + * stvx v2,r8,r9 # memory[r8 + r9] = v2; + * So our lvx/stvx instructions are typically preceded by an 'li' instruction + * to load r9 (above) with an immediate (an offset). + * This code emits that 'li' instruction, but only if the offset value is + * different than the previous 'li'. + * This optimization seems to save about 10% in the instruction count. + * Note that we need to unconditionally emit an 'li' inside basic blocks + * (such as inside loops). + */ +static int +emit_li_offset(struct gen_context *gen, int offset) +{ + if (gen->offset_reg <= 0) { + /* allocate a GP register for storing load/store offset */ + gen->offset_reg = ppc_allocate_register(gen->f); + } + + /* emit new 'li' if offset is changing */ + if (gen->offset_value < 0 || gen->offset_value != offset) { + gen->offset_value = offset; + ppc_li(gen->f, gen->offset_reg, offset); + } + + return gen->offset_reg; +} + + +/** + * Forces subsequent emit_li_offset() calls to emit an 'li'. + * To be called at the top of basic blocks. + */ +static void +reset_li_offset(struct gen_context *gen) +{ + gen->offset_value = -9999999; +} + + + +/** * Load the given vector register with {value, value, value, value}. * The value must be in the ppu_builtin_constants[] array. * We wouldn't need this if there was a simple way to load PPC vector @@ -109,10 +212,9 @@ load_constant_vec(struct gen_context *gen, int dst_vec, float value) uint pos; for (pos = 0; pos < Elements(ppc_builtin_constants); pos++) { if (ppc_builtin_constants[pos] == value) { - int offset_reg = ppc_allocate_register(gen->f); int offset = pos * 4; + int offset_reg = emit_li_offset(gen, offset); - ppc_li(gen->f, offset_reg, offset); /* Load 4-byte word into vector register. * The vector slot depends on the effective address we load from. * We know that our builtins start at a 16-byte boundary so we @@ -122,7 +224,6 @@ load_constant_vec(struct gen_context *gen, int dst_vec, float value) ppc_lvewx(gen->f, dst_vec, gen->builtins_reg, offset_reg); /* splat word[pos % 4] across the vector reg */ ppc_vspltw(gen->f, dst_vec, dst_vec, pos % 4); - ppc_release_register(gen->f, offset_reg); return; } } @@ -159,15 +260,15 @@ gen_get_bit31_vec(struct gen_context *gen) /** - * Register fetch, put result in 'dst_vec'. + * Register fetch. Return PPC vector register with result. 
*/ -static void +static int emit_fetch(struct gen_context *gen, - unsigned dst_vec, const struct tgsi_full_src_register *reg, const unsigned chan_index) { uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index); + int dst_vec = -1; switch (swizzle) { case TGSI_EXTSWIZZLE_X: @@ -177,36 +278,46 @@ emit_fetch(struct gen_context *gen, switch (reg->SrcRegister.File) { case TGSI_FILE_INPUT: { - int offset_reg = ppc_allocate_register(gen->f); int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16; - ppc_li(gen->f, offset_reg, offset); + int offset_reg = emit_li_offset(gen, offset); + dst_vec = ppc_allocate_vec_register(gen->f); ppc_lvx(gen->f, dst_vec, gen->inputs_reg, offset_reg); - ppc_release_register(gen->f, offset_reg); } break; case TGSI_FILE_TEMPORARY: - { - int offset_reg = ppc_allocate_register(gen->f); + if (reg->SrcRegister.Index < MAX_PPC_TEMPS) { + /* use PPC vec register */ + dst_vec = gen->temps_map[reg->SrcRegister.Index][swizzle]; + } + else { + /* use memory-based temp register "file" */ int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16; - ppc_li(gen->f, offset_reg, offset); + int offset_reg = emit_li_offset(gen, offset); + dst_vec = ppc_allocate_vec_register(gen->f); ppc_lvx(gen->f, dst_vec, gen->temps_reg, offset_reg); - ppc_release_register(gen->f, offset_reg); } break; case TGSI_FILE_IMMEDIATE: { - int offset_reg = ppc_allocate_register(gen->f); - int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16; - ppc_li(gen->f, offset_reg, offset); - ppc_lvx(gen->f, dst_vec, gen->immed_reg, offset_reg); - ppc_release_register(gen->f, offset_reg); + int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4; + int offset_reg = emit_li_offset(gen, offset); + dst_vec = ppc_allocate_vec_register(gen->f); + /* Load 4-byte word into vector register. + * The vector slot depends on the effective address we load from. + * We know that our immediates start at a 16-byte boundary so we + * know that 'swizzle' tells us which vector slot will have the + * loaded word. The other vector slots will be undefined. + */ + ppc_lvewx(gen->f, dst_vec, gen->immed_reg, offset_reg); + /* splat word[swizzle] across the vector reg */ + ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle); } break; case TGSI_FILE_CONSTANT: { - int offset_reg = ppc_allocate_register(gen->f); int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4; - ppc_li(gen->f, offset_reg, offset); + int offset_reg = emit_li_offset(gen, offset); + dst_vec = ppc_allocate_vec_register(gen->f); /* Load 4-byte word into vector register. * The vector slot depends on the effective address we load from. 
* We know that our constants start at a 16-byte boundary so we @@ -216,7 +327,6 @@ emit_fetch(struct gen_context *gen, ppc_lvewx(gen->f, dst_vec, gen->const_reg, offset_reg); /* splat word[swizzle] across the vector reg */ ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle); - ppc_release_register(gen->f, offset_reg); } break; default: @@ -229,6 +339,7 @@ emit_fetch(struct gen_context *gen, case TGSI_EXTSWIZZLE_ONE: { int one_vec = gen_one_vec(gen); + dst_vec = ppc_allocate_vec_register(gen->f); ppc_vmove(gen->f, dst_vec, one_vec); } break; @@ -236,6 +347,8 @@ emit_fetch(struct gen_context *gen, assert( 0 ); } + assert(dst_vec >= 0); + { uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index); if (sign_op != TGSI_UTIL_SIGN_KEEP) { @@ -259,40 +372,148 @@ emit_fetch(struct gen_context *gen, } } } + + return dst_vec; } -#define FETCH( GEN, INST, DST_VEC, SRC_REG, CHAN ) \ - emit_fetch( GEN, DST_VEC, &(INST).FullSrcRegisters[SRC_REG], CHAN ) +/** + * Test if two TGSI src registers refer to the same memory location. + * We use this to avoid redundant register loads. + */ +static boolean +equal_src_locs(const struct tgsi_full_src_register *a, uint chan_a, + const struct tgsi_full_src_register *b, uint chan_b) +{ + int swz_a, swz_b; + int sign_a, sign_b; + if (a->SrcRegister.File != b->SrcRegister.File) + return FALSE; + if (a->SrcRegister.Index != b->SrcRegister.Index) + return FALSE; + swz_a = tgsi_util_get_full_src_register_extswizzle(a, chan_a); + swz_b = tgsi_util_get_full_src_register_extswizzle(b, chan_b); + if (swz_a != swz_b) + return FALSE; + sign_a = tgsi_util_get_full_src_register_sign_mode(a, chan_a); + sign_b = tgsi_util_get_full_src_register_sign_mode(b, chan_b); + if (sign_a != sign_b) + return FALSE; + return TRUE; +} + + +/** + * Given a TGSI src register and channel index, return the PPC vector + * register containing the value. We use a cache to prevent re-loading + * the same register multiple times. + * \return index of PPC vector register with the desired src operand + */ +static int +get_src_vec(struct gen_context *gen, + struct tgsi_full_instruction *inst, int src_reg, uint chan) +{ + const const struct tgsi_full_src_register *src = + &inst->FullSrcRegisters[src_reg]; + int vec; + uint i; + + /* check the cache */ + for (i = 0; i < gen->num_regs; i++) { + if (equal_src_locs(&gen->regs[i].src, gen->regs[i].chan, src, chan)) { + /* cache hit */ + assert(gen->regs[i].vec >= 0); + return gen->regs[i].vec; + } + } + + /* cache miss: allocate new vec reg and emit fetch/load code */ + vec = emit_fetch(gen, src, chan); + gen->regs[gen->num_regs].src = *src; + gen->regs[gen->num_regs].chan = chan; + gen->regs[gen->num_regs].vec = vec; + gen->num_regs++; + + assert(gen->num_regs <= Elements(gen->regs)); + + assert(vec >= 0); + + return vec; +} + + +/** + * Clear the src operand cache. To be called at the end of each emit function. 
+ */ +static void +release_src_vecs(struct gen_context *gen) +{ + uint i; + for (i = 0; i < gen->num_regs; i++) { + const const struct tgsi_full_src_register src = gen->regs[i].src; + if (!(src.SrcRegister.File == TGSI_FILE_TEMPORARY && + src.SrcRegister.Index < MAX_PPC_TEMPS)) { + ppc_release_vec_register(gen->f, gen->regs[i].vec); + } + } + gen->num_regs = 0; +} + + + +static int +get_dst_vec(struct gen_context *gen, + const struct tgsi_full_instruction *inst, + unsigned chan_index) +{ + const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[0]; + + if (reg->DstRegister.File == TGSI_FILE_TEMPORARY && + reg->DstRegister.Index < MAX_PPC_TEMPS) { + int vec = gen->temps_map[reg->DstRegister.Index][chan_index]; + return vec; + } + else { + return ppc_allocate_vec_register(gen->f); + } +} + /** * Register store. Store 'src_vec' at location indicated by 'reg'. + * \param free_vec Should the src_vec be released when done? */ static void emit_store(struct gen_context *gen, - unsigned src_vec, - const struct tgsi_full_dst_register *reg, + int src_vec, const struct tgsi_full_instruction *inst, - unsigned chan_index) + unsigned chan_index, + boolean free_vec) { + const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[0]; + switch (reg->DstRegister.File) { case TGSI_FILE_OUTPUT: { - int offset_reg = ppc_allocate_register(gen->f); int offset = (reg->DstRegister.Index * 4 + chan_index) * 16; - ppc_li(gen->f, offset_reg, offset); + int offset_reg = emit_li_offset(gen, offset); ppc_stvx(gen->f, src_vec, gen->outputs_reg, offset_reg); - ppc_release_register(gen->f, offset_reg); } break; case TGSI_FILE_TEMPORARY: - { - int offset_reg = ppc_allocate_register(gen->f); + if (reg->DstRegister.Index < MAX_PPC_TEMPS) { + if (!free_vec) { + int dst_vec = gen->temps_map[reg->DstRegister.Index][chan_index]; + if (dst_vec != src_vec) + ppc_vmove(gen->f, dst_vec, src_vec); + } + free_vec = FALSE; + } + else { int offset = (reg->DstRegister.Index * 4 + chan_index) * 16; - ppc_li(gen->f, offset_reg, offset); + int offset_reg = emit_li_offset(gen, offset); ppc_stvx(gen->f, src_vec, gen->temps_reg, offset_reg); - ppc_release_register(gen->f, offset_reg); } break; #if 0 @@ -322,22 +543,20 @@ emit_store(struct gen_context *gen, break; } #endif -} - - -#define STORE( GEN, INST, XMM, INDEX, CHAN )\ - emit_store( GEN, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN ) + if (free_vec) + ppc_release_vec_register(gen->f, src_vec); +} static void emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst) { - int v0 = ppc_allocate_vec_register(gen->f); - int v1 = ppc_allocate_vec_register(gen->f); + int v0, v1; uint chan_index; - FETCH(gen, *inst, v0, 0, CHAN_X); + v0 = get_src_vec(gen, inst, 0, CHAN_X); + v1 = ppc_allocate_vec_register(gen->f); switch (inst->Instruction.Opcode) { case TGSI_OPCODE_RSQ: @@ -353,9 +572,10 @@ emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst) } FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE(gen, *inst, v1, 0, chan_index); + emit_store(gen, v1, inst, chan_index, FALSE); } - ppc_release_vec_register(gen->f, v0); + + release_src_vecs(gen); ppc_release_vec_register(gen->f, v1); } @@ -363,61 +583,65 @@ emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst) static void emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst) { - int v0 = ppc_allocate_vec_register(gen->f); uint chan_index; FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) { - FETCH(gen, *inst, 0, 0, 
chan_index); /* v0 = srcreg[0] */ + int v0 = get_src_vec(gen, inst, 0, chan_index); /* v0 = srcreg[0] */ + int v1 = get_dst_vec(gen, inst, chan_index); switch (inst->Instruction.Opcode) { case TGSI_OPCODE_ABS: /* turn off the most significant bit of each vector float word */ { - int v1 = ppc_allocate_vec_register(gen->f); - ppc_vspltisw(gen->f, v1, -1); /* v1 = {-1, -1, -1, -1} */ - ppc_vslw(gen->f, v1, v1, v1); /* v1 = {1<<31, 1<<31, 1<<31, 1<<31} */ - ppc_vandc(gen->f, v0, v0, v1); /* v0 = v0 & ~v1 */ - ppc_release_vec_register(gen->f, v1); + int bit31_vec = gen_get_bit31_vec(gen); + ppc_vandc(gen->f, v1, v0, bit31_vec); /* v1 = v0 & ~bit31 */ } break; case TGSI_OPCODE_FLOOR: - ppc_vrfim(gen->f, v0, v0); /* v0 = floor(v0) */ + ppc_vrfim(gen->f, v1, v0); /* v1 = floor(v0) */ break; case TGSI_OPCODE_FRAC: - { - int v1 = ppc_allocate_vec_register(gen->f); - ppc_vrfim(gen->f, v1, v0); /* v1 = floor(v0) */ - ppc_vsubfp(gen->f, v0, v0, v1); /* v0 = v0 - v1 */ - ppc_release_vec_register(gen->f, v1); - } + ppc_vrfim(gen->f, v1, v0); /* tmp = floor(v0) */ + ppc_vsubfp(gen->f, v1, v0, v1); /* v1 = v0 - v1 */ break; case TGSI_OPCODE_EXPBASE2: - ppc_vexptefp(gen->f, v0, v0); /* v0 = 2^v0 */ + ppc_vexptefp(gen->f, v1, v0); /* v1 = 2^v0 */ break; case TGSI_OPCODE_LOGBASE2: /* XXX this may be broken! */ - ppc_vlogefp(gen->f, v0, v0); /* v0 = log2(v0) */ + ppc_vlogefp(gen->f, v1, v0); /* v1 = log2(v0) */ break; case TGSI_OPCODE_MOV: - /* nothing */ + case TGSI_OPCODE_SWZ: + if (v0 != v1) + ppc_vmove(gen->f, v1, v0); break; default: assert(0); } - STORE(gen, *inst, v0, 0, chan_index); /* store v0 */ + emit_store(gen, v1, inst, chan_index, TRUE); /* store v0 */ } - ppc_release_vec_register(gen->f, v0); + + release_src_vecs(gen); } static void emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst) { - int v0 = ppc_allocate_vec_register(gen->f); - int v1 = ppc_allocate_vec_register(gen->f); - int v2 = ppc_allocate_vec_register(gen->f); - uint chan_index; - FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) { - FETCH(gen, *inst, v0, 0, chan_index); /* v0 = srcreg[0] */ - FETCH(gen, *inst, v1, 1, chan_index); /* v1 = srcreg[1] */ + int zero_vec = -1; + uint chan; + + if (inst->Instruction.Opcode == TGSI_OPCODE_MUL) { + zero_vec = ppc_allocate_vec_register(gen->f); + ppc_vzero(gen->f, zero_vec); + } + + FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) { + /* fetch src operands */ + int v0 = get_src_vec(gen, inst, 0, chan); + int v1 = get_src_vec(gen, inst, 1, chan); + int v2 = get_dst_vec(gen, inst, chan); + + /* emit binop */ switch (inst->Instruction.Opcode) { case TGSI_OPCODE_ADD: ppc_vaddfp(gen->f, v2, v0, v1); @@ -426,8 +650,7 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst) ppc_vsubfp(gen->f, v2, v0, v1); break; case TGSI_OPCODE_MUL: - ppc_vxor(gen->f, v2, v2, v2); /* v2 = {0, 0, 0, 0} */ - ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v0 */ + ppc_vmaddfp(gen->f, v2, v0, v1, zero_vec); break; case TGSI_OPCODE_MIN: ppc_vminfp(gen->f, v2, v0, v1); @@ -438,11 +661,48 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst) default: assert(0); } - STORE(gen, *inst, v2, 0, chan_index); /* store v2 */ + + /* store v2 */ + emit_store(gen, v2, inst, chan, TRUE); } - ppc_release_vec_register(gen->f, v0); - ppc_release_vec_register(gen->f, v1); - ppc_release_vec_register(gen->f, v2); + + if (inst->Instruction.Opcode == TGSI_OPCODE_MUL) + ppc_release_vec_register(gen->f, zero_vec); + + release_src_vecs(gen); +} + + +static void +emit_triop(struct 
gen_context *gen, struct tgsi_full_instruction *inst) +{ + uint chan; + + FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) { + /* fetch src operands */ + int v0 = get_src_vec(gen, inst, 0, chan); + int v1 = get_src_vec(gen, inst, 1, chan); + int v2 = get_src_vec(gen, inst, 2, chan); + int v3 = get_dst_vec(gen, inst, chan); + + /* emit ALU */ + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_MAD: + ppc_vmaddfp(gen->f, v3, v0, v1, v2); /* v3 = v0 * v1 + v2 */ + break; + case TGSI_OPCODE_LRP: + ppc_vsubfp(gen->f, v3, v1, v2); /* v3 = v1 - v2 */ + ppc_vmaddfp(gen->f, v3, v0, v3, v2); /* v3 = v0 * v3 + v2 */ + break; + default: + assert(0); + } + + /* store v3 */ + emit_store(gen, v3, inst, chan, TRUE); + } + + release_src_vecs(gen); } @@ -452,16 +712,15 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst) static void emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst) { - int v0 = ppc_allocate_vec_register(gen->f); - int v1 = ppc_allocate_vec_register(gen->f); - int v2 = ppc_allocate_vec_register(gen->f); - uint chan_index; - boolean complement = FALSE; + uint chan; int one_vec = gen_one_vec(gen); - FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) { - FETCH(gen, *inst, v0, 0, chan_index); /* v0 = srcreg[0] */ - FETCH(gen, *inst, v1, 1, chan_index); /* v1 = srcreg[1] */ + FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) { + /* fetch src operands */ + int v0 = get_src_vec(gen, inst, 0, chan); + int v1 = get_src_vec(gen, inst, 1, chan); + int v2 = get_dst_vec(gen, inst, chan); + boolean complement = FALSE; switch (inst->Instruction.Opcode) { case TGSI_OPCODE_SNE: @@ -495,89 +754,56 @@ emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst) else ppc_vand(gen->f, v2, one_vec, v2); /* v2 = one_vec & v2 */ - STORE(gen, *inst, v2, 0, chan_index); /* store v2 */ + /* store v2 */ + emit_store(gen, v2, inst, chan, TRUE); } - ppc_release_vec_register(gen->f, v0); - ppc_release_vec_register(gen->f, v1); - ppc_release_vec_register(gen->f, v2); + release_src_vecs(gen); } static void emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst) { - int v0 = ppc_allocate_vec_register(gen->f); - int v1 = ppc_allocate_vec_register(gen->f); - int v2 = ppc_allocate_vec_register(gen->f); + int v0, v1, v2; uint chan_index; + v2 = ppc_allocate_vec_register(gen->f); + ppc_vxor(gen->f, v2, v2, v2); /* v2 = {0, 0, 0, 0} */ - FETCH(gen, *inst, v0, 0, CHAN_X); /* v0 = src0.XXXX */ - FETCH(gen, *inst, v1, 1, CHAN_X); /* v1 = src1.XXXX */ + v0 = get_src_vec(gen, inst, 0, CHAN_X); /* v0 = src0.XXXX */ + v1 = get_src_vec(gen, inst, 1, CHAN_X); /* v1 = src1.XXXX */ ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */ - FETCH(gen, *inst, v0, 0, CHAN_Y); /* v0 = src0.YYYY */ - FETCH(gen, *inst, v1, 1, CHAN_Y); /* v1 = src1.YYYY */ + v0 = get_src_vec(gen, inst, 0, CHAN_Y); /* v0 = src0.YYYY */ + v1 = get_src_vec(gen, inst, 1, CHAN_Y); /* v1 = src1.YYYY */ ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */ - FETCH(gen, *inst, v0, 0, CHAN_Z); /* v0 = src0.ZZZZ */ - FETCH(gen, *inst, v1, 1, CHAN_Z); /* v1 = src1.ZZZZ */ + v0 = get_src_vec(gen, inst, 0, CHAN_Z); /* v0 = src0.ZZZZ */ + v1 = get_src_vec(gen, inst, 1, CHAN_Z); /* v1 = src1.ZZZZ */ ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */ if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) { - FETCH(gen, *inst, v0, 0, CHAN_W); /* v0 = src0.WWWW */ - FETCH(gen, *inst, v1, 1, CHAN_W); /* v1 = src1.WWWW */ - ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */ + v0 = 
get_src_vec(gen, inst, 0, CHAN_W); /* v0 = src0.WWWW */ + v1 = get_src_vec(gen, inst, 1, CHAN_W); /* v1 = src1.WWWW */ + ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */ } else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) { - FETCH(gen, *inst, v1, 1, CHAN_W); /* v1 = src1.WWWW */ - ppc_vaddfp(gen->f, v2, v2, v1); /* v2 = v2 + v1 */ + v1 = get_src_vec(gen, inst, 1, CHAN_W); /* v1 = src1.WWWW */ + ppc_vaddfp(gen->f, v2, v2, v1); /* v2 = v2 + v1 */ } FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) { - STORE(gen, *inst, v2, 0, chan_index); /* store v2 */ + emit_store(gen, v2, inst, chan_index, FALSE); /* store v2, free v2 later */ } - ppc_release_vec_register(gen->f, v0); - ppc_release_vec_register(gen->f, v1); - ppc_release_vec_register(gen->f, v2); -} + release_src_vecs(gen); -static void -emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst) -{ - int v0 = ppc_allocate_vec_register(gen->f); - int v1 = ppc_allocate_vec_register(gen->f); - int v2 = ppc_allocate_vec_register(gen->f); - int v3 = ppc_allocate_vec_register(gen->f); - uint chan_index; - FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) { - FETCH(gen, *inst, v0, 0, chan_index); /* v0 = srcreg[0] */ - FETCH(gen, *inst, v1, 1, chan_index); /* v1 = srcreg[1] */ - FETCH(gen, *inst, v2, 2, chan_index); /* v2 = srcreg[2] */ - switch (inst->Instruction.Opcode) { - case TGSI_OPCODE_MAD: - ppc_vmaddfp(gen->f, v3, v0, v1, v2); /* v3 = v0 * v1 + v2 */ - break; - case TGSI_OPCODE_LRP: - ppc_vsubfp(gen->f, v3, v1, v2); /* v3 = v1 - v2 */ - ppc_vmaddfp(gen->f, v3, v0, v3, v2); /* v3 = v0 * v3 + v2 */ - break; - default: - assert(0); - } - STORE(gen, *inst, v3, 0, chan_index); /* store v3 */ - } - ppc_release_vec_register(gen->f, v0); - ppc_release_vec_register(gen->f, v1); ppc_release_vec_register(gen->f, v2); - ppc_release_vec_register(gen->f, v3); } - /** Approximation for vr = pow(va, vb) */ static void ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb) @@ -604,43 +830,42 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst) /* Compute X */ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) { - STORE(gen, *inst, one_vec, 0, CHAN_X); + emit_store(gen, one_vec, inst, CHAN_X, FALSE); } /* Compute Y, Z */ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) || IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { - int x_vec = ppc_allocate_vec_register(gen->f); + int x_vec; int zero_vec = ppc_allocate_vec_register(gen->f); - FETCH(gen, *inst, x_vec, 0, CHAN_X); /* x_vec = src[0].x */ + x_vec = get_src_vec(gen, inst, 0, CHAN_X); /* x_vec = src[0].x */ ppc_vzero(gen->f, zero_vec); /* zero = {0,0,0,0} */ ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { - STORE(gen, *inst, x_vec, 0, CHAN_Y); /* store Y */ + emit_store(gen, x_vec, inst, CHAN_Y, FALSE); } if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { - int y_vec = ppc_allocate_vec_register(gen->f); + int y_vec, w_vec; int z_vec = ppc_allocate_vec_register(gen->f); - int w_vec = ppc_allocate_vec_register(gen->f); int pow_vec = ppc_allocate_vec_register(gen->f); int pos_vec = ppc_allocate_vec_register(gen->f); int p128_vec = ppc_allocate_vec_register(gen->f); int n128_vec = ppc_allocate_vec_register(gen->f); - FETCH(gen, *inst, y_vec, 0, CHAN_Y); /* y_vec = src[0].y */ + y_vec = get_src_vec(gen, inst, 0, CHAN_Y); /* y_vec = src[0].y */ ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */ - FETCH(gen, *inst, w_vec, 0, CHAN_W); /* w_vec = src[0].w */ + w_vec = get_src_vec(gen, inst, 0, 
CHAN_W); /* w_vec = src[0].w */ - /* clamp Y to [-128, 128] */ + /* clamp W to [-128, 128] */ load_constant_vec(gen, p128_vec, 128.0f); load_constant_vec(gen, n128_vec, -128.0f); - ppc_vmaxfp(gen->f, y_vec, y_vec, n128_vec); /* y = max(y, -128) */ - ppc_vminfp(gen->f, y_vec, y_vec, p128_vec); /* y = min(y, 128) */ + ppc_vmaxfp(gen->f, w_vec, w_vec, n128_vec); /* w = max(w, -128) */ + ppc_vminfp(gen->f, w_vec, w_vec, p128_vec); /* w = min(w, 128) */ /* if temp.x > 0 * z = pow(tmp.y, tmp.w) @@ -651,34 +876,216 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst) ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 */ ppc_vand(gen->f, z_vec, pow_vec, pos_vec); /* z = pow & pos */ - STORE(gen, *inst, z_vec, 0, CHAN_Z); /* store Z */ + emit_store(gen, z_vec, inst, CHAN_Z, FALSE); - ppc_release_vec_register(gen->f, y_vec); ppc_release_vec_register(gen->f, z_vec); - ppc_release_vec_register(gen->f, w_vec); ppc_release_vec_register(gen->f, pow_vec); ppc_release_vec_register(gen->f, pos_vec); ppc_release_vec_register(gen->f, p128_vec); ppc_release_vec_register(gen->f, n128_vec); } - ppc_release_vec_register(gen->f, x_vec); ppc_release_vec_register(gen->f, zero_vec); } /* Compute W */ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) { - STORE(gen, *inst, one_vec, 0, CHAN_W); + emit_store(gen, one_vec, inst, CHAN_W, FALSE); + } + + release_src_vecs(gen); +} + + +static void +emit_exp(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + const int one_vec = gen_one_vec(gen); + int src_vec; + + /* get src arg */ + src_vec = get_src_vec(gen, inst, 0, CHAN_X); + + /* Compute X = 2^floor(src) */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) { + int dst_vec = get_dst_vec(gen, inst, CHAN_X); + int tmp_vec = ppc_allocate_vec_register(gen->f); + ppc_vrfim(gen->f, tmp_vec, src_vec); /* tmp = floor(src); */ + ppc_vexptefp(gen->f, dst_vec, tmp_vec); /* dst = 2 ^ tmp */ + emit_store(gen, dst_vec, inst, CHAN_X, TRUE); + ppc_release_vec_register(gen->f, tmp_vec); + } + + /* Compute Y = src - floor(src) */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { + int dst_vec = get_dst_vec(gen, inst, CHAN_Y); + int tmp_vec = ppc_allocate_vec_register(gen->f); + ppc_vrfim(gen->f, tmp_vec, src_vec); /* tmp = floor(src); */ + ppc_vsubfp(gen->f, dst_vec, src_vec, tmp_vec); /* dst = src - tmp */ + emit_store(gen, dst_vec, inst, CHAN_Y, TRUE); + ppc_release_vec_register(gen->f, tmp_vec); + } + + /* Compute Z = RoughApprox2ToX(src) */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { + int dst_vec = get_dst_vec(gen, inst, CHAN_Z); + ppc_vexptefp(gen->f, dst_vec, src_vec); /* dst = 2 ^ src */ + emit_store(gen, dst_vec, inst, CHAN_Z, TRUE); + } + + /* Compute W = 1.0 */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) { + emit_store(gen, one_vec, inst, CHAN_W, FALSE); + } + + release_src_vecs(gen); +} + + +static void +emit_log(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + const int bit31_vec = gen_get_bit31_vec(gen); + const int one_vec = gen_one_vec(gen); + int src_vec, abs_vec; + + /* get src arg */ + src_vec = get_src_vec(gen, inst, 0, CHAN_X); + + /* compute abs(src) */ + abs_vec = ppc_allocate_vec_register(gen->f); + ppc_vandc(gen->f, abs_vec, src_vec, bit31_vec); /* abs = src & ~bit31 */ + + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && + IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { + + /* compute tmp = floor(log2(abs)) */ + int tmp_vec = ppc_allocate_vec_register(gen->f); + ppc_vlogefp(gen->f, tmp_vec, abs_vec); /* tmp = log2(abs) */ + ppc_vrfim(gen->f, tmp_vec, tmp_vec); /* 
tmp = floor(tmp); */ + + /* Compute X = tmp */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) { + emit_store(gen, tmp_vec, inst, CHAN_X, FALSE); + } + + /* Compute Y = abs / 2^tmp */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { + const int zero_vec = ppc_allocate_vec_register(gen->f); + ppc_vzero(gen->f, zero_vec); + ppc_vexptefp(gen->f, tmp_vec, tmp_vec); /* tmp = 2 ^ tmp */ + ppc_vrefp(gen->f, tmp_vec, tmp_vec); /* tmp = 1 / tmp */ + /* tmp = abs * tmp + zero */ + ppc_vmaddfp(gen->f, tmp_vec, abs_vec, tmp_vec, zero_vec); + emit_store(gen, tmp_vec, inst, CHAN_Y, FALSE); + ppc_release_vec_register(gen->f, zero_vec); + } + + ppc_release_vec_register(gen->f, tmp_vec); + } + + /* Compute Z = RoughApproxLog2(abs) */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { + int dst_vec = get_dst_vec(gen, inst, CHAN_Z); + ppc_vlogefp(gen->f, dst_vec, abs_vec); /* dst = log2(abs) */ + emit_store(gen, dst_vec, inst, CHAN_Z, TRUE); + } + + /* Compute W = 1.0 */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) { + emit_store(gen, one_vec, inst, CHAN_W, FALSE); } + + ppc_release_vec_register(gen->f, abs_vec); + release_src_vecs(gen); +} + + +static void +emit_pow(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + int s0_vec = get_src_vec(gen, inst, 0, CHAN_X); + int s1_vec = get_src_vec(gen, inst, 1, CHAN_X); + int pow_vec = ppc_allocate_vec_register(gen->f); + int chan; + + ppc_vec_pow(gen->f, pow_vec, s0_vec, s1_vec); + + FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) { + emit_store(gen, pow_vec, inst, chan, FALSE); + } + + ppc_release_vec_register(gen->f, pow_vec); + + release_src_vecs(gen); } +static void +emit_xpd(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + int x0_vec, y0_vec, z0_vec; + int x1_vec, y1_vec, z1_vec; + int zero_vec, tmp_vec; + int tmp2_vec; + + zero_vec = ppc_allocate_vec_register(gen->f); + ppc_vzero(gen->f, zero_vec); + + tmp_vec = ppc_allocate_vec_register(gen->f); + tmp2_vec = ppc_allocate_vec_register(gen->f); + + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) || + IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { + x0_vec = get_src_vec(gen, inst, 0, CHAN_X); + x1_vec = get_src_vec(gen, inst, 1, CHAN_X); + } + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) || + IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { + y0_vec = get_src_vec(gen, inst, 0, CHAN_Y); + y1_vec = get_src_vec(gen, inst, 1, CHAN_Y); + } + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) || + IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { + z0_vec = get_src_vec(gen, inst, 0, CHAN_Z); + z1_vec = get_src_vec(gen, inst, 1, CHAN_Z); + } + + IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) { + /* tmp = y0 * z1 */ + ppc_vmaddfp(gen->f, tmp_vec, y0_vec, z1_vec, zero_vec); + /* tmp = tmp - z0 * y1*/ + ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, z0_vec, y1_vec); + emit_store(gen, tmp_vec, inst, CHAN_X, FALSE); + } + IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) { + /* tmp = z0 * x1 */ + ppc_vmaddfp(gen->f, tmp_vec, z0_vec, x1_vec, zero_vec); + /* tmp = tmp - x0 * z1 */ + ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, x0_vec, z1_vec); + emit_store(gen, tmp_vec, inst, CHAN_Y, FALSE); + } + IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) { + /* tmp = x0 * y1 */ + ppc_vmaddfp(gen->f, tmp_vec, x0_vec, y1_vec, zero_vec); + /* tmp = tmp - y0 * x1 */ + ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, y0_vec, x1_vec); + emit_store(gen, tmp_vec, inst, CHAN_Z, FALSE); + } + /* W is undefined */ + + ppc_release_vec_register(gen->f, tmp_vec); + ppc_release_vec_register(gen->f, zero_vec); + release_src_vecs(gen); +} + static int emit_instruction(struct gen_context *gen, struct 
tgsi_full_instruction *inst) { switch (inst->Instruction.Opcode) { case TGSI_OPCODE_MOV: + case TGSI_OPCODE_SWZ: case TGSI_OPCODE_ABS: case TGSI_OPCODE_FLOOR: case TGSI_OPCODE_FRAC: @@ -717,17 +1124,28 @@ emit_instruction(struct gen_context *gen, case TGSI_OPCODE_LIT: emit_lit(gen, inst); break; + case TGSI_OPCODE_LOG: + emit_log(gen, inst); + break; + case TGSI_OPCODE_EXP: + emit_exp(gen, inst); + break; + case TGSI_OPCODE_POW: + emit_pow(gen, inst); + break; + case TGSI_OPCODE_XPD: + emit_xpd(gen, inst); + break; case TGSI_OPCODE_END: /* normal end */ return 1; default: return 0; } - - return 1; } + static void emit_declaration( struct ppc_function *func, @@ -805,6 +1223,7 @@ emit_epilogue(struct ppc_function *func) { ppc_return(func); /* XXX restore prev stack frame */ + debug_printf("PPC: Emitted %u instructions\n", func->num_inst); } @@ -837,17 +1256,14 @@ tgsi_emit_ppc(const struct tgsi_token *tokens, if (!use_ppc_asm) return FALSE; + if (0) { + debug_printf("\n********* TGSI->PPC ********\n"); + tgsi_dump(tokens, 0); + } + util_init_math(); - gen.f = func; - gen.inputs_reg = ppc_reserve_register(func, 3); /* first function param */ - gen.outputs_reg = ppc_reserve_register(func, 4); /* second function param */ - gen.temps_reg = ppc_reserve_register(func, 5); /* ... */ - gen.immed_reg = ppc_reserve_register(func, 6); - gen.const_reg = ppc_reserve_register(func, 7); - gen.builtins_reg = ppc_reserve_register(func, 8); - gen.one_vec = -1; - gen.bit31_vec = -1; + init_gen_context(&gen, func); emit_prologue(func); @@ -878,19 +1294,14 @@ tgsi_emit_ppc(const struct tgsi_token *tokens, /* splat each immediate component into a float[4] vector for SoA */ { const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1; - float *imm = (float *) immediates; uint i; assert(size <= 4); assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES); for (i = 0; i < size; i++) { - const float value = - parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float; - imm[num_immediates * 4 + 0] = - imm[num_immediates * 4 + 1] = - imm[num_immediates * 4 + 2] = - imm[num_immediates * 4 + 3] = value; - num_immediates++; + immediates[num_immediates][i] = + parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float; } + num_immediates++; } break; @@ -904,6 +1315,14 @@ tgsi_emit_ppc(const struct tgsi_token *tokens, tgsi_parse_free( &parse ); + if (ppc_num_instructions(func) == 0) { + /* ran out of memory for instructions */ + ok = FALSE; + } + + if (!ok) + debug_printf("TGSI->PPC translation failed\n"); + return ok; } diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c index 11659247c0..bc7b941b78 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c +++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c @@ -153,17 +153,21 @@ check_register_usage( if (!check_file_name( ctx, file )) return FALSE; - if (index < 0 || index > MAX_REGISTERS) { - report_error( ctx, "%s[%i]: Invalid index %s", file_names[file], index, name ); - return FALSE; - } - if (indirect_access) { + /* Note that 'index' is an offset relative to the value of the + * address register. No range checking done here. 
+ */ if (!is_any_register_declared( ctx, file )) report_error( ctx, "%s: Undeclared %s register", file_names[file], name ); ctx->regs_ind_used[file] = TRUE; } else { + if (index < 0 || index > MAX_REGISTERS) { + report_error( ctx, "%s[%i]: Invalid index %s", + file_names[file], index, name ); + return FALSE; + } + if (!is_register_declared( ctx, file, index )) report_error( ctx, "%s[%d]: Undeclared %s register", file_names[file], index, name ); ctx->regs_used[file][index / BITS_IN_REG_FLAG] |= (1 << (index % BITS_IN_REG_FLAG)); diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c index f79170b9d6..f93db18114 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c @@ -27,12 +27,14 @@ #include "pipe/p_config.h" -#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE) +#if defined(PIPE_ARCH_X86) #include "pipe/p_debug.h" #include "pipe/p_shader_tokens.h" #include "util/u_math.h" +#if defined(PIPE_ARCH_SSE) #include "util/u_sse.h" +#endif #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "tgsi_exec.h" @@ -72,6 +74,9 @@ #define TEMP_R0 TGSI_EXEC_TEMP_R0 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR +#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I +#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C + /** * X86 utility functions. @@ -233,6 +238,9 @@ emit_const( int indirectIndex ) { if (indirect) { + /* 'vec' is the offset from the address register's value. + * We're loading CONST[ADDR+vec] into an xmm register. + */ struct x86_reg r0 = get_input_base(); struct x86_reg r1 = get_output_base(); uint i; @@ -243,18 +251,40 @@ emit_const( x86_push( func, r0 ); x86_push( func, r1 ); + /* + * Loop over the four pixels or vertices in the quad. + * Get the value of the address (offset) register for pixel/vertex[i], + * add it to the src offset and index into the constant buffer. + * Note that we're working on SOA data. + * If any of the pixel/vertex execution channels are unused their + * values will be garbage. It's very important that we don't use + * those garbage values as indexes into the constant buffer since + * that'll cause segfaults. + * The solution is to bitwise-AND the offset with the execution mask + * register whose values are either 0 or ~0. + * The caller must setup the execution mask register to indicate + * which channels are valid/alive before running the shader. + * The execution mask will also figure into loops and conditionals + * someday. + */ for (i = 0; i < QUAD_SIZE; i++) { - x86_lea( func, r0, get_const( vec, chan ) ); + /* r1 = address register[i] */ x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) ); + /* r0 = execution mask[i] */ + x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) ); + /* r1 = r1 & r0 */ + x86_and( func, r1, r0 ); + /* r0 = 'vec', the offset */ + x86_lea( func, r0, get_const( vec, chan ) ); - /* Quick hack to multiply by 16 -- need to add SHL to rtasm. + /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm. 
*/ x86_add( func, r1, r1 ); x86_add( func, r1, r1 ); x86_add( func, r1, r1 ); x86_add( func, r1, r1 ); - x86_add( func, r0, r1 ); + x86_add( func, r0, r1 ); /* r0 = r0 + r1 */ x86_mov( func, r1, x86_deref( r0 ) ); x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 ); } @@ -268,6 +298,7 @@ emit_const( get_temp( TEMP_R0, CHAN_X ) ); } else { + /* 'vec' is the index into the src register file, such as TEMP[vec] */ assert( vec >= 0 ); sse_movss( @@ -598,6 +629,9 @@ emit_func_call_dst_src( code ); } + +#if defined(PIPE_ARCH_SSE) + /* * Fast SSE2 implementation of special math functions. */ @@ -649,6 +683,7 @@ exp2f4(__m128 x) return _mm_mul_ps(expipart, expfpart); } + /** * See http://www.devmaster.net/forums/showthread.php?p=43580 */ @@ -691,12 +726,16 @@ log2f4(__m128 x) return _mm_add_ps(logmant, exp); } + static INLINE __m128 powf4(__m128 x, __m128 y) { return exp2f4(_mm_mul_ps(log2f4(x), y)); } +#endif /* PIPE_ARCH_SSE */ + + /** * Low-level instruction translators. @@ -751,13 +790,20 @@ emit_cos( } static void PIPE_CDECL -#if defined(PIPE_CC_GCC) +#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE) __attribute__((force_align_arg_pointer)) #endif ex24f( float *store ) { +#if defined(PIPE_ARCH_SSE) _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) )); +#else + store[0] = util_fast_exp2( store[0] ); + store[1] = util_fast_exp2( store[1] ); + store[2] = util_fast_exp2( store[2] ); + store[3] = util_fast_exp2( store[3] ); +#endif } static void @@ -784,6 +830,17 @@ emit_f2it( make_xmm( xmm ) ); } +static void +emit_i2f( + struct x86_function *func, + unsigned xmm ) +{ + sse2_cvtdq2ps( + func, + make_xmm( xmm ), + make_xmm( xmm ) ); +} + static void PIPE_CDECL flr4f( float *store ) @@ -831,13 +888,20 @@ emit_frc( } static void PIPE_CDECL -#if defined(PIPE_CC_GCC) +#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE) __attribute__((force_align_arg_pointer)) #endif lg24f( float *store ) { +#if defined(PIPE_ARCH_SSE) _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) )); +#else + store[0] = util_fast_log2( store[0] ); + store[1] = util_fast_log2( store[1] ); + store[2] = util_fast_log2( store[2] ); + store[3] = util_fast_log2( store[3] ); +#endif } static void @@ -890,19 +954,19 @@ emit_neg( } static void PIPE_CDECL -#if defined(PIPE_CC_GCC) +#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE) __attribute__((force_align_arg_pointer)) #endif pow4f( float *store ) { -#if 1 +#if defined(PIPE_ARCH_SSE) _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) )); #else - store[0] = powf( store[0], store[4] ); - store[1] = powf( store[1], store[5] ); - store[2] = powf( store[2], store[6] ); - store[3] = powf( store[3], store[7] ); + store[0] = util_fast_pow( store[0], store[4] ); + store[1] = util_fast_pow( store[1], store[5] ); + store[2] = util_fast_pow( store[2], store[6] ); + store[3] = util_fast_pow( store[3], store[7] ); #endif } @@ -1702,7 +1766,18 @@ emit_instruction( case TGSI_OPCODE_DOT2ADD: /* TGSI_OPCODE_DP2A */ - return 0; + FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */ + FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */ + emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */ + FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */ + FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */ + emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */ + emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */ + FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */ + emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */ + 
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */ + } break; case TGSI_OPCODE_INDEX: @@ -2036,7 +2111,39 @@ emit_instruction( break; case TGSI_OPCODE_NRM: - return 0; + /* fall-through */ + case TGSI_OPCODE_NRM4: + /* 3 or 4-component normalization */ + { + uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4; + /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */ + FETCH( func, *inst, 4, 0, CHAN_X ); /* xmm4 = src[0].x */ + FETCH( func, *inst, 5, 0, CHAN_Y ); /* xmm5 = src[0].y */ + FETCH( func, *inst, 6, 0, CHAN_Z ); /* xmm6 = src[0].z */ + if (dims == 4) { + FETCH( func, *inst, 7, 0, CHAN_W ); /* xmm7 = src[0].w */ + } + emit_MOV( func, 0, 4 ); /* xmm0 = xmm3 */ + emit_mul( func, 0, 4 ); /* xmm0 *= xmm3 */ + emit_MOV( func, 1, 5 ); /* xmm1 = xmm4 */ + emit_mul( func, 1, 5 ); /* xmm1 *= xmm4 */ + emit_add( func, 0, 1 ); /* xmm0 += xmm1 */ + emit_MOV( func, 1, 6 ); /* xmm1 = xmm5 */ + emit_mul( func, 1, 6 ); /* xmm1 *= xmm5 */ + emit_add( func, 0, 1 ); /* xmm0 += xmm1 */ + if (dims == 4) { + emit_MOV( func, 1, 7 ); /* xmm1 = xmm7 */ + emit_mul( func, 1, 7 ); /* xmm1 *= xmm7 */ + emit_add( func, 0, 0 ); /* xmm0 += xmm1 */ + } + emit_rsqrt( func, 1, 0 ); /* xmm1 = 1/sqrt(xmm0) */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + if (chan_index < dims) { + emit_mul( func, 4+chan_index, 1); /* xmm[4+ch] *= xmm1 */ + STORE( func, *inst, 4+chan_index, 0, chan_index ); + } + } + } break; case TGSI_OPCODE_DIV: @@ -2044,7 +2151,16 @@ emit_instruction( break; case TGSI_OPCODE_DP2: - return 0; + FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */ + FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */ + emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */ + FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */ + FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */ + emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */ + emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */ + } break; case TGSI_OPCODE_TXL: @@ -2104,7 +2220,12 @@ emit_instruction( break; case TGSI_OPCODE_TRUNC: - return 0; + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_f2it( func, 0 ); + emit_i2f( func, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } break; case TGSI_OPCODE_SHL: diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c index 3ed8bdfdf3..a1a51d7ef2 100644 --- a/src/gallium/auxiliary/util/p_debug.c +++ b/src/gallium/auxiliary/util/p_debug.c @@ -36,6 +36,13 @@ #include <windows.h> #include <winddi.h> +#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE) + +#include <stdio.h> +#include <stdlib.h> +#include <windows.h> +#include <types.h> + #elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) #ifndef WIN32_LEAN_AND_MEAN @@ -98,7 +105,35 @@ void _debug_vprintf(const char *format, va_list ap) OutputDebugStringA(buf); buf[0] = '\0'; } -#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE) || defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) +#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE) + wchar_t *wide_format; + long wide_str_len; + char buf[512]; + int ret; +#if (_WIN32_WCE < 600) + ret = vsprintf(buf, format, ap); + if(ret < 0){ + sprintf(buf, "Cant handle debug print!"); + ret = 25; + } +#else + ret = vsprintf_s(buf, 512, format, ap); + if(ret < 0){ + sprintf_s(buf, 512, "Cant handle debug print!"); + ret = 25; + } +#endif + buf[ret] = '\0'; + /* Format is 
ascii - needs to be converted to wchar_t for printing */ + wide_str_len = MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1, NULL, 0); + wide_format = (wchar_t *) malloc((wide_str_len+1) * sizeof(wchar_t)); + if (wide_format) { + MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1, + wide_format, wide_str_len); + NKDbgPrintfW(wide_format, wide_format); + free(wide_format); + } +#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) /* TODO */ #else /* !PIPE_SUBSYSTEM_WINDOWS */ vfprintf(stderr, format, ap); @@ -637,6 +672,7 @@ void debug_dump_surface_bmp(const char *filename, struct pipe_surface *surface) { +#ifndef PIPE_SUBSYSTEM_WINDOWS_MINIPORT struct util_stream *stream; unsigned surface_usage; struct bmp_file_header bmfh; @@ -703,6 +739,7 @@ error2: FREE(rgba); error1: ; +#endif } #endif diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h index be7303e550..d2eaa2e7f7 100644 --- a/src/gallium/auxiliary/util/u_math.h +++ b/src/gallium/auxiliary/util/u_math.h @@ -68,7 +68,7 @@ __inline double ceil(double val) return ceil_val; } -#ifndef PIPE_SUBSYSTEM_WINDOWS_CE +#ifndef PIPE_SUBSYSTEM_WINDOWS_CE_OGL __inline double floor(double val) { double floor_val; diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c index 0f51dd5977..45ce257b5e 100644 --- a/src/gallium/auxiliary/util/u_mm.c +++ b/src/gallium/auxiliary/util/u_mm.c @@ -31,7 +31,7 @@ void -mmDumpMemInfo(const struct mem_block *heap) +u_mmDumpMemInfo(const struct mem_block *heap) { debug_printf("Memory heap %p:\n", (void *)heap); if (heap == 0) { @@ -58,7 +58,7 @@ mmDumpMemInfo(const struct mem_block *heap) } struct mem_block * -mmInit(int ofs, int size) +u_mmInit(int ofs, int size) { struct mem_block *heap, *block; @@ -165,13 +165,17 @@ SliceBlock(struct mem_block *p, struct mem_block * -mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch) +u_mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch) { struct mem_block *p; const int mask = (1 << align2)-1; int startofs = 0; int endofs; + assert(size >= 0); + assert(align2 >= 0); + assert(align2 <= 12); /* sanity check, 2^12 (4KB) enough? 
*/ + if (!heap || align2 < 0 || size <= 0) return NULL; @@ -198,7 +202,7 @@ mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch) struct mem_block * -mmFindBlock(struct mem_block *heap, int start) +u_mmFindBlock(struct mem_block *heap, int start) { struct mem_block *p; @@ -237,7 +241,7 @@ Join2Blocks(struct mem_block *p) } int -mmFreeMem(struct mem_block *b) +u_mmFreeMem(struct mem_block *b) { if (!b) return 0; @@ -266,7 +270,7 @@ mmFreeMem(struct mem_block *b) void -mmDestroy(struct mem_block *heap) +u_mmDestroy(struct mem_block *heap) { struct mem_block *p; diff --git a/src/gallium/auxiliary/util/u_mm.h b/src/gallium/auxiliary/util/u_mm.h index b226b101cb..ce20e48763 100644 --- a/src/gallium/auxiliary/util/u_mm.h +++ b/src/gallium/auxiliary/util/u_mm.h @@ -49,7 +49,7 @@ struct mem_block { * input: total size in bytes * return: a heap pointer if OK, NULL if error */ -extern struct mem_block *mmInit(int ofs, int size); +extern struct mem_block *u_mmInit(int ofs, int size); /** * Allocate 'size' bytes with 2^align2 bytes alignment, @@ -61,7 +61,7 @@ extern struct mem_block *mmInit(int ofs, int size); * startSearch = linear offset from start of heap to begin search * return: pointer to the allocated block, 0 if error */ -extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2, +extern struct mem_block *u_mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch); /** @@ -69,23 +69,23 @@ extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2 * input: pointer to a block * return: 0 if OK, -1 if error */ -extern int mmFreeMem(struct mem_block *b); +extern int u_mmFreeMem(struct mem_block *b); /** * Free block starts at offset * input: pointer to a heap, start offset * return: pointer to a block */ -extern struct mem_block *mmFindBlock(struct mem_block *heap, int start); +extern struct mem_block *u_mmFindBlock(struct mem_block *heap, int start); /** * destroy MM */ -extern void mmDestroy(struct mem_block *mmInit); +extern void u_mmDestroy(struct mem_block *mmInit); /** * For debuging purpose. */ -extern void mmDumpMemInfo(const struct mem_block *mmInit); +extern void u_mmDumpMemInfo(const struct mem_block *mmInit); #endif diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c index f5619ef791..30f32413d7 100644 --- a/src/gallium/auxiliary/util/u_rect.c +++ b/src/gallium/auxiliary/util/u_rect.c @@ -222,7 +222,8 @@ util_surface_copy(struct pipe_context *pipe, w, h, src_map, do_flip ? -(int) src->stride : src->stride, - src_x, src_y); + src_x, + do_flip ? 
w - src_y : src_y); } pipe->screen->surface_unmap(pipe->screen, src); diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c index 853c503f4f..32f6b072a0 100644 --- a/src/gallium/auxiliary/util/u_tile.c +++ b/src/gallium/auxiliary/util/u_tile.c @@ -460,7 +460,7 @@ l8_put_tile_rgba(ubyte *dst, for (j = 0; j < w; j++, pRow += 4) { unsigned r; r = float_to_ubyte(pRow[0]); - *dst++ = r; + *dst++ = (ubyte) r; } p += src_stride; } @@ -504,7 +504,7 @@ a8_put_tile_rgba(ubyte *dst, for (j = 0; j < w; j++, pRow += 4) { unsigned a; a = float_to_ubyte(pRow[3]); - *dst++ = a; + *dst++ = (ubyte) a; } p += src_stride; } @@ -634,7 +634,7 @@ i8_put_tile_rgba(ubyte *dst, for (j = 0; j < w; j++, pRow += 4) { unsigned r; r = float_to_ubyte(pRow[0]); - *dst++ = r; + *dst++ = (ubyte) r; } p += src_stride; } @@ -769,6 +769,32 @@ z24s8_get_tile_rgba(const unsigned *src, } +/*** PIPE_FORMAT_Z32_FLOAT ***/ + +/** + * Return each Z value as four floats in [0,1]. + */ +static void +z32f_get_tile_rgba(const float *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = *src++; + } + p += dst_stride; + } +} + + /*** PIPE_FORMAT_YCBCR / PIPE_FORMAT_YCBCR_REV ***/ /** @@ -913,6 +939,9 @@ pipe_tile_raw_to_rgba(enum pipe_format format, case PIPE_FORMAT_Z24S8_UNORM: z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); break; + case PIPE_FORMAT_Z32_FLOAT: + z32f_get_tile_rgba((float *) src, w, h, dst, dst_stride); + break; case PIPE_FORMAT_YCBCR: ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, FALSE); break; diff --git a/src/gallium/auxiliary/util/u_time.c b/src/gallium/auxiliary/util/u_time.c index bf7d1d1c8d..57b80e5604 100644 --- a/src/gallium/auxiliary/util/u_time.c +++ b/src/gallium/auxiliary/util/u_time.c @@ -200,7 +200,7 @@ util_time_timeout(const struct util_time *start, } -#if defined(PIPE_SUBSYSYEM_WINDOWS_DISPLAY) +#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) void util_time_sleep(unsigned usecs) { LONGLONG start, curr, end; diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 23fb0b0831..87488ea2d7 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -122,7 +122,7 @@ #define CELL_DEBUG_CACHE (1 << 6) /** Max instructions for doing per-fragment operations */ -#define SPU_MAX_FRAGMENT_OPS_INSTS 64 +#define SPU_MAX_FRAGMENT_OPS_INSTS 128 diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c index 448b723d85..962775cd33 100644 --- a/src/gallium/drivers/cell/ppu/cell_batch.c +++ b/src/gallium/drivers/cell/ppu/cell_batch.c @@ -100,12 +100,23 @@ emit_fence(struct cell_context *cell) const uint batch = cell->cur_batch; const uint size = cell->buffer_size[batch]; struct cell_command_fence *fence_cmd; + struct cell_fence *fence = &cell->fenced_buffers[batch].fence; + uint i; + + /* set fence status to emitted, not yet signalled */ + for (i = 0; i < cell->num_spus; i++) { + fence->status[i][0] = CELL_FENCE_EMITTED; + } ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE); fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size); fence_cmd->opcode = CELL_CMD_FENCE; - fence_cmd->fence = &cell->fenced_buffers[batch].fence; + fence_cmd->fence = fence; + + /* update batch buffer size */ + cell->buffer_size[batch] = size + sizeof(struct cell_command_fence); + 
assert(sizeof(struct cell_command_fence) % 8 == 0); } @@ -119,7 +130,7 @@ cell_batch_flush(struct cell_context *cell) { static boolean flushing = FALSE; uint batch = cell->cur_batch; - const uint size = cell->buffer_size[batch]; + uint size = cell->buffer_size[batch]; uint spu, cmd_word; assert(!flushing); @@ -130,8 +141,10 @@ cell_batch_flush(struct cell_context *cell) /* Before we use this batch buffer, make sure any fenced texture buffers * are released. */ - if (cell->fenced_buffers[batch].head) + if (cell->fenced_buffers[batch].head) { emit_fence(cell); + size = cell->buffer_size[batch]; + } flushing = TRUE; diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c index c9c0c721bb..037635e466 100644 --- a/src/gallium/drivers/cell/ppu/cell_clear.c +++ b/src/gallium/drivers/cell/ppu/cell_clear.c @@ -106,4 +106,17 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps, clr->surface = surfIndex; clr->value = clearValue; } + + /* Technically, the surface's contents are now known and cleared, + * so we could set the status to PIPE_SURFACE_STATUS_CLEAR. But + * it turns out it's quite painful to recognize when any particular + * surface goes from PIPE_SURFACE_STATUS_CLEAR to + * PIPE_SURFACE_STATUS_DEFINED (i.e. with known contents), because + * the drawing commands could be operating on numerous draw buffers, + * which we'd have to iterate through to set all their stati... + * For now, we cheat a bit and set the surface's status to DEFINED + * right here. Later we should revisit this and set the status to + * CLEAR here, and find a better place to set the status to DEFINED. + */ + ps->status = PIPE_SURFACE_STATUS_DEFINED; } diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h index 4491ae8cdf..eb1397bb3f 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.h +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -89,7 +89,7 @@ struct cell_buffer_node; */ struct cell_buffer_list { - struct cell_fence fence; + struct cell_fence fence ALIGN16_ATTRIB; struct cell_buffer_node *head; }; diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c index ffb3bea12b..867b5dcaa0 100644 --- a/src/gallium/drivers/cell/ppu/cell_fence.c +++ b/src/gallium/drivers/cell/ppu/cell_fence.c @@ -38,6 +38,7 @@ void cell_fence_init(struct cell_fence *fence) { uint i; + ASSERT_ALIGN16(fence->status); for (i = 0; i < CELL_MAX_SPUS; i++) { fence->status[i][0] = CELL_FENCE_IDLE; } @@ -50,9 +51,9 @@ cell_fence_signalled(const struct cell_context *cell, { uint i; for (i = 0; i < cell->num_spus; i++) { - //ASSERT(fence->status[i][0] != CELL_FENCE_IDLE); - if (fence->status[i][0] == CELL_FENCE_EMITTED) + if (fence->status[i][0] != CELL_FENCE_SIGNALLED) return FALSE; + /*assert(fence->status[i][0] == CELL_FENCE_EMITTED);*/ } return TRUE; } @@ -65,6 +66,15 @@ cell_fence_finish(const struct cell_context *cell, while (!cell_fence_signalled(cell, fence)) { usleep(10); } + +#ifdef DEBUG + { + uint i; + for (i = 0; i < cell->num_spus; i++) { + assert(fence->status[i][0] == CELL_FENCE_SIGNALLED); + } + } +#endif } diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c index 6596b72010..a64967b4b9 100644 --- a/src/gallium/drivers/cell/ppu/cell_flush.c +++ b/src/gallium/drivers/cell/ppu/cell_flush.c @@ -49,7 +49,7 @@ cell_flush(struct pipe_context *pipe, unsigned flags, flags |= CELL_FLUSH_WAIT; } - if (flags & PIPE_FLUSH_SWAPBUFFERS) + if 
(flags & (PIPE_FLUSH_SWAPBUFFERS | PIPE_FLUSH_RENDER_CACHE)) flags |= CELL_FLUSH_WAIT; draw_flush( cell->draw ); diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c index d4d644d6e8..5c41b264ac 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -1303,60 +1303,91 @@ lookup_function(struct cell_context *cell, const char *funcname) /** * Emit code to call a SPU function. * Used to implement instructions like SIN/COS/POW/TEX/etc. + * If scalar, only the X components of the src regs are used, and the + * result is replicated across the dest register's XYZW components. */ static boolean emit_function_call(struct codegen *gen, const struct tgsi_full_instruction *inst, - char *funcname, uint num_args) + char *funcname, uint num_args, boolean scalar) { const uint addr = lookup_function(gen->cell, funcname); char comment[100]; - int ch; + int s_regs[3]; + int func_called = FALSE; + uint a, ch; + int retval_reg = -1; assert(num_args <= 3); snprintf(comment, sizeof(comment), "CALL %s:", funcname); spe_comment(gen->f, -4, comment); + if (scalar) { + for (a = 0; a < num_args; a++) { + s_regs[a] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[a]); + } + /* we'll call the function, put the return value in this register, + * then replicate it across all write-enabled components in d_reg. + */ + retval_reg = spe_allocate_available_register(gen->f); + } + for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s_regs[3], d_reg; + int d_reg; ubyte usedRegs[SPE_NUM_REGS]; - uint a, i, numUsed; + uint i, numUsed; - for (a = 0; a < num_args; a++) { - s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]); + if (!scalar) { + for (a = 0; a < num_args; a++) { + s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]); + } } - d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - numUsed = spe_get_registers_used(gen->f, usedRegs); - assert(numUsed < gen->frame_size / 16 - 2); + d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - /* save registers to stack */ - for (i = 0; i < numUsed; i++) { - uint reg = usedRegs[i]; - int offset = 2 + i; - spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); - } + if (!scalar || !func_called) { + /* for a scalar function, we'll really only call the function once */ - /* setup function arguments */ - for (a = 0; a < num_args; a++) { - spe_move(gen->f, 3 + a, s_regs[a]); - } + numUsed = spe_get_registers_used(gen->f, usedRegs); + assert(numUsed < gen->frame_size / 16 - 2); - /* branch to function, save return addr */ - spe_brasl(gen->f, SPE_REG_RA, addr); + /* save registers to stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + int offset = 2 + i; + spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } - /* save function's return value */ - spe_move(gen->f, d_reg, 3); + /* setup function arguments */ + for (a = 0; a < num_args; a++) { + spe_move(gen->f, 3 + a, s_regs[a]); + } - /* restore registers from stack */ - for (i = 0; i < numUsed; i++) { - uint reg = usedRegs[i]; - if (reg != d_reg) { - int offset = 2 + i; - spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); + /* branch to function, save return addr */ + spe_brasl(gen->f, SPE_REG_RA, addr); + + /* save function's return value */ + if (scalar) + spe_move(gen->f, retval_reg, 3); + else + spe_move(gen->f, d_reg, 3); + + /* restore registers from stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + if (reg != d_reg && reg != 
retval_reg) { + int offset = 2 + i; + spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } } + + func_called = TRUE; + } + + if (scalar) { + spe_move(gen->f, d_reg, retval_reg); } store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); @@ -1364,6 +1395,10 @@ emit_function_call(struct codegen *gen, } } + if (scalar) { + spe_release_register(gen->f, retval_reg); + } + return true; } @@ -1770,15 +1805,15 @@ emit_instruction(struct codegen *gen, return emit_END(gen); case TGSI_OPCODE_COS: - return emit_function_call(gen, inst, "spu_cos", 1); + return emit_function_call(gen, inst, "spu_cos", 1, TRUE); case TGSI_OPCODE_SIN: - return emit_function_call(gen, inst, "spu_sin", 1); + return emit_function_call(gen, inst, "spu_sin", 1, TRUE); case TGSI_OPCODE_POW: - return emit_function_call(gen, inst, "spu_pow", 2); + return emit_function_call(gen, inst, "spu_pow", 2, TRUE); case TGSI_OPCODE_EXPBASE2: - return emit_function_call(gen, inst, "spu_exp2", 1); + return emit_function_call(gen, inst, "spu_exp2", 1, TRUE); case TGSI_OPCODE_LOGBASE2: - return emit_function_call(gen, inst, "spu_log2", 1); + return emit_function_call(gen, inst, "spu_log2", 1, TRUE); case TGSI_OPCODE_TEX: /* fall-through for now */ case TGSI_OPCODE_TXD: diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 4e1e53ecdc..d9c3ff3f4d 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -1141,13 +1141,17 @@ gen_colormask(struct spe_function *f, * access to the Compare Immediate instructions where we don't in * gen_depth_test(), which is what makes us very different. * + * There's some added complexity if there's a non-trivial state->mask + * value; then stencil and reference both must be masked + * * The return value in the stencil_pass_reg is a bitmask of valid * fragments that also passed the stencil test. The bitmask of valid - * fragments that failed would be found in (mask_reg & ~stencil_pass_reg). + * fragments that failed would be found in (fragment_mask_reg & ~stencil_pass_reg). 
*/ static void gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state, - unsigned int mask_reg, unsigned int fbS_reg, + unsigned int stencil_max_value, + unsigned int fragment_mask_reg, unsigned int fbS_reg, unsigned int stencil_pass_reg) { /* Generate code that puts the set of passing fragments into the stencil_pass_reg @@ -1155,68 +1159,134 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state, */ switch (state->func) { case PIPE_FUNC_EQUAL: - /* stencil_pass = mask & (s == reference) */ - spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); - spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (s == reference) */ + spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */ + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } break; case PIPE_FUNC_NOTEQUAL: - /* stencil_pass = mask & ~(s == reference) */ - spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); - spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & ~(s == reference) */ + spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */ + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } break; case PIPE_FUNC_GREATER: - /* stencil_pass = mask & (s > reference) */ - spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); - spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (s > reference) */ + spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ((s&mask) > (reference&mask)) */ + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } break; - case PIPE_FUNC_LESS: { - /* stencil_pass = mask & (reference > s) */ - /* There's no convenient Compare Less Than Immediate instruction, so - * we'll have to do this one the harder way, by loading a register and - * comparing directly. Compare Logical Greater Than Word (clgt) - * treats its operands as unsigned - no sign extension. 
- */ - unsigned int tmp_reg = spe_allocate_available_register(f); - spe_load_uint(f, tmp_reg, state->ref_value); - spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); - spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg); - spe_release_register(f, tmp_reg); + case PIPE_FUNC_LESS: + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (reference > s) */ + /* There's no convenient Compare Less Than Immediate instruction, so + * we'll have to do this one the harder way, by loading a register and + * comparing directly. Compare Logical Greater Than Word (clgt) + * treats its operands as unsigned - no sign extension. + */ + unsigned int tmp_reg = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value); + spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + } + else { + /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */ + unsigned int tmp_reg = spe_allocate_available_register(f); + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->value_mask & state->ref_value); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + spe_release_register(f, tmp_masked_stencil); + } break; - } case PIPE_FUNC_LEQUAL: - /* stencil_pass = mask & (s <= reference) = mask & ~(s > reference) */ - spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); - spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (s <= reference) + * = fragment_mask & ~(s > reference) */ + spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */ + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } break; - case PIPE_FUNC_GEQUAL: { - /* stencil_pass = mask & (s >= reference) = mask & ~(reference > s) */ - /* As above, we have to do this by loading a register */ - unsigned int tmp_reg = spe_allocate_available_register(f); - spe_load_uint(f, tmp_reg, state->ref_value); - spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); - spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg); - spe_release_register(f, tmp_reg); + case PIPE_FUNC_GEQUAL: + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (s >= reference) ] + * = fragment_mask & ~(reference > s) */ + /* As above, we have to do this by loading a register */ + unsigned int tmp_reg = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value); + spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + } + else { + /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */ + unsigned int tmp_reg = spe_allocate_available_register(f); + unsigned int 
tmp_masked_stencil = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value & state->value_mask); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + spe_release_register(f, tmp_masked_stencil); + } break; - } case PIPE_FUNC_NEVER: - /* stencil_pass = mask & 0 = 0 */ + /* stencil_pass = fragment_mask & 0 = 0 */ spe_load_uint(f, stencil_pass_reg, 0); break; case PIPE_FUNC_ALWAYS: - /* stencil_pass = mask & 1 = mask */ - spe_move(f, stencil_pass_reg, mask_reg); + /* stencil_pass = fragment_mask & 1 = fragment_mask */ + spe_move(f, stencil_pass_reg, fragment_mask_reg); break; } /* The fragments that passed the stencil test are now in stencil_pass_reg. - * The fragments that failed would be (mask_reg & ~stencil_pass_reg). + * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg). */ } @@ -1282,7 +1352,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op, /* Add Word Immediate computes rT = rA + 10-bit signed immediate */ spe_ai(f, newS_reg, fbS_reg, 1); /* Select from the current value or the new value based on the equality test */ - spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg); + spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg); spe_release_register(f, equals_reg); break; @@ -1295,7 +1365,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op, /* Add Word Immediate with a (-1) value works */ spe_ai(f, newS_reg, fbS_reg, -1); /* Select from the current value or the new value based on the equality test */ - spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg); + spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg); spe_release_register(f, equals_reg); break; @@ -1534,15 +1604,28 @@ gen_stencil_depth_test(struct spe_function *f, * meaning that we have to calculate the stencil values but do not * need to mask them), we can avoid generating code. Don't forget * that we need to consider backfacing stencil, if enabled. + * + * Note that if the backface stencil is *not* enabled, the backface + * stencil will have the same values as the frontface stencil. */ - if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) { - /* Trivial: don't need to calculate stencil values, and don't need to - * write them back to the framebuffer. + if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP && + dsa->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP && + dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP && + dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP && + dsa->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP && + dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) { + /* No changes to any stencil values */ + need_to_calculate_stencil_values = false; + need_to_writemask_stencil_values = false; + } + else if (dsa->stencil[0].write_mask == 0x0 && dsa->stencil[1].write_mask == 0x0) { + /* All changes are writemasked out, so no need to calculate + * what those changes might be, and no need to write anything back. */ need_to_calculate_stencil_values = false; need_to_writemask_stencil_values = false; } - else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) { + else if (dsa->stencil[0].write_mask == 0xff && dsa->stencil[1].write_mask == 0xff) { /* Still trivial, but a little less so. 
We need to write the stencil * values, but we don't need to mask them. */ @@ -1583,7 +1666,7 @@ gen_stencil_depth_test(struct spe_function *f, */ spe_comment(f, 0, "Running basic stencil test"); stencil_pass_reg = spe_allocate_available_register(f); - gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg); + gen_stencil_test(f, &dsa->stencil[0], 0xff, mask_reg, fbS_reg, stencil_pass_reg); /* If two-sided stenciling is on, generate code to run the stencil * test on the backfacing stencil as well, and combine the two results @@ -1592,7 +1675,7 @@ gen_stencil_depth_test(struct spe_function *f, if (dsa->stencil[1].enabled) { unsigned int temp_reg = spe_allocate_available_register(f); spe_comment(f, 0, "Running backface stencil test"); - gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg); + gen_stencil_test(f, &dsa->stencil[1], 0xff, mask_reg, fbS_reg, temp_reg); spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg); spe_release_register(f, temp_reg); } @@ -1914,81 +1997,79 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) * Z and/or stencil. We'll also convert the incoming fragment Z * value in fragZ_reg from a floating point value in [0.0..1.0] to * an unsigned integer value with the appropriate resolution. + * Note that even if depth or stencil is *not* enabled, if it's + * present in the buffer, we pull it out and put it back later; + * otherwise, we can inadvertently destroy the contents of + * buffers we're not supposed to touch (e.g., if the user is + * clearing the depth buffer but not the stencil buffer, a + * quad of constant depth is drawn over the surface; the stencil + * buffer must be maintained). */ switch(zs_format) { case PIPE_FORMAT_S8Z24_UNORM: /* fall through */ case PIPE_FORMAT_X8Z24_UNORM: - if (dsa->depth.enabled) { - /* We need the Z part at least */ - setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); - /* four 24-bit Z values in the low-order bits */ - spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 24-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - spe_rotmi(f, fragZ_reg, fragZ_reg, -8); - } - if (dsa->stencil[0].enabled) { - setup_optional_register(f, &fbS_reg_set, &fbS_reg); - /* four 8-bit Z values in the high-order bits */ - spe_rotmi(f, fbS_reg, fbZS_reg, -24); - } - break; + /* Pull out both Z and stencil */ + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + setup_optional_register(f, &fbS_reg_set, &fbS_reg); + + /* four 24-bit Z values in the low-order bits */ + spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 24-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + + /* four 8-bit stencil values in the high-order bits */ + spe_rotmi(f, fbS_reg, fbZS_reg, -24); + break; case PIPE_FORMAT_Z24S8_UNORM: /* fall through */ case PIPE_FORMAT_Z24X8_UNORM: - if (dsa->depth.enabled) { - setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); - /* shift by 8 to get the upper 24-bit values */ - spe_rotmi(f, fbS_reg, fbZS_reg, -8); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 24-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - spe_rotmi(f, fragZ_reg, fragZ_reg, -8); - } - if (dsa->stencil[0].enabled) { - setup_optional_register(f, &fbS_reg_set, &fbS_reg); - /* 8-bit stencil in the low-order bits - mask them out */ - 
spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff); - } - break; + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + setup_optional_register(f, &fbS_reg_set, &fbS_reg); + + /* shift by 8 to get the upper 24-bit values */ + spe_rotmi(f, fbS_reg, fbZS_reg, -8); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 24-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + + /* 8-bit stencil in the low-order bits - mask them out */ + spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff); + break; case PIPE_FORMAT_Z32_UNORM: - if (dsa->depth.enabled) { - setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); - /* Copy over 4 32-bit values */ - spe_move(f, fbZ_reg, fbZS_reg); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 32-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - } + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + /* Copy over 4 32-bit values */ + spe_move(f, fbZ_reg, fbZS_reg); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 32-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); /* No stencil, so can't do anything there */ - break; + break; case PIPE_FORMAT_Z16_UNORM: - if (dsa->depth.enabled) { - /* XXX Not sure this is correct, but it was here before, so we're - * going with it for now - */ - setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); - /* Copy over 4 32-bit values */ - spe_move(f, fbZ_reg, fbZS_reg); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 16-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - spe_rotmi(f, fragZ_reg, fragZ_reg, -16); - } + /* XXX Not sure this is correct, but it was here before, so we're + * going with it for now + */ + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + /* Copy over 4 32-bit values */ + spe_move(f, fbZ_reg, fbZS_reg); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 16-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -16); /* No stencil */ - break; default: ASSERT(0); /* invalid format */ @@ -2035,39 +2116,19 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_comment(f, 0, "Store quad's depth/stencil values in tile"); if (zs_format == PIPE_FORMAT_S8Z24_UNORM || zs_format == PIPE_FORMAT_X8Z24_UNORM) { - if (fbS_reg_set && fbZ_reg_set) { - spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ - spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ - } - else if (fbS_reg_set) { - spe_shli(f, fbZS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ - } - else { - spe_move(f, fbZS_reg, fbZ_reg); - } + spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ } else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || zs_format == PIPE_FORMAT_Z24X8_UNORM) { - if (fbS_reg_set && fbZ_reg_set) { - spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ - spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ - } - else if (fbS_reg_set) { - spe_move(f, fbZS_reg, fbS_reg); - } - else { - spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ - } + spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ } else if (zs_format == PIPE_FORMAT_Z32_UNORM) { - if (fbZ_reg_set) { - spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ - } + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ } else if (zs_format == 
PIPE_FORMAT_Z16_UNORM) { - if (fbZ_reg_set) { - spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ - } + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ } else if (zs_format == PIPE_FORMAT_S8_UNORM) { ASSERT(0); /* XXX to do */ @@ -2080,6 +2141,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); } + /* Don't need these any more */ release_optional_register(f, &fbZ_reg_set, fbZ_reg); release_optional_register(f, &fbS_reg_set, fbS_reg); } diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h index b633880c25..c93958a9ed 100644 --- a/src/gallium/drivers/cell/ppu/cell_spu.h +++ b/src/gallium/drivers/cell/ppu/cell_spu.h @@ -30,7 +30,6 @@ #include <libspe2.h> -#include <libmisc.h> #include <pthread.h> #include "cell/common.h" diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c index 9ac2f3bbb9..ae88d06912 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.c +++ b/src/gallium/drivers/cell/ppu/cell_texture.c @@ -28,6 +28,7 @@ * Authors: * Keith Whitwell <keith@tungstengraphics.com> * Michel Dänzer <michel@tungstengraphics.com> + * Brian Paul */ #include "pipe/p_context.h" @@ -42,10 +43,9 @@ #include "cell_texture.h" -/* Simple, maximally packed layout. - */ -static unsigned minify( unsigned d ) +static unsigned +minify(unsigned d) { return MAX2(1, d>>1); } @@ -212,6 +212,89 @@ twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, /** + * For Cell. Basically, rearrange the pixels/quads from this layout: + * +--+--+--+--+ + * |p0|p1|p2|p3|.... + * +--+--+--+--+ + * + * to this layout: + * +--+--+ + * |p0|p1|.... + * +--+--+ + * |p2|p3| + * +--+--+ + */ +static void +twiddle_tile(const uint *tileIn, uint *tileOut) +{ + int y, x; + + for (y = 0; y < TILE_SIZE; y+=2) { + for (x = 0; x < TILE_SIZE; x+=2) { + int k = 4 * (y/2 * TILE_SIZE/2 + x/2); + tileOut[y * TILE_SIZE + (x + 0)] = tileIn[k]; + tileOut[y * TILE_SIZE + (x + 1)] = tileIn[k+1]; + tileOut[(y + 1) * TILE_SIZE + (x + 0)] = tileIn[k+2]; + tileOut[(y + 1) * TILE_SIZE + (x + 1)] = tileIn[k+3]; + } + } +} + + +/** + * Convert image from tiled layout to linear layout. 4-byte pixels. + */ +static void +untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, + uint dst_stride, const uint *src) +{ + const uint tile_size2 = tile_size * tile_size; + const uint h_t = (h + tile_size - 1) / tile_size; + const uint w_t = (w + tile_size - 1) / tile_size; + uint *tile_buf; + uint it, jt; /* tile counters */ + uint i, j; /* intra-tile counters */ + + dst_stride /= 4; /* convert from bytes to pixels */ + + tile_buf = align_malloc(tile_size * tile_size * 4, 16); + + /* loop over src tiles */ + for (it = 0; it < h_t; it++) { + for (jt = 0; jt < w_t; jt++) { + /* start of src tile: */ + const uint *tsrc = src + (it * w_t + jt) * tile_size2; + + twiddle_tile(tsrc, tile_buf); + tsrc = tile_buf; + + /* compute size of this tile (may be smaller than tile_size) */ + /* XXX note: a compiler bug was found here. That's why the code + * looks as it does. 
+ */ + uint tile_width = w - jt * tile_size; + tile_width = MIN2(tile_width, tile_size); + uint tile_height = h - it * tile_size; + tile_height = MIN2(tile_height, tile_size); + + /* loop over texels in the tile */ + for (i = 0; i < tile_height; i++) { + for (j = 0; j < tile_width; j++) { + uint dsti = it * tile_size + i; + uint dstj = jt * tile_size + j; + ASSERT(dsti < h); + ASSERT(dstj < w); + dst[dsti * dst_stride + dstj] = tsrc[i * tile_size + j]; + } + } + } + } + + align_free(tile_buf); +} + + +/** * Convert linear texture image data to tiled format for SPU usage. */ static void @@ -230,6 +313,7 @@ cell_twiddle_texture(struct pipe_screen *screen, switch (ct->base.format) { case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_B8G8R8A8_UNORM: { int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1; int offset = bufWidth * bufHeight * 4 * surface->face; @@ -261,6 +345,51 @@ cell_twiddle_texture(struct pipe_screen *screen, } +/** + * Convert SPU tiled texture image data to linear format for app usage. + */ +static void +cell_untwiddle_texture(struct pipe_screen *screen, + struct pipe_surface *surface) +{ + struct cell_texture *ct = cell_texture(surface->texture); + const uint level = surface->level; + const uint texWidth = ct->base.width[level]; + const uint texHeight = ct->base.height[level]; + const void *map = pipe_buffer_map(screen, surface->buffer, + PIPE_BUFFER_USAGE_CPU_READ); + const uint *src = (const uint *) ((const ubyte *) map + surface->offset); + + switch (ct->base.format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_B8G8R8A8_UNORM: + { + int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1; + int offset = surface->stride * texHeight * 4 * surface->face; + uint *dst; + + if (!ct->untiled_data[level]) { + ct->untiled_data[level] = + align_malloc(surface->stride * texHeight * 4 * numFaces, 16); + } + + dst = (uint *) ((ubyte *) ct->untiled_data[level] + offset); + + untwiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst, + surface->stride, src); + } + break; + default: + { + ct->untiled_data[level] = NULL; + printf("Cell: untwiddle unsupported texture format\n"); + } + } + + pipe_buffer_unmap(screen, surface->buffer); +} + + static struct pipe_surface * cell_get_tex_surface(struct pipe_screen *screen, struct pipe_texture *pt, @@ -294,13 +423,18 @@ cell_get_tex_surface(struct pipe_screen *screen, ps->zslice = zslice; if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) { - ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) * - ps->nblocksy * - ps->stride; + ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? 
face : zslice) * + ps->nblocksy * + ps->stride; } else { - assert(face == 0); - assert(zslice == 0); + assert(face == 0); + assert(zslice == 0); + } + + if (ps->usage & PIPE_BUFFER_USAGE_CPU_READ) { + /* convert from tiled to linear layout */ + cell_untwiddle_texture(screen, ps); } } return ps; @@ -311,6 +445,15 @@ static void cell_tex_surface_release(struct pipe_screen *screen, struct pipe_surface **s) { + struct cell_texture *ct = cell_texture((*s)->texture); + const uint level = (*s)->level; + + if (((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) + { + align_free(ct->untiled_data[level]); + ct->untiled_data[level] = NULL; + } + /* XXX if done rendering to teximage, re-tile */ pipe_texture_reference(&(*s)->texture, NULL); @@ -325,6 +468,10 @@ cell_surface_map(struct pipe_screen *screen, unsigned flags) { ubyte *map; + struct cell_texture *ct = cell_texture(surface->texture); + const uint level = surface->level; + + assert(ct); if (flags & ~surface->usage) { assert(0); @@ -335,7 +482,14 @@ cell_surface_map(struct pipe_screen *screen, if (map == NULL) return NULL; else - return (void *) (map + surface->offset); + { + if ((surface->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) { + return (void *) ((ubyte *) ct->untiled_data[level] + surface->offset); + } + else { + return (void *) (map + surface->offset); + } + } } diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h index 2f5fe0dd1b..7018b0c9bf 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.h +++ b/src/gallium/drivers/cell/ppu/cell_texture.h @@ -52,6 +52,7 @@ struct cell_texture struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS]; /** Mapped, tiled texture data */ void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS]; + void *untiled_data[CELL_MAX_TEXTURE_LEVELS]; }; diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index a6ed29ea63..d726622d94 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -107,7 +107,7 @@ cmd_fence(struct cell_command_fence *fence_cmd) CELL_FENCE_SIGNALLED}; uint *dst = (uint *) fence_cmd->fence; dst += 4 * spu.init.id; /* main store/memory address, not local store */ - + ASSERT_ALIGN16(dst); mfc_put((void *) &status, /* src in local memory */ (unsigned int) dst, /* dst in main memory */ sizeof(status), /* size */ @@ -244,8 +244,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) } } - spu.read_depth = spu.depth_stencil_alpha.depth.enabled; - spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled; + spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled); } diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c index 3534b35000..ff3d609d25 100644 --- a/src/gallium/drivers/cell/spu/spu_funcs.c +++ b/src/gallium/drivers/cell/spu/spu_funcs.c @@ -38,7 +38,9 @@ #include <math.h> #include <cos14_v.h> #include <sin14_v.h> -#include <transpose_matrix4x4.h> +#include <simdmath/exp2f4.h> +#include <simdmath/log2f4.h> +#include <simdmath/powf4.h> #include "cell/common.h" #include "spu_main.h" @@ -68,37 +70,19 @@ spu_sin(vector float x) static vector float spu_pow(vector float x, vector float y) { - float z0 = powf(spu_extract(x,0), spu_extract(y,0)); - float z1 = powf(spu_extract(x,1), spu_extract(y,1)); - float z2 = powf(spu_extract(x,2), spu_extract(y,2)); - float z3 = powf(spu_extract(x,3), 
spu_extract(y,3)); - return (vector float) {z0, z1, z2, z3}; + return _powf4(x, y); } static vector float spu_exp2(vector float x) { - float z0 = powf(2.0f, spu_extract(x,0)); - float z1 = powf(2.0f, spu_extract(x,1)); - float z2 = powf(2.0f, spu_extract(x,2)); - float z3 = powf(2.0f, spu_extract(x,3)); - return (vector float) {z0, z1, z2, z3}; + return _exp2f4(x); } static vector float spu_log2(vector float x) { - /* - * log_base_2(x) = log(x) / log(2) - * 1.442695 = 1/log(2). - */ - static const vector float k = {1.442695F, 1.442695F, 1.442695F, 1.442695F}; - float z0 = logf(spu_extract(x,0)); - float z1 = logf(spu_extract(x,1)); - float z2 = logf(spu_extract(x,2)); - float z3 = logf(spu_extract(x,3)); - vector float v = (vector float) {z0, z1, z2, z3}; - return spu_mul(v, k); + return _log2f4(x); } diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 668af10be2..692790c9f3 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -160,8 +160,7 @@ struct spu_global tile_t ztile ALIGN16_ATTRIB; /** Read depth/stencil tiles? */ - boolean read_depth; - boolean read_stencil; + boolean read_depth_stencil; /** Current tiles' status */ ubyte cur_ctile_status, cur_ztile_status; diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c index 5515bb55c9..7c225e2f27 100644 --- a/src/gallium/drivers/cell/spu/spu_render.c +++ b/src/gallium/drivers/cell/spu/spu_render.c @@ -98,7 +98,7 @@ my_tile(uint tx, uint ty) static INLINE void get_cz_tiles(uint tx, uint ty) { - if (spu.read_depth) { + if (spu.read_depth_stencil) { if (spu.cur_ztile_status != TILE_STATUS_CLEAR) { //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty); get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1); @@ -153,7 +153,7 @@ static INLINE void wait_put_cz_tiles(void) { wait_on_mask(1 << TAG_WRITE_TILE_COLOR); - if (spu.read_depth) { + if (spu.read_depth_stencil) { wait_on_mask(1 << TAG_WRITE_TILE_Z); } } diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index 4caf7d6b61..5f908159bb 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -369,7 +369,7 @@ flush_spans(void) } ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED); - if (spu.read_depth) { + if (spu.read_depth_stencil) { if (spu.cur_ztile_status == TILE_STATUS_GETTING) { /* wait for mfc_get() to complete */ //printf("SPU: %u: waiting for ztile\n", spu.init.id); diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c index 0111469405..31908a517b 100644 --- a/src/gallium/drivers/softpipe/sp_fs_sse.c +++ b/src/gallium/drivers/softpipe/sp_fs_sse.c @@ -40,7 +40,7 @@ #include "tgsi/tgsi_sse2.h" -#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE) +#if defined(PIPE_ARCH_X86) #include "rtasm/rtasm_x86sse.h" @@ -92,7 +92,8 @@ fs_sse_run( const struct sp_fragment_shader *base, machine->Temps); /* init kill mask */ - machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] = 0x0; + tgsi_set_kill_mask(machine, 0x0); + tgsi_set_exec_mask(machine, 1, 1, 1, 1); shader->func( machine->Inputs, machine->Outputs, diff --git a/src/gallium/drivers/softpipe/sp_quad_output.c b/src/gallium/drivers/softpipe/sp_quad_output.c index d05e12d1d9..b7aac7f84a 100644 --- a/src/gallium/drivers/softpipe/sp_quad_output.c +++ b/src/gallium/drivers/softpipe/sp_quad_output.c @@ -64,6 +64,14 @@ output_quad(struct quad_stage *qs, 
struct quad_header *quad) for (i = 0; i < 4; i++) { /* loop over color chans */ tile->data.color[y][x][i] = quadColor[i][j]; } + if (0) { + debug_printf("sp write pixel %d,%d: %g, %g, %g\n", + quad->input.x0 + x, + quad->input.y0 + y, + quadColor[0][j], + quadColor[1][j], + quadColor[2][j]); + } } } } diff --git a/src/gallium/include/pipe/p_debug.h b/src/gallium/include/pipe/p_debug.h index cb6196aa9f..3b00fb9aa8 100644 --- a/src/gallium/include/pipe/p_debug.h +++ b/src/gallium/include/pipe/p_debug.h @@ -49,7 +49,7 @@ extern "C" { #endif -#ifdef DBG +#if defined(DBG) || defined(DEBUG) #ifndef DEBUG #define DEBUG 1 #endif diff --git a/src/gallium/include/pipe/p_inlines.h b/src/gallium/include/pipe/p_inlines.h index d70de8e301..5e79b7f485 100644 --- a/src/gallium/include/pipe/p_inlines.h +++ b/src/gallium/include/pipe/p_inlines.h @@ -82,11 +82,14 @@ static INLINE void pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf) { /* bump the refcount first */ - if (surf) + if (surf) { + assert(surf->refcount); surf->refcount++; + } if (*ptr) { - + assert((*ptr)->refcount); + /* There are currently two sorts of surfaces... This needs to be * fixed so that all surfaces are views into a texture. */ @@ -113,11 +116,16 @@ winsys_buffer_reference(struct pipe_winsys *winsys, struct pipe_buffer **ptr, struct pipe_buffer *buf) { - if (buf) + if (buf) { + assert(buf->refcount); buf->refcount++; + } - if (*ptr && --(*ptr)->refcount == 0) - winsys->buffer_destroy( winsys, *ptr ); + if (*ptr) { + assert((*ptr)->refcount); + if(--(*ptr)->refcount == 0) + winsys->buffer_destroy( winsys, *ptr ); + } *ptr = buf; } @@ -133,12 +141,15 @@ pipe_texture_reference(struct pipe_texture **ptr, { assert(ptr); - if (pt) + if (pt) { + assert(pt->refcount); pt->refcount++; + } if (*ptr) { struct pipe_screen *screen = (*ptr)->screen; assert(screen); + assert((*ptr)->refcount); screen->texture_release(screen, ptr); assert(!*ptr); @@ -154,6 +165,7 @@ pipe_texture_release(struct pipe_texture **ptr) struct pipe_screen *screen; assert(ptr); screen = (*ptr)->screen; + assert((*ptr)->refcount); screen->texture_release(screen, ptr); *ptr = NULL; } @@ -176,12 +188,6 @@ pipe_user_buffer_create( struct pipe_screen *screen, void *ptr, unsigned size ) return screen->winsys->user_buffer_create(screen->winsys, ptr, size); } -static INLINE void -pipe_buffer_destroy( struct pipe_screen *screen, struct pipe_buffer *buf ) -{ - screen->winsys->buffer_destroy(screen->winsys, buf); -} - static INLINE void * pipe_buffer_map(struct pipe_screen *screen, struct pipe_buffer *buf, |
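
The depth/stencil rework in cell_gen_fragment_function above always unpacks both the Z and stencil parts of a combined buffer and repacks them on store, so a disabled test can no longer clobber the other channel. The following scalar C sketch of that per-pixel packing math is illustrative only; the helper names are not part of the patch, and it mirrors (approximately, ignoring exact SPE rounding) the spe_and_uint/spe_rotmi/spe_cfltu/spe_shli/spe_or sequences shown in the diff.

#include <stdint.h>

/* PIPE_FORMAT_S8Z24_UNORM: stencil in the top 8 bits, depth in the low 24. */
static void
unpack_s8z24(uint32_t zs, uint32_t *z24, uint32_t *s8)
{
   *z24 = zs & 0x00ffffff;   /* like spe_and_uint(fbZ, fbZS, 0x00ffffff) */
   *s8  = zs >> 24;          /* like spe_rotmi(fbS, fbZS, -24) */
}

static uint32_t
pack_s8z24(uint32_t z24, uint32_t s8)
{
   /* like spe_shli(fbS, fbS, 24) followed by spe_or(fbZS, fbS, fbZ) */
   return ((s8 & 0xff) << 24) | (z24 & 0x00ffffff);
}

/* PIPE_FORMAT_Z24S8_UNORM: depth in the top 24 bits, stencil in the low 8. */
static uint32_t
pack_z24s8(uint32_t z24, uint32_t s8)
{
   /* like spe_shli(fbZ, fbZ, 8) followed by spe_or(fbZS, fbS, fbZ) */
   return (z24 << 8) | (s8 & 0xff);
}

/* Fragment Z arrives as a float in [0.0, 1.0]; the SPE path converts it with
 * spe_cfltu (scale by 2^32) and a right shift.  Scalar equivalent: */
static uint32_t
fragz_to_z24(float fragz)
{
   return (uint32_t) (fragz * (double) 0xffffff);
}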
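
The twiddle_tile() and untwiddle_image_uint() helpers added to cell_texture.c convert one tile from the SPU's 2x2-quad-packed layout back to row-major order. The sketch below restates that same mapping per pixel; it is a hypothetical helper under the assumption that TILE_SIZE is the driver's tile dimension (32 here for illustration) and that quads are stored in the p0 p1 / p2 p3 order shown in the comment above.

#include <stdint.h>

#define TILE_SIZE 32   /* assumption: stands in for the driver's TILE_SIZE */

/* Index of pixel (x, y) within one quad-packed (twiddled) tile: pixels are
 * stored as consecutive 2x2 quads, each quad in p0 p1 / p2 p3 order. */
static unsigned
twiddled_index(unsigned x, unsigned y)
{
   unsigned quad = (y / 2) * (TILE_SIZE / 2) + (x / 2);
   return quad * 4 + (y & 1) * 2 + (x & 1);
}

/* Per-pixel restatement of twiddle_tile() as used by untwiddle_image_uint():
 * read the quad-packed SPU layout, write row-major (linear) order. */
static void
untwiddle_tile(const uint32_t *in, uint32_t *out)
{
   unsigned x, y;
   for (y = 0; y < TILE_SIZE; y++)
      for (x = 0; x < TILE_SIZE; x++)
         out[y * TILE_SIZE + x] = in[twiddled_index(x, y)];
}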
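
The p_inlines.h changes all follow one pattern: each reference-counting helper now asserts that the object it is about to reference or release still has a non-zero refcount, so debug builds catch references to already-destroyed objects. A generic sketch of that pattern follows; the object type and destroy_fn callback are hypothetical, while the real helpers operate on pipe_surface, pipe_buffer and pipe_texture and call winsys->buffer_destroy() or screen->texture_release().

#include <assert.h>

struct object {
   unsigned refcount;
};

/* destroy_fn stands in for winsys->buffer_destroy() / screen->texture_release(). */
static void
object_reference(struct object **ptr, struct object *obj,
                 void (*destroy_fn)(struct object *))
{
   /* bump the new object's refcount first, so self-assignment is safe */
   if (obj) {
      assert(obj->refcount);      /* referencing a dead object is a bug */
      obj->refcount++;
   }

   if (*ptr) {
      assert((*ptr)->refcount);   /* releasing a dead object is a bug */
      if (--(*ptr)->refcount == 0)
         destroy_fn(*ptr);
   }

   *ptr = obj;
}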