path: root/src/gallium
Diffstat (limited to 'src/gallium')
-rw-r--r--  src/gallium/auxiliary/draw/draw_pt.c              | 113
-rw-r--r--  src/gallium/auxiliary/draw/draw_vs_aos.c          |  17
-rw-r--r--  src/gallium/auxiliary/draw/draw_vs_exec.c         |   6
-rw-r--r--  src/gallium/auxiliary/draw/draw_vs_ppc.c          |  53
-rw-r--r--  src/gallium/auxiliary/draw/draw_vs_sse.c          |  16
-rw-r--r--  src/gallium/auxiliary/gallivm/gallivm_cpu.cpp     |   6
-rw-r--r--  src/gallium/auxiliary/gallivm/storagesoa.cpp      |  44
-rw-r--r--  src/gallium/auxiliary/gallivm/storagesoa.h        |  10
-rw-r--r--  src/gallium/auxiliary/gallivm/tgsitollvm.cpp      |  11
-rw-r--r--  src/gallium/auxiliary/pipebuffer/pb_buffer.h      |  15
-rw-r--r--  src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c |   3
-rw-r--r--  src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c   |  12
-rw-r--r--  src/gallium/auxiliary/rtasm/rtasm_execmem.c       |  19
-rw-r--r--  src/gallium/auxiliary/rtasm/rtasm_ppc.c           |  79
-rw-r--r--  src/gallium/auxiliary/rtasm/rtasm_ppc.h           |   9
-rw-r--r--  src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c       |  57
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_exec.c            | 104
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_exec.h            |  25
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_ppc.c             | 767
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_sanity.c          |  14
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_sse2.c            | 153
-rw-r--r--  src/gallium/auxiliary/util/p_debug.c              |  39
-rw-r--r--  src/gallium/auxiliary/util/u_math.h               |   2
-rw-r--r--  src/gallium/auxiliary/util/u_mm.c                 |  16
-rw-r--r--  src/gallium/auxiliary/util/u_mm.h                 |  12
-rw-r--r--  src/gallium/auxiliary/util/u_rect.c               |   3
-rw-r--r--  src/gallium/auxiliary/util/u_tile.c               |  35
-rw-r--r--  src/gallium/auxiliary/util/u_time.c               |   2
-rw-r--r--  src/gallium/drivers/cell/common.h                 |   2
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_batch.c         |  19
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_clear.c         |  13
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_context.h       |   2
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_fence.c         |  14
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_flush.c         |   2
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_gen_fp.c        | 103
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_gen_fragment.c  | 332
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_spu.h           |   1
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_texture.c       | 172
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_texture.h       |   1
-rw-r--r--  src/gallium/drivers/cell/spu/spu_command.c        |   5
-rw-r--r--  src/gallium/drivers/cell/spu/spu_funcs.c          |  28
-rw-r--r--  src/gallium/drivers/cell/spu/spu_main.h           |   3
-rw-r--r--  src/gallium/drivers/cell/spu/spu_render.c         |   4
-rw-r--r--  src/gallium/drivers/cell/spu/spu_tri.c            |   2
-rw-r--r--  src/gallium/drivers/softpipe/sp_fs_sse.c          |   5
-rw-r--r--  src/gallium/drivers/softpipe/sp_quad_output.c     |   8
-rw-r--r--  src/gallium/include/pipe/p_debug.h                |   2
-rw-r--r--  src/gallium/include/pipe/p_inlines.h              |  30
48 files changed, 1806 insertions, 584 deletions
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 87ec6ae20c..3c175f31d8 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -33,6 +33,8 @@
#include "draw/draw_context.h"
#include "draw/draw_private.h"
#include "draw/draw_pt.h"
+#include "draw/draw_vs.h"
+#include "tgsi/tgsi_dump.h"
static unsigned trim( unsigned count, unsigned first, unsigned incr )
{
@@ -176,6 +178,92 @@ void draw_pt_destroy( struct draw_context *draw )
}
+/**
+ * Debug: print the first 'count' vertices.
+ */
+static void
+draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count)
+{
+ uint i;
+
+ debug_printf("Draw arrays(prim = %u, start = %u, count = %u)\n",
+ prim, start, count);
+
+ for (i = 0; i < count; i++) {
+ uint ii, j;
+
+ if (draw->pt.user.elts) {
+ /* indexed arrays */
+ switch (draw->pt.user.eltSize) {
+ case 1:
+ {
+ const ubyte *elem = (const ubyte *) draw->pt.user.elts;
+ ii = elem[start + i];
+ }
+ break;
+ case 2:
+ {
+ const ushort *elem = (const ushort *) draw->pt.user.elts;
+ ii = elem[start + i];
+ }
+ break;
+ case 4:
+ {
+ const uint *elem = (const uint *) draw->pt.user.elts;
+ ii = elem[start + i];
+ }
+ break;
+ default:
+ assert(0);
+ }
+ debug_printf("Element[%u + %u] -> Vertex %u:\n", start, i, ii);
+ }
+ else {
+ /* non-indexed arrays */
+ ii = start + i;
+ debug_printf("Vertex %u:\n", ii);
+ }
+
+ for (j = 0; j < draw->pt.nr_vertex_elements; j++) {
+ uint buf = draw->pt.vertex_element[j].vertex_buffer_index;
+ ubyte *ptr = (ubyte *) draw->pt.user.vbuffer[buf];
+ ptr += draw->pt.vertex_buffer[buf].pitch * ii;
+ ptr += draw->pt.vertex_element[j].src_offset;
+
+ debug_printf(" Attr %u: ", j);
+ switch (draw->pt.vertex_element[j].src_format) {
+ case PIPE_FORMAT_R32_FLOAT:
+ {
+ float *v = (float *) ptr;
+ debug_printf("%f @ %p\n", v[0], (void *) v);
+ }
+ break;
+ case PIPE_FORMAT_R32G32_FLOAT:
+ {
+ float *v = (float *) ptr;
+ debug_printf("%f %f @ %p\n", v[0], v[1], (void *) v);
+ }
+ break;
+ case PIPE_FORMAT_R32G32B32_FLOAT:
+ {
+ float *v = (float *) ptr;
+ debug_printf("%f %f %f @ %p\n", v[0], v[1], v[2], (void *) v);
+ }
+ break;
+ case PIPE_FORMAT_R32G32B32A32_FLOAT:
+ {
+ float *v = (float *) ptr;
+ debug_printf("%f %f %f %f @ %p\n", v[0], v[1], v[2], v[3],
+ (void *) v);
+ }
+ break;
+ default:
+ debug_printf("other format (fix me)\n");
+ ;
+ }
+ }
+ }
+}
/**
@@ -195,6 +283,31 @@ draw_arrays(struct draw_context *draw, unsigned prim,
draw->reduced_prim = reduced_prim;
}
+ if (0)
+ draw_print_arrays(draw, prim, start, MIN2(count, 20));
+
+#if 0
+ {
+ int i;
+ debug_printf("draw_arrays(prim=%u start=%u count=%u):\n",
+ prim, start, count);
+ tgsi_dump(draw->vs.vertex_shader->state.tokens, 0);
+ debug_printf("Elements:\n");
+ for (i = 0; i < draw->pt.nr_vertex_elements; i++) {
+ debug_printf(" format=%s comps=%u\n",
+ pf_name(draw->pt.vertex_element[i].src_format),
+ draw->pt.vertex_element[i].nr_components);
+ }
+ debug_printf("Buffers:\n");
+ for (i = 0; i < draw->pt.nr_vertex_buffers; i++) {
+ debug_printf(" pitch=%u offset=%u ptr=%p\n",
+ draw->pt.vertex_buffer[i].pitch,
+ draw->pt.vertex_buffer[i].buffer_offset,
+ draw->pt.user.vbuffer[i]);
+ }
+ }
+#endif
+
/* drawing done here: */
draw_pt_arrays(draw, prim, start, count);
}
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index 87232865e2..6141ba9cbf 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -1632,6 +1632,17 @@ static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_inst
return TRUE;
}
+static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+ struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+ struct x86_reg tmp0 = aos_get_xmm_reg(cp);
+
+ sse2_cvttps2dq(cp->func, tmp0, arg0);
+ sse2_cvtdq2ps(cp->func, tmp0, tmp0);
+
+ store_dest(cp, &op->FullDstRegisters[0], tmp0);
+ return TRUE;
+}
static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
@@ -1770,6 +1781,9 @@ emit_instruction( struct aos_compilation *cp,
case TGSI_OPCODE_SIN:
return emit_SIN(cp, inst);
+ case TGSI_OPCODE_TRUNC:
+ return emit_TRUNC(cp, inst);
+
case TGSI_OPCODE_END:
return TRUE;
@@ -2176,7 +2190,8 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
if (!vaos->buffer)
goto fail;
- debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
+ if (0)
+ debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
#if 0
tgsi_dump(vs->state.tokens, 0);
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 13d4fcfdbf..80c3606657 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -123,6 +123,12 @@ vs_exec_run_linear( struct draw_vertex_shader *shader,
input = (const float (*)[4])((const char *)input + input_stride);
}
+ tgsi_set_exec_mask(machine,
+ 1,
+ max_vertices > 1,
+ max_vertices > 2,
+ max_vertices > 3);
+
/* run interpreter */
tgsi_exec_machine_run( machine );
diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index 8eff6d4fda..8b75136144 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -54,31 +54,16 @@
typedef void (PIPE_CDECL *codegen_function) (float (*inputs)[4][4],
float (*outputs)[4][4],
float (*temps)[4][4],
- float (*immeds)[4][4],
+ float (*immeds)[4],
float (*consts)[4],
const float *builtins);
-#if 0
- const struct tgsi_exec_vector *input,
- struct tgsi_exec_vector *output,
- float (*constant)[4], /* 3 */
- struct tgsi_exec_vector *temporary, /* 4 */
- float (*immediates)[4], /* 5 */
- const float (*aos_input)[4], /* 6 */
- uint num_inputs, /* 7 */
- uint input_stride, /* 8 */
- float (*aos_output)[4], /* 9 */
- uint num_outputs, /* 10 */
- uint output_stride ); /* 11 */
-#endif
struct draw_ppc_vertex_shader {
struct draw_vertex_shader base;
struct ppc_function ppc_program;
codegen_function func;
-
- struct tgsi_exec_machine *machine;
};
@@ -86,11 +71,12 @@ static void
vs_ppc_prepare( struct draw_vertex_shader *base,
struct draw_context *draw )
{
+ /* nothing */
}
-
-/* Simplified vertex shader interface for the pt paths. Given the
+/**
+ * Simplified vertex shader interface for the pt paths. Given the
* complexity of code-generating all the above operations together,
* it's time to try doing all the other stuff separately.
*/
@@ -104,7 +90,6 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
unsigned output_stride )
{
struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base;
- struct tgsi_exec_machine *machine = shader->machine;
unsigned int i;
#define MAX_VERTICES 4
@@ -137,27 +122,11 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
/* run compiled shader
*/
-#if 0
- shader->func(machine->Inputs,
- machine->Outputs,
- (float (*)[4])constants,
- machine->Temps,
- (float (*)[4])shader->base.immediates,
- input,
- base->info.num_inputs,
- input_stride,
- output,
- base->info.num_outputs,
- output_stride );
-#else
shader->func(inputs_soa, outputs_soa, temps_soa,
- (float (*)[4][4]) shader->base.immediates,
+ (float (*)[4]) shader->base.immediates,
(float (*)[4]) constants,
ppc_builtin_constants);
- /*output[0][0] = input[0][0] * 0.5;*/
-#endif
-
/* convert (up to) four output verts from SoA back to AoS format */
for (attr = 0; attr < base->info.num_outputs; attr++) {
float *vOut = (float *) output;
@@ -183,8 +152,6 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
}
-
-
static void
vs_ppc_delete( struct draw_vertex_shader *base )
{
@@ -201,7 +168,7 @@ vs_ppc_delete( struct draw_vertex_shader *base )
struct draw_vertex_shader *
draw_create_vs_ppc(struct draw_context *draw,
- const struct pipe_shader_state *templ)
+ const struct pipe_shader_state *templ)
{
struct draw_ppc_vertex_shader *vs;
@@ -227,16 +194,14 @@ draw_create_vs_ppc(struct draw_context *draw,
vs->base.run_linear = vs_ppc_run_linear;
vs->base.delete = vs_ppc_delete;
- vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 * 4 *
+ vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 *
sizeof(float), 16);
- vs->machine = &draw->vs.machine;
-
- ppc_init_func( &vs->ppc_program, 2000 ); /* XXX fix limit */
+ ppc_init_func( &vs->ppc_program );
if (!tgsi_emit_ppc( (struct tgsi_token *) vs->base.state.tokens,
&vs->ppc_program,
- (float (*)[4])vs->base.immediates,
+ (float (*)[4]) vs->base.immediates,
TRUE ))
goto fail;
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index b11ae31662..77ba5152f9 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -37,7 +37,7 @@
#include "draw_vs.h"
-#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_X86)
#include "pipe/p_shader_tokens.h"
@@ -99,9 +99,23 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
struct tgsi_exec_machine *machine = shader->machine;
unsigned int i;
+ /* By default, execute all channels. XXX move this inside the loop
+ * below when we support shader conditionals/loops.
+ */
+ tgsi_set_exec_mask(machine, 1, 1, 1, 1);
+
for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
+ if (max_vertices < 4) {
+ /* disable the unused execution channels */
+ tgsi_set_exec_mask(machine,
+ 1,
+ max_vertices > 1,
+ max_vertices > 2,
+ 0);
+ }
+
/* run compiled shader
*/
shader->func(machine->Inputs,
diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
index 3a2f2878a3..93a9748bdb 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
@@ -179,8 +179,7 @@ struct gallivm_cpu_engine * gallivm_global_cpu_engine()
typedef void (*vertex_shader_runner)(void *ainputs,
void *dests,
- float (*aconsts)[4],
- void *temps);
+ float (*aconsts)[4]);
#define MAX_TGSI_VERTICES 4
/*!
@@ -223,8 +222,7 @@ int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
/* run shader */
runner(machine->Inputs,
machine->Outputs,
- (float (*)[4]) constants,
- machine->Temps);
+ (float (*)[4]) constants);
/* Unswizzle all output results
*/
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp
index 4fc075cf6d..e1e5cabcf5 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.cpp
+++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp
@@ -48,13 +48,11 @@ using namespace llvm;
StorageSoa::StorageSoa(llvm::BasicBlock *block,
llvm::Value *input,
llvm::Value *output,
- llvm::Value *consts,
- llvm::Value *temps)
+ llvm::Value *consts)
: m_block(block),
m_input(input),
m_output(output),
m_consts(consts),
- m_temps(temps),
m_immediates(0),
m_idx(0)
{
@@ -169,7 +167,7 @@ std::vector<llvm::Value*> StorageSoa::constElement(llvm::IRBuilder<>* m_builder,
{
llvm::Value* res;
std::vector<llvm::Value*> res2(4);
- llvm::Value *xChannel, *yChannel, *zChannel, *wChannel;
+ llvm::Value *xChannel;
xChannel = elementPointer(m_consts, idx, 0);
@@ -195,14 +193,15 @@ std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx)
return res;
}
-std::vector<llvm::Value*> StorageSoa::tempElement(llvm::Value *idx)
+std::vector<llvm::Value*> StorageSoa::tempElement(llvm::IRBuilder<>* m_builder, int idx)
{
std::vector<llvm::Value*> res(4);
+ llvm::Value *temp = m_temps[idx];
- res[0] = element(m_temps, idx, 0);
- res[1] = element(m_temps, idx, 1);
- res[2] = element(m_temps, idx, 2);
- res[3] = element(m_temps, idx, 3);
+ res[0] = element(temp, constantInt(0), 0);
+ res[1] = element(temp, constantInt(0), 1);
+ res[2] = element(temp, constantInt(0), 2);
+ res[3] = element(temp, constantInt(0), 3);
return res;
}
@@ -326,7 +325,7 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in
val = outputElement(realIndex);
break;
case TGSI_FILE_TEMPORARY:
- val = tempElement(realIndex);
+ val = tempElement(m_builder, idx);
break;
case TGSI_FILE_CONSTANT:
val = constElement(m_builder, realIndex);
@@ -355,19 +354,39 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in
return res;
}
+llvm::Value * StorageSoa::allocaTemp(llvm::IRBuilder<>* m_builder)
+{
+ VectorType *vector = VectorType::get(Type::FloatTy, 4);
+ ArrayType *vecArray = ArrayType::get(vector, 4);
+ AllocaInst *alloca = new AllocaInst(vecArray, "temp",
+ m_builder->GetInsertBlock());
+
+ return alloca;
+}
+
+
void StorageSoa::store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
- int mask)
+ int mask, llvm::IRBuilder<>* m_builder)
{
llvm::Value *out = 0;
+ llvm::Value *realIndex = 0;
switch(type) {
case TGSI_FILE_OUTPUT:
out = m_output;
+ realIndex = constantInt(idx);
break;
case TGSI_FILE_TEMPORARY:
- out = m_temps;
+ // if that temp doesn't already exist, alloca it
+ if (m_temps.find(idx) == m_temps.end())
+ m_temps[idx] = allocaTemp(m_builder);
+
+ out = m_temps[idx];
+
+ realIndex = constantInt(0);
break;
case TGSI_FILE_INPUT:
out = m_input;
+ realIndex = constantInt(idx);
break;
case TGSI_FILE_ADDRESS: {
llvm::Value *addr = m_addresses[idx];
@@ -385,7 +404,6 @@ void StorageSoa::store(enum tgsi_file_type type, int idx, const std::vector<llvm
assert(0);
break;
}
- llvm::Value *realIndex = constantInt(idx);
if ((mask & TGSI_WRITEMASK_X)) {
llvm::Value *xChannel = elementPointer(out, realIndex, 0);
new StoreInst(val[0], xChannel, false, m_block);
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.h b/src/gallium/auxiliary/gallivm/storagesoa.h
index f21ca6ec43..56886f85e7 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.h
+++ b/src/gallium/auxiliary/gallivm/storagesoa.h
@@ -52,14 +52,13 @@ public:
StorageSoa(llvm::BasicBlock *block,
llvm::Value *input,
llvm::Value *output,
- llvm::Value *consts,
- llvm::Value *temps);
+ llvm::Value *consts);
std::vector<llvm::Value*> load(enum tgsi_file_type type, int idx, int swizzle,
llvm::IRBuilder<>* m_builder, llvm::Value *indIdx =0);
void store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
- int mask);
+ int mask, llvm::IRBuilder<>* m_builder);
void addImmediate(float *vec);
void declareImmediates();
@@ -84,7 +83,7 @@ private:
llvm::Value* unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx, int cc);
std::vector<llvm::Value*> constElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx);
std::vector<llvm::Value*> outputElement(llvm::Value *indIdx);
- std::vector<llvm::Value*> tempElement(llvm::Value *indIdx);
+ std::vector<llvm::Value*> tempElement(llvm::IRBuilder<>* m_builder, int idx);
std::vector<llvm::Value*> immediateElement(llvm::Value *indIdx);
private:
llvm::BasicBlock *m_block;
@@ -92,12 +91,13 @@ private:
llvm::Value *m_input;
llvm::Value *m_output;
llvm::Value *m_consts;
- llvm::Value *m_temps;
+ std::map<int, llvm::Value*> m_temps;
llvm::GlobalVariable *m_immediates;
std::map<int, llvm::Value*> m_addresses;
std::vector<std::vector<float> > m_immediatesToFlush;
+ llvm::Value * allocaTemp(llvm::IRBuilder<>* m_builder);
mutable std::map<int, llvm::ConstantInt*> m_constInts;
mutable char m_name[32];
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index 1191a6cae9..c11b88af9e 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -53,7 +53,6 @@ static inline FunctionType *vertexShaderFunctionType()
// [4 x <4 x float>] inputs,
// [4 x <4 x float>] output,
// [4 x [1 x float]] consts,
- // [4 x <4 x float>] temps
std::vector<const Type*> funcArgs;
VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
@@ -67,7 +66,6 @@ static inline FunctionType *vertexShaderFunctionType()
funcArgs.push_back(vectorArrayPtr);//inputs
funcArgs.push_back(vectorArrayPtr);//output
funcArgs.push_back(constsArrayPtr);//consts
- funcArgs.push_back(vectorArrayPtr);//temps
FunctionType *functionType = FunctionType::get(
/*Result=*/Type::VoidTy,
@@ -246,7 +244,6 @@ translate_instruction(llvm::Module *module,
val = storage->constElement(src->SrcRegister.Index, indIdx);
} else if (src->SrcRegister.File == TGSI_FILE_INPUT) {
val = storage->inputElement(src->SrcRegister.Index, indIdx);
- // FIXME we should not be generating elements for temporaries, this creates useless memory writes
} else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) {
val = storage->tempElement(src->SrcRegister.Index);
} else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) {
@@ -677,7 +674,6 @@ translate_instruction(llvm::Module *module,
if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
storage->setOutputElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
- // FIXME we should not be generating elements for temporaries, this creates useless memory writes
} else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) {
storage->setTempElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
} else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) {
@@ -1027,7 +1023,8 @@ translate_instructionir(llvm::Module *module,
for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
storage->store((enum tgsi_file_type)dst->DstRegister.File,
- dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+ dst->DstRegister.Index, out, dst->DstRegister.WriteMask,
+ instr->getIRBuilder() );
}
}
@@ -1122,8 +1119,6 @@ llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
output->setName("outputs");
Value *consts = args++;
consts->setName("consts");
- Value *temps = args++;
- temps->setName("temps");
BasicBlock *label_entry = BasicBlock::Create("entry", shader, 0);
@@ -1132,7 +1127,7 @@ llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
fi = tgsi_default_full_instruction();
fd = tgsi_default_full_declaration();
- StorageSoa storage(label_entry, input, output, consts, temps);
+ StorageSoa storage(label_entry, input, output, consts);
InstructionsSoa instr(mod, shader, label_entry, &storage);
while(!tgsi_parse_end_of_tokens(&parse)) {
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
index 8505d333bd..19db8a6a91 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
@@ -177,12 +177,16 @@ pb_get_base_buffer( struct pb_buffer *buf,
}
+/**
+ * Don't call this directly. Use pb_reference instead.
+ */
static INLINE void
pb_destroy(struct pb_buffer *buf)
{
assert(buf);
if(!buf)
return;
+ assert(buf->base.refcount == 0);
buf->vtbl->destroy(buf);
}
@@ -193,11 +197,16 @@ static INLINE void
pb_reference(struct pb_buffer **dst,
struct pb_buffer *src)
{
- if (src)
+ if (src) {
+ assert(src->base.refcount);
src->base.refcount++;
+ }
- if (*dst && --(*dst)->base.refcount == 0)
- pb_destroy( *dst );
+ if (*dst) {
+ assert((*dst)->base.refcount);
+ if(--(*dst)->base.refcount == 0)
+ pb_destroy( *dst );
+ }
*dst = src;
}
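
For context, a minimal usage sketch (illustrative only, not part of the patch) of the reference-counting discipline these pb_buffer.h changes enforce: pb_destroy() is now asserted to be internal, so buffers are released exclusively through pb_reference(), which is exactly what the pb_bufmgr_fenced.c hunk below switches to. The mgr, size and desc names are placeholders.

   struct pb_buffer *buf = mgr->create_buffer(mgr, size, &desc);  /* refcount == 1 */
   struct pb_buffer *view = NULL;

   pb_reference(&view, buf);    /* share it: refcount == 2 */
   /* ... use buf / view ... */
   pb_reference(&view, NULL);   /* refcount == 1 */
   pb_reference(&buf, NULL);    /* refcount == 0 -> pb_destroy() is called */
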
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
index 633ee70a75..e2594ea236 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
@@ -86,8 +86,7 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
fenced_buf = fenced_buffer_create(fenced_mgr->fenced_list, buf);
if(!fenced_buf) {
- assert(buf->base.refcount == 1);
- pb_destroy(buf);
+ pb_reference(&buf, NULL);
}
return fenced_buf;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index fe80ca30ee..a976d3041a 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -100,7 +100,7 @@ mm_buffer_destroy(struct pb_buffer *buf)
assert(buf->base.refcount == 0);
pipe_mutex_lock(mm->mutex);
- mmFreeMem(mm_buf->block);
+ u_mmFreeMem(mm_buf->block);
FREE(buf);
pipe_mutex_unlock(mm->mutex);
}
@@ -175,14 +175,14 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
mm_buf->mgr = mm;
- mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
+ mm_buf->block = u_mmAllocMem(mm->heap, size, mm->align2, 0);
if(!mm_buf->block) {
debug_printf("warning: heap full\n");
#if 0
mmDumpMemInfo(mm->heap);
#endif
- mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
+ mm_buf->block = u_mmAllocMem(mm->heap, size, mm->align2, 0);
if(!mm_buf->block) {
FREE(mm_buf);
pipe_mutex_unlock(mm->mutex);
@@ -213,7 +213,7 @@ mm_bufmgr_destroy(struct pb_manager *mgr)
pipe_mutex_lock(mm->mutex);
- mmDestroy(mm->heap);
+ u_mmDestroy(mm->heap);
pb_unmap(mm->buffer);
pb_reference(&mm->buffer, NULL);
@@ -254,7 +254,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
if(!mm->map)
goto failure;
- mm->heap = mmInit(0, size);
+ mm->heap = u_mmInit(0, size);
if (!mm->heap)
goto failure;
@@ -262,7 +262,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
failure:
if(mm->heap)
- mmDestroy(mm->heap);
+ u_mmDestroy(mm->heap);
if(mm->map)
pb_unmap(mm->buffer);
if(mm)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index 19087589a8..be7433baf8 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -38,12 +38,13 @@
#include "rtasm_execmem.h"
-#if defined(__linux__)
+#if defined(PIPE_OS_LINUX)
+
/*
* Allocate a large block of memory which can hold code then dole it out
* in pieces by means of the generic memory manager code.
-*/
+ */
#include <unistd.h>
#include <sys/mman.h>
@@ -62,7 +63,7 @@ static void
init_heap(void)
{
if (!exec_heap)
- exec_heap = mmInit( 0, EXEC_HEAP_SIZE );
+ exec_heap = u_mmInit( 0, EXEC_HEAP_SIZE );
if (!exec_mem)
exec_mem = (unsigned char *) mmap(0, EXEC_HEAP_SIZE,
@@ -82,8 +83,8 @@ rtasm_exec_malloc(size_t size)
init_heap();
if (exec_heap) {
- size = (size + 31) & ~31;
- block = mmAllocMem( exec_heap, size, 32, 0 );
+ size = (size + 31) & ~31; /* next multiple of 32 bytes */
+ block = u_mmAllocMem( exec_heap, size, 5, 0 ); /* 5 -> 32-byte alignment */
}
if (block)
@@ -103,17 +104,17 @@ rtasm_exec_free(void *addr)
pipe_mutex_lock(exec_mutex);
if (exec_heap) {
- struct mem_block *block = mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
+ struct mem_block *block = u_mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
if (block)
- mmFreeMem(block);
+ u_mmFreeMem(block);
}
pipe_mutex_unlock(exec_mutex);
}
-#else
+#else /* PIPE_OS_LINUX */
/*
* Just use regular memory.
@@ -133,4 +134,4 @@ rtasm_exec_free(void *addr)
}
-#endif
+#endif /* PIPE_OS_LINUX */
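
A side note on the renaming above (a sketch, assuming u_mmAllocMem keeps the old mm.c semantics): the third argument is a log2 alignment, which is what the new "5 -> 32-byte alignment" comment calls out.

   struct mem_block *block;
   size = (size + 31) & ~31;                      /* round up to a 32-byte multiple */
   block = u_mmAllocMem(exec_heap, size, 5, 0);   /* 5 == log2(32) -> 32-byte aligned */
   if (block)
      addr = exec_mem + block->ofs;               /* offset into the mmap'd exec pool */
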
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 7dd8263749..6d11263be8 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -38,17 +38,18 @@
#include <stdio.h>
#include "util/u_memory.h"
#include "pipe/p_debug.h"
+#include "rtasm_execmem.h"
#include "rtasm_ppc.h"
void
-ppc_init_func(struct ppc_function *p, unsigned max_inst)
+ppc_init_func(struct ppc_function *p)
{
uint i;
- p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
p->num_inst = 0;
- p->max_inst = max_inst;
+ p->max_inst = 100; /* first guess at buffer size */
+ p->store = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE);
p->reg_used = 0x0;
p->fp_used = 0x0;
p->vec_used = 0x0;
@@ -66,12 +67,19 @@ ppc_release_func(struct ppc_function *p)
{
assert(p->num_inst <= p->max_inst);
if (p->store != NULL) {
- align_free(p->store);
+ rtasm_exec_free(p->store);
}
p->store = NULL;
}
+uint
+ppc_num_instructions(const struct ppc_function *p)
+{
+ return p->num_inst;
+}
+
+
void (*ppc_get_func(struct ppc_function *p))(void)
{
#if 0
@@ -202,6 +210,35 @@ ppc_release_vec_register(struct ppc_function *p, int reg)
}
+/**
+ * Append instruction to instruction buffer. Grow buffer if out of room.
+ */
+static void
+emit_instruction(struct ppc_function *p, uint32_t inst_bits)
+{
+ if (!p->store)
+ return; /* out of memory, drop the instruction */
+
+ if (p->num_inst == p->max_inst) {
+ /* allocate larger buffer */
+ uint32_t *newbuf;
+ p->max_inst *= 2; /* 2x larger */
+ newbuf = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE);
+ if (newbuf) {
+ memcpy(newbuf, p->store, p->num_inst * PPC_INST_SIZE);
+ }
+ rtasm_exec_free(p->store);
+ p->store = newbuf;
+ if (!p->store) {
+ /* out of memory */
+ p->num_inst = 0;
+ return;
+ }
+ }
+
+ p->store[p->num_inst++] = inst_bits;
+}
+
union vx_inst {
uint32_t bits;
@@ -223,8 +260,7 @@ emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
inst.inst.vA = vA;
inst.inst.vB = vB;
inst.inst.op2 = op2;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
};
@@ -250,8 +286,7 @@ emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
inst.inst.vB = vB;
inst.inst.rC = 0;
inst.inst.op2 = op2;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
};
@@ -277,8 +312,7 @@ emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
inst.inst.vB = vB;
inst.inst.vC = vC;
inst.inst.op2 = op2;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
};
@@ -300,8 +334,7 @@ emit_i(struct ppc_function *p, uint op, uint li, uint aa, uint lk)
inst.inst.li = li;
inst.inst.aa = aa;
inst.inst.lk = lk;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
}
@@ -330,8 +363,7 @@ emit_xl(struct ppc_function *p, uint op, uint bo, uint bi, uint bh,
inst.inst.bh = bh;
inst.inst.op2 = op2;
inst.inst.lk = lk;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
}
static INLINE void
@@ -373,8 +405,7 @@ emit_x(struct ppc_function *p, uint op, uint vrs, uint ra, uint rb, uint op2)
inst.inst.rb = rb;
inst.inst.op2 = op2;
inst.inst.unused = 0x0;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
}
@@ -398,8 +429,7 @@ emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si)
inst.inst.rt = rt;
inst.inst.ra = ra;
inst.inst.si = (unsigned) (si & 0xffff);
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
};
@@ -428,8 +458,7 @@ emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2,
inst.inst.unused = 0x0;
inst.inst.op2 = op2;
inst.inst.rc = rc;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
};
@@ -458,8 +487,7 @@ emit_xo(struct ppc_function *p, uint op, uint rt, uint ra, uint rb, uint oe,
inst.inst.oe = oe;
inst.inst.op2 = op2;
inst.inst.rc = rc;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
}
@@ -505,6 +533,13 @@ ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
emit_va(p, 46, vD, vA, vC, vB); /* note arg order */
}
+/** vector float negative mult subtract: vD = vA - vB * vC */
+void
+ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+ emit_va(p, 47, vD, vB, vA, vC); /* note arg order */
+}
+
/** vector float compare greater than */
void
ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index f938d8d759..afb4704c39 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -62,8 +62,9 @@ struct ppc_function
-extern void ppc_init_func(struct ppc_function *p, unsigned max_inst);
+extern void ppc_init_func(struct ppc_function *p);
extern void ppc_release_func(struct ppc_function *p);
+extern uint ppc_num_instructions(const struct ppc_function *p);
extern void (*ppc_get_func( struct ppc_function *p ))( void );
extern void ppc_dump_func(const struct ppc_function *p);
@@ -97,10 +98,14 @@ ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB);
extern void
ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB);
-/** vector float mult add */
+/** vector float mult add: vD = vA * vB + vC */
extern void
ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+/** vector float negative mult subtract: vD = vA - vB * vC */
+extern void
+ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
/** vector float compare greater than */
extern void
ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
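
A minimal sketch (not part of the patch) of the resulting rtasm_ppc lifecycle: ppc_init_func() no longer takes an instruction limit, the store grows on demand inside emit_instruction(), and the new ppc_num_instructions() reports how much code was emitted.

   struct ppc_function f;

   ppc_init_func(&f);                /* small exec-mem buffer, grown as needed */
   /* ... emit code, e.g. tgsi_emit_ppc(tokens, &f, immediates, TRUE) ... */
   debug_printf("emitted %u PPC instructions\n", ppc_num_instructions(&f));
   {
      void (*code)(void) = ppc_get_func(&f);   /* callers cast to their real signature */
      (void) code;
   }
   ppc_release_func(&f);             /* rtasm_exec_free()s the store */
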
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index dea1aed032..f8568f690b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -185,6 +185,34 @@ reg_name(int reg)
}
+static void
+emit_instruction(struct spe_function *p, uint32_t inst_bits)
+{
+ if (!p->store)
+ return; /* out of memory, drop the instruction */
+
+ if (p->num_inst == p->max_inst) {
+ /* allocate larger buffer */
+ uint32_t *newbuf;
+ p->max_inst *= 2; /* 2x larger */
+ newbuf = align_malloc(p->max_inst * SPE_INST_SIZE, 16);
+ if (newbuf) {
+ memcpy(newbuf, p->store, p->num_inst * SPE_INST_SIZE);
+ }
+ align_free(p->store);
+ p->store = newbuf;
+ if (!p->store) {
+ /* out of memory */
+ p->num_inst = 0;
+ return;
+ }
+ }
+
+ p->store[p->num_inst++] = inst_bits;
+}
+
+
+
static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
unsigned rA, unsigned rB, const char *name)
{
@@ -193,8 +221,7 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.rB = rB;
inst.inst.rA = rA;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
if (p->print) {
indent(p);
printf("%s\t%s, %s, %s\n",
@@ -212,8 +239,7 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.rB = rB;
inst.inst.rA = rA;
inst.inst.rC = rC;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
if (p->print) {
indent(p);
printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT),
@@ -230,8 +256,7 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.i7 = imm;
inst.inst.rA = rA;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
if (p->print) {
indent(p);
printf("%s\t%s, %s, 0x%x\n",
@@ -249,8 +274,7 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.i8 = imm;
inst.inst.rA = rA;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
if (p->print) {
indent(p);
printf("%s\t%s, %s, 0x%x\n",
@@ -268,8 +292,7 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.i10 = imm;
inst.inst.rA = rA;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
if (p->print) {
indent(p);
printf("%s\t%s, %s, 0x%x\n",
@@ -295,8 +318,7 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.op = op;
inst.inst.i16 = imm;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
if (p->print) {
indent(p);
printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
@@ -311,8 +333,7 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.op = op;
inst.inst.i18 = imm;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
if (p->print) {
indent(p);
printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
@@ -394,15 +415,19 @@ void _name (struct spe_function *p, int imm) \
/**
* Initialize an spe_function.
- * \param code_size size of instruction buffer to allocate, in bytes.
+ * \param code_size initial size of instruction buffer to allocate, in bytes.
+ * If zero, use a default.
*/
void spe_init_func(struct spe_function *p, unsigned code_size)
{
unsigned int i;
- p->store = align_malloc(code_size, 16);
+ if (!code_size)
+ code_size = 64;
+
p->num_inst = 0;
p->max_inst = code_size / SPE_INST_SIZE;
+ p->store = align_malloc(code_size, 16);
p->set_count = 0;
memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 1a5294eabc..1da04ab7e0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -958,6 +958,10 @@ fetch_src_file_channel(
switch( file ) {
case TGSI_FILE_CONSTANT:
assert(mach->Consts);
+ assert(index->i[0] >= 0);
+ assert(index->i[1] >= 0);
+ assert(index->i[2] >= 0);
+ assert(index->i[3] >= 0);
chan->f[0] = mach->Consts[index->i[0]][swizzle];
chan->f[1] = mach->Consts[index->i[1]][swizzle];
chan->f[2] = mach->Consts[index->i[2]][swizzle];
@@ -1041,12 +1045,16 @@ fetch_source(
if (reg->SrcRegister.Indirect) {
union tgsi_exec_channel index2;
union tgsi_exec_channel indir_index;
+ const uint execmask = mach->ExecMask;
+ uint i;
+ /* which address register (always zero now) */
index2.i[0] =
index2.i[1] =
index2.i[2] =
index2.i[3] = reg->SrcRegisterInd.Index;
+ /* get current value of address register[swizzle] */
swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
fetch_src_file_channel(
mach,
@@ -1055,10 +1063,19 @@ fetch_source(
&index2,
&indir_index );
+ /* add value of address register to the offset */
index.i[0] += indir_index.i[0];
index.i[1] += indir_index.i[1];
index.i[2] += indir_index.i[2];
index.i[3] += indir_index.i[3];
+
+ /* for disabled execution channels, zero-out the index to
+ * avoid using a potential garbage value.
+ */
+ for (i = 0; i < QUAD_SIZE; i++) {
+ if ((execmask & (1 << i)) == 0)
+ index.i[i] = 0;
+ }
}
if( reg->SrcRegister.Dimension ) {
@@ -1087,6 +1104,8 @@ fetch_source(
if (reg->SrcRegisterDim.Indirect) {
union tgsi_exec_channel index2;
union tgsi_exec_channel indir_index;
+ const uint execmask = mach->ExecMask;
+ uint i;
index2.i[0] =
index2.i[1] =
@@ -1105,6 +1124,14 @@ fetch_source(
index.i[1] += indir_index.i[1];
index.i[2] += indir_index.i[2];
index.i[3] += indir_index.i[3];
+
+ /* for disabled execution channels, zero-out the index to
+ * avoid using a potential garbage value.
+ */
+ for (i = 0; i < QUAD_SIZE; i++) {
+ if ((execmask & (1 << i)) == 0)
+ index.i[i] = 0;
+ }
}
}
@@ -2007,7 +2034,21 @@ exec_instruction(
case TGSI_OPCODE_DOT2ADD:
/* TGSI_OPCODE_DP2A */
- assert (0);
+ FETCH( &r[0], 0, CHAN_X );
+ FETCH( &r[1], 1, CHAN_X );
+ micro_mul( &r[0], &r[0], &r[1] );
+
+ FETCH( &r[1], 0, CHAN_Y );
+ FETCH( &r[2], 1, CHAN_Y );
+ micro_mul( &r[1], &r[1], &r[2] );
+ micro_add( &r[0], &r[0], &r[1] );
+
+ FETCH( &r[2], 2, CHAN_X );
+ micro_add( &r[0], &r[0], &r[2] );
+
+ FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+ STORE( &r[0], 0, chan_index );
+ }
break;
case TGSI_OPCODE_INDEX:
@@ -2436,7 +2477,66 @@ exec_instruction(
break;
case TGSI_OPCODE_NRM:
- assert (0);
+ /* 3-component vector normalize */
+ {
+ union tgsi_exec_channel tmp, dot;
+
+ /* tmp = dp3(src0, src0): */
+ FETCH( &r[0], 0, CHAN_X );
+ micro_mul( &tmp, &r[0], &r[0] );
+
+ FETCH( &r[1], 0, CHAN_Y );
+ micro_mul( &dot, &r[1], &r[1] );
+ micro_add( &tmp, &tmp, &dot );
+
+ FETCH( &r[2], 0, CHAN_Z );
+ micro_mul( &dot, &r[2], &r[2] );
+ micro_add( &tmp, &tmp, &dot );
+
+ /* tmp = 1 / sqrt(tmp) */
+ micro_sqrt( &tmp, &tmp );
+ micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
+
+ /* note: w channel is undefined */
+ FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+ /* chan = chan * tmp */
+ micro_mul( &r[chan_index], &tmp, &r[chan_index] );
+ STORE( &r[chan_index], 0, chan_index );
+ }
+ }
+ break;
+
+ case TGSI_OPCODE_NRM4:
+ /* 4-component vector normalize */
+ {
+ union tgsi_exec_channel tmp, dot;
+
+ /* tmp = dp4(src0, src0): */
+ FETCH( &r[0], 0, CHAN_X );
+ micro_mul( &tmp, &r[0], &r[0] );
+
+ FETCH( &r[1], 0, CHAN_Y );
+ micro_mul( &dot, &r[1], &r[1] );
+ micro_add( &tmp, &tmp, &dot );
+
+ FETCH( &r[2], 0, CHAN_Z );
+ micro_mul( &dot, &r[2], &r[2] );
+ micro_add( &tmp, &tmp, &dot );
+
+ FETCH( &r[3], 0, CHAN_W );
+ micro_mul( &dot, &r[3], &r[3] );
+ micro_add( &tmp, &tmp, &dot );
+
+ /* tmp = 1 / sqrt(tmp) */
+ micro_sqrt( &tmp, &tmp );
+ micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
+
+ FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+ /* chan = chan * tmp */
+ micro_mul( &r[chan_index], &tmp, &r[chan_index] );
+ STORE( &r[chan_index], 0, chan_index );
+ }
+ }
break;
case TGSI_OPCODE_DIV:
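
For readability, the per-channel scalar equivalent of the new NRM case above (a reference description, not patch content):

   /* NRM: normalize the xyz components; w is left undefined */
   float inv = 1.0f / sqrtf(x*x + y*y + z*z);
   dst_x = x * inv;
   dst_y = y * inv;
   dst_z = z * inv;
   /* NRM4 includes w in the dot product and scales all four channels */
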
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index c4e649e69c..fc40a25e09 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -165,6 +165,10 @@ struct tgsi_exec_labels
#define TGSI_EXEC_TEMP_HALF_I (TGSI_EXEC_NUM_TEMPS + 3)
#define TGSI_EXEC_TEMP_HALF_C 1
+/* execution mask, each value is either 0 or ~0 */
+#define TGSI_EXEC_MASK_I (TGSI_EXEC_NUM_TEMPS + 3)
+#define TGSI_EXEC_MASK_C 2
+
#define TGSI_EXEC_TEMP_R0 (TGSI_EXEC_NUM_TEMPS + 4)
#define TGSI_EXEC_TEMP_ADDR (TGSI_EXEC_NUM_TEMPS + 5)
@@ -265,6 +269,27 @@ void
tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach);
+static INLINE void
+tgsi_set_kill_mask(struct tgsi_exec_machine *mach, unsigned mask)
+{
+ mach->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] =
+ mask;
+}
+
+
+/** Set execution mask values prior to executing the shader */
+static INLINE void
+tgsi_set_exec_mask(struct tgsi_exec_machine *mach,
+ boolean ch0, boolean ch1, boolean ch2, boolean ch3)
+{
+ int *mask = mach->Temps[TGSI_EXEC_MASK_I].xyzw[TGSI_EXEC_MASK_C].i;
+ mask[0] = ch0 ? ~0 : 0;
+ mask[1] = ch1 ? ~0 : 0;
+ mask[2] = ch2 ? ~0 : 0;
+ mask[3] = ch3 ? ~0 : 0;
+}
+
+
#if defined __cplusplus
} /* extern "C" */
#endif
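
An illustrative caller-side sketch of the new helper (mirroring the draw_vs_exec.c and draw_vs_sse.c hunks earlier in this patch): when the last SoA batch holds fewer than four vertices, the unused execution channels are masked off before running the machine.

   unsigned max_vertices = MIN2(4, count - i);   /* vertices left in this batch */

   tgsi_set_exec_mask(machine,
                      1,
                      max_vertices > 1,
                      max_vertices > 2,
                      max_vertices > 3);
   tgsi_exec_machine_run(machine);
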
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 9ad7ecd7cf..a92b1902e3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -40,6 +40,7 @@
#include "util/u_sse.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
+#include "tgsi_dump.h"
#include "tgsi_exec.h"
#include "tgsi_ppc.h"
#include "rtasm/rtasm_ppc.h"
@@ -72,11 +73,20 @@ const float ppc_builtin_constants[] ALIGN16_ATTRIB = {
#define CHAN_Z 2
#define CHAN_W 3
-#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
-#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
-#define TEMP_R0 TGSI_EXEC_TEMP_R0
-#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
+/**
+ * How many TGSI temps should be implemented with real PPC vector registers
+ * rather than memory.
+ */
+#define MAX_PPC_TEMPS 4
+
+
+struct reg_chan_vec
+{
+ struct tgsi_full_src_register src;
+ uint chan;
+ uint vec;
+};
/**
@@ -92,12 +102,105 @@ struct gen_context
int const_reg; /**< GP register pointing to constants buffer */
int builtins_reg; /**< GP register pointing to built-in constants */
+ int offset_reg; /**< used to reduce redundant li instructions */
+ int offset_value;
+
int one_vec; /**< vector register with {1.0, 1.0, 1.0, 1.0} */
int bit31_vec; /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */
+
+ /**
+ * Map TGSI temps to PPC vector temps.
+ * We have 32 PPC vector regs. Use 16 of them for storing 4 TGSI temps.
+ * XXX currently only do this for TGSI temps [0..MAX_PPC_TEMPS-1].
+ */
+ int temps_map[MAX_PPC_TEMPS][4];
+
+ /**
+ * Cache of src registers.
+ * This is used to avoid redundant load instructions.
+ */
+ struct {
+ struct tgsi_full_src_register src;
+ uint chan;
+ uint vec;
+ } regs[12]; /* 3 src regs, 4 channels */
+ uint num_regs;
};
/**
+ * Initialize code generation context.
+ */
+static void
+init_gen_context(struct gen_context *gen, struct ppc_function *func)
+{
+ uint i;
+
+ memset(gen, 0, sizeof(*gen));
+ gen->f = func;
+ gen->inputs_reg = ppc_reserve_register(func, 3); /* first function param */
+ gen->outputs_reg = ppc_reserve_register(func, 4); /* second function param */
+ gen->temps_reg = ppc_reserve_register(func, 5); /* ... */
+ gen->immed_reg = ppc_reserve_register(func, 6);
+ gen->const_reg = ppc_reserve_register(func, 7);
+ gen->builtins_reg = ppc_reserve_register(func, 8);
+ gen->one_vec = -1;
+ gen->bit31_vec = -1;
+ gen->offset_reg = -1;
+ gen->offset_value = -9999999;
+ for (i = 0; i < MAX_PPC_TEMPS; i++) {
+ gen->temps_map[i][0] = ppc_allocate_vec_register(gen->f);
+ gen->temps_map[i][1] = ppc_allocate_vec_register(gen->f);
+ gen->temps_map[i][2] = ppc_allocate_vec_register(gen->f);
+ gen->temps_map[i][3] = ppc_allocate_vec_register(gen->f);
+ }
+}
+
+
+/**
+ * All PPC vector load/store instructions form an effective address
+ * by adding the contents of two registers. For example:
+ * lvx v2,r8,r9 # v2 = memory[r8 + r9]
+ * stvx v2,r8,r9 # memory[r8 + r9] = v2;
+ * So our lvx/stvx instructions are typically preceded by an 'li' instruction
+ * to load r9 (above) with an immediate (an offset).
+ * This code emits that 'li' instruction, but only if the offset value is
+ * different than the previous 'li'.
+ * This optimization seems to save about 10% in the instruction count.
+ * Note that we need to unconditionally emit an 'li' inside basic blocks
+ * (such as inside loops).
+ */
+static int
+emit_li_offset(struct gen_context *gen, int offset)
+{
+ if (gen->offset_reg <= 0) {
+ /* allocate a GP register for storing load/store offset */
+ gen->offset_reg = ppc_allocate_register(gen->f);
+ }
+
+ /* emit new 'li' if offset is changing */
+ if (gen->offset_value < 0 || gen->offset_value != offset) {
+ gen->offset_value = offset;
+ ppc_li(gen->f, gen->offset_reg, offset);
+ }
+
+ return gen->offset_reg;
+}
+
+
+/**
+ * Forces subsequent emit_li_offset() calls to emit an 'li'.
+ * To be called at the top of basic blocks.
+ */
+static void
+reset_li_offset(struct gen_context *gen)
+{
+ gen->offset_value = -9999999;
+}
+
+
+
+/**
* Load the given vector register with {value, value, value, value}.
* The value must be in the ppu_builtin_constants[] array.
* We wouldn't need this if there was a simple way to load PPC vector
@@ -109,10 +212,9 @@ load_constant_vec(struct gen_context *gen, int dst_vec, float value)
uint pos;
for (pos = 0; pos < Elements(ppc_builtin_constants); pos++) {
if (ppc_builtin_constants[pos] == value) {
- int offset_reg = ppc_allocate_register(gen->f);
int offset = pos * 4;
+ int offset_reg = emit_li_offset(gen, offset);
- ppc_li(gen->f, offset_reg, offset);
/* Load 4-byte word into vector register.
* The vector slot depends on the effective address we load from.
* We know that our builtins start at a 16-byte boundary so we
@@ -122,7 +224,6 @@ load_constant_vec(struct gen_context *gen, int dst_vec, float value)
ppc_lvewx(gen->f, dst_vec, gen->builtins_reg, offset_reg);
/* splat word[pos % 4] across the vector reg */
ppc_vspltw(gen->f, dst_vec, dst_vec, pos % 4);
- ppc_release_register(gen->f, offset_reg);
return;
}
}
@@ -159,15 +260,15 @@ gen_get_bit31_vec(struct gen_context *gen)
/**
- * Register fetch, put result in 'dst_vec'.
+ * Register fetch. Return PPC vector register with result.
*/
-static void
+static int
emit_fetch(struct gen_context *gen,
- unsigned dst_vec,
const struct tgsi_full_src_register *reg,
const unsigned chan_index)
{
uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
+ int dst_vec = -1;
switch (swizzle) {
case TGSI_EXTSWIZZLE_X:
@@ -177,36 +278,46 @@ emit_fetch(struct gen_context *gen,
switch (reg->SrcRegister.File) {
case TGSI_FILE_INPUT:
{
- int offset_reg = ppc_allocate_register(gen->f);
int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
- ppc_li(gen->f, offset_reg, offset);
+ int offset_reg = emit_li_offset(gen, offset);
+ dst_vec = ppc_allocate_vec_register(gen->f);
ppc_lvx(gen->f, dst_vec, gen->inputs_reg, offset_reg);
- ppc_release_register(gen->f, offset_reg);
}
break;
case TGSI_FILE_TEMPORARY:
- {
- int offset_reg = ppc_allocate_register(gen->f);
+ if (reg->SrcRegister.Index < MAX_PPC_TEMPS) {
+ /* use PPC vec register */
+ dst_vec = gen->temps_map[reg->SrcRegister.Index][swizzle];
+ }
+ else {
+ /* use memory-based temp register "file" */
int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
- ppc_li(gen->f, offset_reg, offset);
+ int offset_reg = emit_li_offset(gen, offset);
+ dst_vec = ppc_allocate_vec_register(gen->f);
ppc_lvx(gen->f, dst_vec, gen->temps_reg, offset_reg);
- ppc_release_register(gen->f, offset_reg);
}
break;
case TGSI_FILE_IMMEDIATE:
{
- int offset_reg = ppc_allocate_register(gen->f);
- int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
- ppc_li(gen->f, offset_reg, offset);
- ppc_lvx(gen->f, dst_vec, gen->immed_reg, offset_reg);
- ppc_release_register(gen->f, offset_reg);
+ int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
+ int offset_reg = emit_li_offset(gen, offset);
+ dst_vec = ppc_allocate_vec_register(gen->f);
+ /* Load 4-byte word into vector register.
+ * The vector slot depends on the effective address we load from.
+ * We know that our immediates start at a 16-byte boundary so we
+ * know that 'swizzle' tells us which vector slot will have the
+ * loaded word. The other vector slots will be undefined.
+ */
+ ppc_lvewx(gen->f, dst_vec, gen->immed_reg, offset_reg);
+ /* splat word[swizzle] across the vector reg */
+ ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
}
break;
case TGSI_FILE_CONSTANT:
{
- int offset_reg = ppc_allocate_register(gen->f);
int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
- ppc_li(gen->f, offset_reg, offset);
+ int offset_reg = emit_li_offset(gen, offset);
+ dst_vec = ppc_allocate_vec_register(gen->f);
/* Load 4-byte word into vector register.
* The vector slot depends on the effective address we load from.
* We know that our constants start at a 16-byte boundary so we
@@ -216,7 +327,6 @@ emit_fetch(struct gen_context *gen,
ppc_lvewx(gen->f, dst_vec, gen->const_reg, offset_reg);
/* splat word[swizzle] across the vector reg */
ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
- ppc_release_register(gen->f, offset_reg);
}
break;
default:
@@ -229,6 +339,7 @@ emit_fetch(struct gen_context *gen,
case TGSI_EXTSWIZZLE_ONE:
{
int one_vec = gen_one_vec(gen);
+ dst_vec = ppc_allocate_vec_register(gen->f);
ppc_vmove(gen->f, dst_vec, one_vec);
}
break;
@@ -236,6 +347,8 @@ emit_fetch(struct gen_context *gen,
assert( 0 );
}
+ assert(dst_vec >= 0);
+
{
uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
if (sign_op != TGSI_UTIL_SIGN_KEEP) {
@@ -259,40 +372,148 @@ emit_fetch(struct gen_context *gen,
}
}
}
+
+ return dst_vec;
}
-#define FETCH( GEN, INST, DST_VEC, SRC_REG, CHAN ) \
- emit_fetch( GEN, DST_VEC, &(INST).FullSrcRegisters[SRC_REG], CHAN )
+/**
+ * Test if two TGSI src registers refer to the same memory location.
+ * We use this to avoid redundant register loads.
+ */
+static boolean
+equal_src_locs(const struct tgsi_full_src_register *a, uint chan_a,
+ const struct tgsi_full_src_register *b, uint chan_b)
+{
+ int swz_a, swz_b;
+ int sign_a, sign_b;
+ if (a->SrcRegister.File != b->SrcRegister.File)
+ return FALSE;
+ if (a->SrcRegister.Index != b->SrcRegister.Index)
+ return FALSE;
+ swz_a = tgsi_util_get_full_src_register_extswizzle(a, chan_a);
+ swz_b = tgsi_util_get_full_src_register_extswizzle(b, chan_b);
+ if (swz_a != swz_b)
+ return FALSE;
+ sign_a = tgsi_util_get_full_src_register_sign_mode(a, chan_a);
+ sign_b = tgsi_util_get_full_src_register_sign_mode(b, chan_b);
+ if (sign_a != sign_b)
+ return FALSE;
+ return TRUE;
+}
+
+
+/**
+ * Given a TGSI src register and channel index, return the PPC vector
+ * register containing the value. We use a cache to prevent re-loading
+ * the same register multiple times.
+ * \return index of PPC vector register with the desired src operand
+ */
+static int
+get_src_vec(struct gen_context *gen,
+ struct tgsi_full_instruction *inst, int src_reg, uint chan)
+{
+ const struct tgsi_full_src_register *src =
+ &inst->FullSrcRegisters[src_reg];
+ int vec;
+ uint i;
+
+ /* check the cache */
+ for (i = 0; i < gen->num_regs; i++) {
+ if (equal_src_locs(&gen->regs[i].src, gen->regs[i].chan, src, chan)) {
+ /* cache hit */
+ assert(gen->regs[i].vec >= 0);
+ return gen->regs[i].vec;
+ }
+ }
+
+ /* cache miss: allocate new vec reg and emit fetch/load code */
+ vec = emit_fetch(gen, src, chan);
+ gen->regs[gen->num_regs].src = *src;
+ gen->regs[gen->num_regs].chan = chan;
+ gen->regs[gen->num_regs].vec = vec;
+ gen->num_regs++;
+
+ assert(gen->num_regs <= Elements(gen->regs));
+
+ assert(vec >= 0);
+
+ return vec;
+}
+
+
+/**
+ * Clear the src operand cache. To be called at the end of each emit function.
+ */
+static void
+release_src_vecs(struct gen_context *gen)
+{
+ uint i;
+ for (i = 0; i < gen->num_regs; i++) {
+ const struct tgsi_full_src_register src = gen->regs[i].src;
+ if (!(src.SrcRegister.File == TGSI_FILE_TEMPORARY &&
+ src.SrcRegister.Index < MAX_PPC_TEMPS)) {
+ ppc_release_vec_register(gen->f, gen->regs[i].vec);
+ }
+ }
+ gen->num_regs = 0;
+}
+
+
+
+static int
+get_dst_vec(struct gen_context *gen,
+ const struct tgsi_full_instruction *inst,
+ unsigned chan_index)
+{
+ const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[0];
+
+ if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
+ reg->DstRegister.Index < MAX_PPC_TEMPS) {
+ int vec = gen->temps_map[reg->DstRegister.Index][chan_index];
+ return vec;
+ }
+ else {
+ return ppc_allocate_vec_register(gen->f);
+ }
+}
+
/**
* Register store. Store 'src_vec' at location indicated by 'reg'.
+ * \param free_vec Should the src_vec be released when done?
*/
static void
emit_store(struct gen_context *gen,
- unsigned src_vec,
- const struct tgsi_full_dst_register *reg,
+ int src_vec,
const struct tgsi_full_instruction *inst,
- unsigned chan_index)
+ unsigned chan_index,
+ boolean free_vec)
{
+ const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[0];
+
switch (reg->DstRegister.File) {
case TGSI_FILE_OUTPUT:
{
- int offset_reg = ppc_allocate_register(gen->f);
int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
- ppc_li(gen->f, offset_reg, offset);
+ int offset_reg = emit_li_offset(gen, offset);
ppc_stvx(gen->f, src_vec, gen->outputs_reg, offset_reg);
- ppc_release_register(gen->f, offset_reg);
}
break;
case TGSI_FILE_TEMPORARY:
- {
- int offset_reg = ppc_allocate_register(gen->f);
+ if (reg->DstRegister.Index < MAX_PPC_TEMPS) {
+ if (!free_vec) {
+ int dst_vec = gen->temps_map[reg->DstRegister.Index][chan_index];
+ if (dst_vec != src_vec)
+ ppc_vmove(gen->f, dst_vec, src_vec);
+ }
+ free_vec = FALSE;
+ }
+ else {
int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
- ppc_li(gen->f, offset_reg, offset);
+ int offset_reg = emit_li_offset(gen, offset);
ppc_stvx(gen->f, src_vec, gen->temps_reg, offset_reg);
- ppc_release_register(gen->f, offset_reg);
}
break;
#if 0
@@ -322,22 +543,20 @@ emit_store(struct gen_context *gen,
break;
}
#endif
-}
-
-
-#define STORE( GEN, INST, XMM, INDEX, CHAN )\
- emit_store( GEN, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
+ if (free_vec)
+ ppc_release_vec_register(gen->f, src_vec);
+}
static void
emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
{
- int v0 = ppc_allocate_vec_register(gen->f);
- int v1 = ppc_allocate_vec_register(gen->f);
+ int v0, v1;
uint chan_index;
- FETCH(gen, *inst, v0, 0, CHAN_X);
+ v0 = get_src_vec(gen, inst, 0, CHAN_X);
+ v1 = ppc_allocate_vec_register(gen->f);
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_RSQ:
@@ -353,9 +572,10 @@ emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
}
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
- STORE(gen, *inst, v1, 0, chan_index);
+ emit_store(gen, v1, inst, chan_index, FALSE);
}
- ppc_release_vec_register(gen->f, v0);
+
+ release_src_vecs(gen);
ppc_release_vec_register(gen->f, v1);
}
@@ -363,61 +583,65 @@ emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
static void
emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
{
- int v0 = ppc_allocate_vec_register(gen->f);
uint chan_index;
FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
- FETCH(gen, *inst, 0, 0, chan_index); /* v0 = srcreg[0] */
+ int v0 = get_src_vec(gen, inst, 0, chan_index); /* v0 = srcreg[0] */
+ int v1 = get_dst_vec(gen, inst, chan_index);
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_ABS:
/* turn off the most significant bit of each vector float word */
{
- int v1 = ppc_allocate_vec_register(gen->f);
- ppc_vspltisw(gen->f, v1, -1); /* v1 = {-1, -1, -1, -1} */
- ppc_vslw(gen->f, v1, v1, v1); /* v1 = {1<<31, 1<<31, 1<<31, 1<<31} */
- ppc_vandc(gen->f, v0, v0, v1); /* v0 = v0 & ~v1 */
- ppc_release_vec_register(gen->f, v1);
+ int bit31_vec = gen_get_bit31_vec(gen);
+ ppc_vandc(gen->f, v1, v0, bit31_vec); /* v1 = v0 & ~bit31 */
}
break;
case TGSI_OPCODE_FLOOR:
- ppc_vrfim(gen->f, v0, v0); /* v0 = floor(v0) */
+ ppc_vrfim(gen->f, v1, v0); /* v1 = floor(v0) */
break;
case TGSI_OPCODE_FRAC:
- {
- int v1 = ppc_allocate_vec_register(gen->f);
- ppc_vrfim(gen->f, v1, v0); /* v1 = floor(v0) */
- ppc_vsubfp(gen->f, v0, v0, v1); /* v0 = v0 - v1 */
- ppc_release_vec_register(gen->f, v1);
- }
+ ppc_vrfim(gen->f, v1, v0); /* tmp = floor(v0) */
+ ppc_vsubfp(gen->f, v1, v0, v1); /* v1 = v0 - v1 */
break;
case TGSI_OPCODE_EXPBASE2:
- ppc_vexptefp(gen->f, v0, v0); /* v0 = 2^v0 */
+ ppc_vexptefp(gen->f, v1, v0); /* v1 = 2^v0 */
break;
case TGSI_OPCODE_LOGBASE2:
/* XXX this may be broken! */
- ppc_vlogefp(gen->f, v0, v0); /* v0 = log2(v0) */
+ ppc_vlogefp(gen->f, v1, v0); /* v1 = log2(v0) */
break;
case TGSI_OPCODE_MOV:
- /* nothing */
+ case TGSI_OPCODE_SWZ:
+ if (v0 != v1)
+ ppc_vmove(gen->f, v1, v0);
break;
default:
assert(0);
}
- STORE(gen, *inst, v0, 0, chan_index); /* store v0 */
+ emit_store(gen, v1, inst, chan_index, TRUE); /* store v1 */
}
- ppc_release_vec_register(gen->f, v0);
+
+ release_src_vecs(gen);
}
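For reference, the ABS case above clears the IEEE-754 sign bit with vandc instead of comparing against zero; a minimal scalar sketch of the same trick (plain C illustration, not generated code):

#include <stdint.h>
#include <string.h>

/* Clear the most significant (sign) bit of a float, mirroring the
 * vandc-with-bit31 sequence emitted for TGSI_OPCODE_ABS. */
static float abs_via_sign_bit(float x)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof(bits));  /* type-pun safely */
   bits &= ~(1u << 31);              /* x & ~bit31 */
   memcpy(&x, &bits, sizeof(x));
   return x;
}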
static void
emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
{
- int v0 = ppc_allocate_vec_register(gen->f);
- int v1 = ppc_allocate_vec_register(gen->f);
- int v2 = ppc_allocate_vec_register(gen->f);
- uint chan_index;
- FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
- FETCH(gen, *inst, v0, 0, chan_index); /* v0 = srcreg[0] */
- FETCH(gen, *inst, v1, 1, chan_index); /* v1 = srcreg[1] */
+ int zero_vec = -1;
+ uint chan;
+
+ if (inst->Instruction.Opcode == TGSI_OPCODE_MUL) {
+ zero_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vzero(gen->f, zero_vec);
+ }
+
+ FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+ /* fetch src operands */
+ int v0 = get_src_vec(gen, inst, 0, chan);
+ int v1 = get_src_vec(gen, inst, 1, chan);
+ int v2 = get_dst_vec(gen, inst, chan);
+
+ /* emit binop */
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_ADD:
ppc_vaddfp(gen->f, v2, v0, v1);
@@ -426,8 +650,7 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
ppc_vsubfp(gen->f, v2, v0, v1);
break;
case TGSI_OPCODE_MUL:
- ppc_vxor(gen->f, v2, v2, v2); /* v2 = {0, 0, 0, 0} */
- ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v0 */
+ ppc_vmaddfp(gen->f, v2, v0, v1, zero_vec);
break;
case TGSI_OPCODE_MIN:
ppc_vminfp(gen->f, v2, v0, v1);
@@ -438,11 +661,48 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
default:
assert(0);
}
- STORE(gen, *inst, v2, 0, chan_index); /* store v2 */
+
+ /* store v2 */
+ emit_store(gen, v2, inst, chan, TRUE);
}
- ppc_release_vec_register(gen->f, v0);
- ppc_release_vec_register(gen->f, v1);
- ppc_release_vec_register(gen->f, v2);
+
+ if (inst->Instruction.Opcode == TGSI_OPCODE_MUL)
+ ppc_release_vec_register(gen->f, zero_vec);
+
+ release_src_vecs(gen);
+}
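AltiVec has no plain vector float multiply, so MUL is lowered to vmaddfp with a zero accumulator that is now allocated once per instruction rather than rebuilt for every channel; a scalar model of the per-channel loop (illustrative only):

/* Scalar model of the binop loop: for each write-enabled channel,
 * fetch src0/src1, apply the op, store the result.  MUL is a fused
 * multiply-add with a zero addend, matching vmaddfp(v0, v1, zero). */
static void binop_mul_channels(const float *src0, const float *src1,
                               float *dst, unsigned writemask)
{
   unsigned chan;
   for (chan = 0; chan < 4; chan++) {
      if (writemask & (1u << chan))
         dst[chan] = src0[chan] * src1[chan] + 0.0f;
   }
}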
+
+
+static void
+emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ uint chan;
+
+ FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+ /* fetch src operands */
+ int v0 = get_src_vec(gen, inst, 0, chan);
+ int v1 = get_src_vec(gen, inst, 1, chan);
+ int v2 = get_src_vec(gen, inst, 2, chan);
+ int v3 = get_dst_vec(gen, inst, chan);
+
+ /* emit ALU */
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_MAD:
+ ppc_vmaddfp(gen->f, v3, v0, v1, v2); /* v3 = v0 * v1 + v2 */
+ break;
+ case TGSI_OPCODE_LRP:
+ ppc_vsubfp(gen->f, v3, v1, v2); /* v3 = v1 - v2 */
+ ppc_vmaddfp(gen->f, v3, v0, v3, v2); /* v3 = v0 * v3 + v2 */
+ break;
+ default:
+ assert(0);
+ }
+
+ /* store v3 */
+ emit_store(gen, v3, inst, chan, TRUE);
+ }
+
+ release_src_vecs(gen);
}
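The LRP case above needs only two instructions because lerp(a, b, c) = a*b + (1-a)*c is rewritten as a*(b-c) + c; a scalar reference for both tri-operand opcodes (illustration, not emitted code):

/* MAD: dst = src0 * src1 + src2 (a single vmaddfp). */
static float mad_ref(float a, float b, float c)
{
   return a * b + c;
}

/* LRP: dst = src0*src1 + (1-src0)*src2, emitted as vsubfp + vmaddfp
 * using the algebraically equivalent form src0*(src1-src2) + src2. */
static float lrp_ref(float a, float b, float c)
{
   float t = b - c;   /* vsubfp  */
   return a * t + c;  /* vmaddfp */
}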
@@ -452,16 +712,15 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
static void
emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
{
- int v0 = ppc_allocate_vec_register(gen->f);
- int v1 = ppc_allocate_vec_register(gen->f);
- int v2 = ppc_allocate_vec_register(gen->f);
- uint chan_index;
- boolean complement = FALSE;
+ uint chan;
int one_vec = gen_one_vec(gen);
- FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
- FETCH(gen, *inst, v0, 0, chan_index); /* v0 = srcreg[0] */
- FETCH(gen, *inst, v1, 1, chan_index); /* v1 = srcreg[1] */
+ FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+ /* fetch src operands */
+ int v0 = get_src_vec(gen, inst, 0, chan);
+ int v1 = get_src_vec(gen, inst, 1, chan);
+ int v2 = get_dst_vec(gen, inst, chan);
+ boolean complement = FALSE;
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_SNE:
@@ -495,89 +754,56 @@ emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
else
ppc_vand(gen->f, v2, one_vec, v2); /* v2 = one_vec & v2 */
- STORE(gen, *inst, v2, 0, chan_index); /* store v2 */
+ /* store v2 */
+ emit_store(gen, v2, inst, chan, TRUE);
}
- ppc_release_vec_register(gen->f, v0);
- ppc_release_vec_register(gen->f, v1);
- ppc_release_vec_register(gen->f, v2);
+ release_src_vecs(gen);
}
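The inequality opcodes build a 0.0/1.0 result by AND-ing an all-ones/all-zeros compare mask with the preloaded one_vec (a vector of 1.0f); a scalar model of SNE under that scheme (sketch only):

#include <stdint.h>
#include <string.h>

/* SNE modeled the way the generated code works: form an all-ones or
 * all-zeros mask from the compare, then AND it with the bits of 1.0f. */
static float sne_ref(float a, float b)
{
   uint32_t mask = (a != b) ? 0xffffffffu : 0u;   /* compare result */
   uint32_t one_bits;
   float one = 1.0f, result;
   memcpy(&one_bits, &one, sizeof(one_bits));
   one_bits &= mask;                              /* one_vec & mask */
   memcpy(&result, &one_bits, sizeof(result));
   return result;                                 /* 1.0f or 0.0f   */
}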
static void
emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
{
- int v0 = ppc_allocate_vec_register(gen->f);
- int v1 = ppc_allocate_vec_register(gen->f);
- int v2 = ppc_allocate_vec_register(gen->f);
+ int v0, v1, v2;
uint chan_index;
+ v2 = ppc_allocate_vec_register(gen->f);
+
ppc_vxor(gen->f, v2, v2, v2); /* v2 = {0, 0, 0, 0} */
- FETCH(gen, *inst, v0, 0, CHAN_X); /* v0 = src0.XXXX */
- FETCH(gen, *inst, v1, 1, CHAN_X); /* v1 = src1.XXXX */
+ v0 = get_src_vec(gen, inst, 0, CHAN_X); /* v0 = src0.XXXX */
+ v1 = get_src_vec(gen, inst, 1, CHAN_X); /* v1 = src1.XXXX */
ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
- FETCH(gen, *inst, v0, 0, CHAN_Y); /* v0 = src0.YYYY */
- FETCH(gen, *inst, v1, 1, CHAN_Y); /* v1 = src1.YYYY */
+ v0 = get_src_vec(gen, inst, 0, CHAN_Y); /* v0 = src0.YYYY */
+ v1 = get_src_vec(gen, inst, 1, CHAN_Y); /* v1 = src1.YYYY */
ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
- FETCH(gen, *inst, v0, 0, CHAN_Z); /* v0 = src0.ZZZZ */
- FETCH(gen, *inst, v1, 1, CHAN_Z); /* v1 = src1.ZZZZ */
+ v0 = get_src_vec(gen, inst, 0, CHAN_Z); /* v0 = src0.ZZZZ */
+ v1 = get_src_vec(gen, inst, 1, CHAN_Z); /* v1 = src1.ZZZZ */
ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) {
- FETCH(gen, *inst, v0, 0, CHAN_W); /* v0 = src0.WWWW */
- FETCH(gen, *inst, v1, 1, CHAN_W); /* v1 = src1.WWWW */
- ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+ v0 = get_src_vec(gen, inst, 0, CHAN_W); /* v0 = src0.WWWW */
+ v1 = get_src_vec(gen, inst, 1, CHAN_W); /* v1 = src1.WWWW */
+ ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
}
else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) {
- FETCH(gen, *inst, v1, 1, CHAN_W); /* v1 = src1.WWWW */
- ppc_vaddfp(gen->f, v2, v2, v1); /* v2 = v2 + v1 */
+ v1 = get_src_vec(gen, inst, 1, CHAN_W); /* v1 = src1.WWWW */
+ ppc_vaddfp(gen->f, v2, v2, v1); /* v2 = v2 + v1 */
}
FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
- STORE(gen, *inst, v2, 0, chan_index); /* store v2 */
+ emit_store(gen, v2, inst, chan_index, FALSE); /* store v2, free v2 later */
}
- ppc_release_vec_register(gen->f, v0);
- ppc_release_vec_register(gen->f, v1);
- ppc_release_vec_register(gen->f, v2);
-}
+ release_src_vecs(gen);
-static void
-emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
- int v0 = ppc_allocate_vec_register(gen->f);
- int v1 = ppc_allocate_vec_register(gen->f);
- int v2 = ppc_allocate_vec_register(gen->f);
- int v3 = ppc_allocate_vec_register(gen->f);
- uint chan_index;
- FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
- FETCH(gen, *inst, v0, 0, chan_index); /* v0 = srcreg[0] */
- FETCH(gen, *inst, v1, 1, chan_index); /* v1 = srcreg[1] */
- FETCH(gen, *inst, v2, 2, chan_index); /* v2 = srcreg[2] */
- switch (inst->Instruction.Opcode) {
- case TGSI_OPCODE_MAD:
- ppc_vmaddfp(gen->f, v3, v0, v1, v2); /* v3 = v0 * v1 + v2 */
- break;
- case TGSI_OPCODE_LRP:
- ppc_vsubfp(gen->f, v3, v1, v2); /* v3 = v1 - v2 */
- ppc_vmaddfp(gen->f, v3, v0, v3, v2); /* v3 = v0 * v3 + v2 */
- break;
- default:
- assert(0);
- }
- STORE(gen, *inst, v3, 0, chan_index); /* store v3 */
- }
- ppc_release_vec_register(gen->f, v0);
- ppc_release_vec_register(gen->f, v1);
ppc_release_vec_register(gen->f, v2);
- ppc_release_vec_register(gen->f, v3);
}
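Because the data is SoA, the dot product accumulates one component at a time with vmaddfp into a running sum; a scalar reference for the three variants handled here (illustrative only):

/* DP3/DP4/DPH accumulation order as emitted above: start from zero,
 * then multiply-accumulate X, Y, Z, and either W*W (DP4) or +src1.w (DPH).
 * The result is broadcast to every write-enabled channel. */
static float dp_ref(const float src0[4], const float src1[4],
                    int is_dp4, int is_dph)
{
   float sum = 0.0f;                /* vxor v2,v2,v2 */
   sum += src0[0] * src1[0];        /* vmaddfp, X    */
   sum += src0[1] * src1[1];        /* vmaddfp, Y    */
   sum += src0[2] * src1[2];        /* vmaddfp, Z    */
   if (is_dp4)
      sum += src0[3] * src1[3];     /* vmaddfp, W    */
   else if (is_dph)
      sum += src1[3];               /* vaddfp        */
   return sum;
}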
-
/** Approximation for vr = pow(va, vb) */
static void
ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb)
@@ -604,43 +830,42 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
/* Compute X */
if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
- STORE(gen, *inst, one_vec, 0, CHAN_X);
+ emit_store(gen, one_vec, inst, CHAN_X, FALSE);
}
/* Compute Y, Z */
if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
- int x_vec = ppc_allocate_vec_register(gen->f);
+ int x_vec;
int zero_vec = ppc_allocate_vec_register(gen->f);
- FETCH(gen, *inst, x_vec, 0, CHAN_X); /* x_vec = src[0].x */
+ x_vec = get_src_vec(gen, inst, 0, CHAN_X); /* x_vec = src[0].x */
ppc_vzero(gen->f, zero_vec); /* zero = {0,0,0,0} */
ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */
if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
- STORE(gen, *inst, x_vec, 0, CHAN_Y); /* store Y */
+ emit_store(gen, x_vec, inst, CHAN_Y, FALSE);
}
if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
- int y_vec = ppc_allocate_vec_register(gen->f);
+ int y_vec, w_vec;
int z_vec = ppc_allocate_vec_register(gen->f);
- int w_vec = ppc_allocate_vec_register(gen->f);
int pow_vec = ppc_allocate_vec_register(gen->f);
int pos_vec = ppc_allocate_vec_register(gen->f);
int p128_vec = ppc_allocate_vec_register(gen->f);
int n128_vec = ppc_allocate_vec_register(gen->f);
- FETCH(gen, *inst, y_vec, 0, CHAN_Y); /* y_vec = src[0].y */
+ y_vec = get_src_vec(gen, inst, 0, CHAN_Y); /* y_vec = src[0].y */
ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */
- FETCH(gen, *inst, w_vec, 0, CHAN_W); /* w_vec = src[0].w */
+ w_vec = get_src_vec(gen, inst, 0, CHAN_W); /* w_vec = src[0].w */
- /* clamp Y to [-128, 128] */
+ /* clamp W to [-128, 128] */
load_constant_vec(gen, p128_vec, 128.0f);
load_constant_vec(gen, n128_vec, -128.0f);
- ppc_vmaxfp(gen->f, y_vec, y_vec, n128_vec); /* y = max(y, -128) */
- ppc_vminfp(gen->f, y_vec, y_vec, p128_vec); /* y = min(y, 128) */
+ ppc_vmaxfp(gen->f, w_vec, w_vec, n128_vec); /* w = max(w, -128) */
+ ppc_vminfp(gen->f, w_vec, w_vec, p128_vec); /* w = min(w, 128) */
/* if temp.x > 0
* z = pow(tmp.y, tmp.w)
@@ -651,34 +876,216 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 */
ppc_vand(gen->f, z_vec, pow_vec, pos_vec); /* z = pow & pos */
- STORE(gen, *inst, z_vec, 0, CHAN_Z); /* store Z */
+ emit_store(gen, z_vec, inst, CHAN_Z, FALSE);
- ppc_release_vec_register(gen->f, y_vec);
ppc_release_vec_register(gen->f, z_vec);
- ppc_release_vec_register(gen->f, w_vec);
ppc_release_vec_register(gen->f, pow_vec);
ppc_release_vec_register(gen->f, pos_vec);
ppc_release_vec_register(gen->f, p128_vec);
ppc_release_vec_register(gen->f, n128_vec);
}
- ppc_release_vec_register(gen->f, x_vec);
ppc_release_vec_register(gen->f, zero_vec);
}
/* Compute W */
if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
- STORE(gen, *inst, one_vec, 0, CHAN_W);
+ emit_store(gen, one_vec, inst, CHAN_W, FALSE);
+ }
+
+ release_src_vecs(gen);
+}
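A scalar summary of the LIT semantics being generated above, with the specular exponent clamped to [-128, 128] and the pow result masked by the x > 0 test; note the real code uses the ppc_vec_pow approximation rather than libm (plain C reference):

#include <math.h>

/* Reference for TGSI LIT; the generated code does this per channel on
 * SoA vectors. */
static void lit_ref(const float src[4], float dst[4])
{
   const float x = src[0] > 0.0f ? src[0] : 0.0f;
   const float y = src[1] > 0.0f ? src[1] : 0.0f;
   float w = src[3];
   if (w < -128.0f) w = -128.0f;   /* clamp W to [-128, 128] */
   if (w >  128.0f) w =  128.0f;
   dst[0] = 1.0f;
   dst[1] = x;
   dst[2] = (src[0] > 0.0f) ? powf(y, w) : 0.0f;  /* pow masked by (x > 0) */
   dst[3] = 1.0f;
}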
+
+
+static void
+emit_exp(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ const int one_vec = gen_one_vec(gen);
+ int src_vec;
+
+ /* get src arg */
+ src_vec = get_src_vec(gen, inst, 0, CHAN_X);
+
+ /* Compute X = 2^floor(src) */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+ int dst_vec = get_dst_vec(gen, inst, CHAN_X);
+ int tmp_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vrfim(gen->f, tmp_vec, src_vec); /* tmp = floor(src); */
+ ppc_vexptefp(gen->f, dst_vec, tmp_vec); /* dst = 2 ^ tmp */
+ emit_store(gen, dst_vec, inst, CHAN_X, TRUE);
+ ppc_release_vec_register(gen->f, tmp_vec);
+ }
+
+ /* Compute Y = src - floor(src) */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+ int dst_vec = get_dst_vec(gen, inst, CHAN_Y);
+ int tmp_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vrfim(gen->f, tmp_vec, src_vec); /* tmp = floor(src); */
+ ppc_vsubfp(gen->f, dst_vec, src_vec, tmp_vec); /* dst = src - tmp */
+ emit_store(gen, dst_vec, inst, CHAN_Y, TRUE);
+ ppc_release_vec_register(gen->f, tmp_vec);
+ }
+
+ /* Compute Z = RoughApprox2ToX(src) */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ int dst_vec = get_dst_vec(gen, inst, CHAN_Z);
+ ppc_vexptefp(gen->f, dst_vec, src_vec); /* dst = 2 ^ src */
+ emit_store(gen, dst_vec, inst, CHAN_Z, TRUE);
+ }
+
+ /* Compute W = 1.0 */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+ emit_store(gen, one_vec, inst, CHAN_W, FALSE);
+ }
+
+ release_src_vecs(gen);
+}
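A scalar reference for the EXP semantics emitted above; the Z channel in the generated code comes from vexptefp, which is only an estimate of 2^x (illustration, not emitted code):

#include <math.h>

/* Reference for TGSI EXP. */
static void exp_ref(float src, float dst[4])
{
   const float flr = floorf(src);
   dst[0] = exp2f(flr);    /* X = 2^floor(src)     */
   dst[1] = src - flr;     /* Y = src - floor(src) */
   dst[2] = exp2f(src);    /* Z = rough 2^src      */
   dst[3] = 1.0f;          /* W = 1                */
}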
+
+
+static void
+emit_log(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ const int bit31_vec = gen_get_bit31_vec(gen);
+ const int one_vec = gen_one_vec(gen);
+ int src_vec, abs_vec;
+
+ /* get src arg */
+ src_vec = get_src_vec(gen, inst, 0, CHAN_X);
+
+ /* compute abs(src) */
+ abs_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vandc(gen->f, abs_vec, src_vec, bit31_vec); /* abs = src & ~bit31 */
+
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) &&
+ IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+
+ /* compute tmp = floor(log2(abs)) */
+ int tmp_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vlogefp(gen->f, tmp_vec, abs_vec); /* tmp = log2(abs) */
+ ppc_vrfim(gen->f, tmp_vec, tmp_vec); /* tmp = floor(tmp); */
+
+ /* Compute X = tmp */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+ emit_store(gen, tmp_vec, inst, CHAN_X, FALSE);
+ }
+
+ /* Compute Y = abs / 2^tmp */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+ const int zero_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vzero(gen->f, zero_vec);
+ ppc_vexptefp(gen->f, tmp_vec, tmp_vec); /* tmp = 2 ^ tmp */
+ ppc_vrefp(gen->f, tmp_vec, tmp_vec); /* tmp = 1 / tmp */
+ /* tmp = abs * tmp + zero */
+ ppc_vmaddfp(gen->f, tmp_vec, abs_vec, tmp_vec, zero_vec);
+ emit_store(gen, tmp_vec, inst, CHAN_Y, FALSE);
+ ppc_release_vec_register(gen->f, zero_vec);
+ }
+
+ ppc_release_vec_register(gen->f, tmp_vec);
+ }
+
+ /* Compute Z = RoughApproxLog2(abs) */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ int dst_vec = get_dst_vec(gen, inst, CHAN_Z);
+ ppc_vlogefp(gen->f, dst_vec, abs_vec); /* dst = log2(abs) */
+ emit_store(gen, dst_vec, inst, CHAN_Z, TRUE);
+ }
+
+ /* Compute W = 1.0 */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+ emit_store(gen, one_vec, inst, CHAN_W, FALSE);
}
+
+ ppc_release_vec_register(gen->f, abs_vec);
+ release_src_vecs(gen);
+}
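Likewise for LOG, where the generated code relies on the vlogefp/vexptefp/vrefp estimates instead of exact libm calls (plain C reference, illustrative only):

#include <math.h>

/* Reference for TGSI LOG. */
static void log_ref(float src, float dst[4])
{
   const float a = fabsf(src);         /* abs = src & ~bit31       */
   const float e = floorf(log2f(a));   /* tmp = floor(log2(abs))   */
   dst[0] = e;                         /* X = exponent             */
   dst[1] = a / exp2f(e);              /* Y = abs / 2^floor(log2)  */
   dst[2] = log2f(a);                  /* Z = rough log2(abs)      */
   dst[3] = 1.0f;                      /* W = 1                    */
}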
+
+
+static void
+emit_pow(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ int s0_vec = get_src_vec(gen, inst, 0, CHAN_X);
+ int s1_vec = get_src_vec(gen, inst, 1, CHAN_X);
+ int pow_vec = ppc_allocate_vec_register(gen->f);
+ int chan;
+
+ ppc_vec_pow(gen->f, pow_vec, s0_vec, s1_vec);
+
+ FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+ emit_store(gen, pow_vec, inst, chan, FALSE);
+ }
+
+ ppc_release_vec_register(gen->f, pow_vec);
+
+ release_src_vecs(gen);
}
+static void
+emit_xpd(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ int x0_vec, y0_vec, z0_vec;
+ int x1_vec, y1_vec, z1_vec;
+ int zero_vec, tmp_vec;
+
+ zero_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vzero(gen->f, zero_vec);
+
+ tmp_vec = ppc_allocate_vec_register(gen->f);
+
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
+ IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ x0_vec = get_src_vec(gen, inst, 0, CHAN_X);
+ x1_vec = get_src_vec(gen, inst, 1, CHAN_X);
+ }
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
+ IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ y0_vec = get_src_vec(gen, inst, 0, CHAN_Y);
+ y1_vec = get_src_vec(gen, inst, 1, CHAN_Y);
+ }
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
+ IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+ z0_vec = get_src_vec(gen, inst, 0, CHAN_Z);
+ z1_vec = get_src_vec(gen, inst, 1, CHAN_Z);
+ }
+
+ IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) {
+ /* tmp = y0 * z1 */
+ ppc_vmaddfp(gen->f, tmp_vec, y0_vec, z1_vec, zero_vec);
+ /* tmp = tmp - z0 * y1*/
+ ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, z0_vec, y1_vec);
+ emit_store(gen, tmp_vec, inst, CHAN_X, FALSE);
+ }
+ IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) {
+ /* tmp = z0 * x1 */
+ ppc_vmaddfp(gen->f, tmp_vec, z0_vec, x1_vec, zero_vec);
+ /* tmp = tmp - x0 * z1 */
+ ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, x0_vec, z1_vec);
+ emit_store(gen, tmp_vec, inst, CHAN_Y, FALSE);
+ }
+ IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) {
+ /* tmp = x0 * y1 */
+ ppc_vmaddfp(gen->f, tmp_vec, x0_vec, y1_vec, zero_vec);
+ /* tmp = tmp - y0 * x1 */
+ ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, y0_vec, x1_vec);
+ emit_store(gen, tmp_vec, inst, CHAN_Z, FALSE);
+ }
+ /* W is undefined */
+
+ ppc_release_vec_register(gen->f, tmp_vec);
+ ppc_release_vec_register(gen->f, zero_vec);
+ release_src_vecs(gen);
+}
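The three channels above each map to a vmaddfp followed by a vnmsubfp; a scalar reference for the cross product being emitted (illustrative only):

/* Reference for TGSI XPD (cross product). */
static void xpd_ref(const float a[3], const float b[3], float dst[4])
{
   dst[0] = a[1] * b[2] - a[2] * b[1];   /* X = y0*z1 - z0*y1 */
   dst[1] = a[2] * b[0] - a[0] * b[2];   /* Y = z0*x1 - x0*z1 */
   dst[2] = a[0] * b[1] - a[1] * b[0];   /* Z = x0*y1 - y0*x1 */
   /* W is left undefined, matching the generated code. */
}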
+
static int
emit_instruction(struct gen_context *gen,
struct tgsi_full_instruction *inst)
{
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_MOV:
+ case TGSI_OPCODE_SWZ:
case TGSI_OPCODE_ABS:
case TGSI_OPCODE_FLOOR:
case TGSI_OPCODE_FRAC:
@@ -717,17 +1124,28 @@ emit_instruction(struct gen_context *gen,
case TGSI_OPCODE_LIT:
emit_lit(gen, inst);
break;
+ case TGSI_OPCODE_LOG:
+ emit_log(gen, inst);
+ break;
+ case TGSI_OPCODE_EXP:
+ emit_exp(gen, inst);
+ break;
+ case TGSI_OPCODE_POW:
+ emit_pow(gen, inst);
+ break;
+ case TGSI_OPCODE_XPD:
+ emit_xpd(gen, inst);
+ break;
case TGSI_OPCODE_END:
/* normal end */
return 1;
default:
return 0;
}
-
-
return 1;
}
+
static void
emit_declaration(
struct ppc_function *func,
@@ -805,6 +1223,7 @@ emit_epilogue(struct ppc_function *func)
{
ppc_return(func);
/* XXX restore prev stack frame */
+ debug_printf("PPC: Emitted %u instructions\n", func->num_inst);
}
@@ -837,17 +1256,14 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
if (!use_ppc_asm)
return FALSE;
+ if (0) {
+ debug_printf("\n********* TGSI->PPC ********\n");
+ tgsi_dump(tokens, 0);
+ }
+
util_init_math();
- gen.f = func;
- gen.inputs_reg = ppc_reserve_register(func, 3); /* first function param */
- gen.outputs_reg = ppc_reserve_register(func, 4); /* second function param */
- gen.temps_reg = ppc_reserve_register(func, 5); /* ... */
- gen.immed_reg = ppc_reserve_register(func, 6);
- gen.const_reg = ppc_reserve_register(func, 7);
- gen.builtins_reg = ppc_reserve_register(func, 8);
- gen.one_vec = -1;
- gen.bit31_vec = -1;
+ init_gen_context(&gen, func);
emit_prologue(func);
@@ -878,19 +1294,14 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
/* splat each immediate component into a float[4] vector for SoA */
{
const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
- float *imm = (float *) immediates;
uint i;
assert(size <= 4);
assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
for (i = 0; i < size; i++) {
- const float value =
- parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
- imm[num_immediates * 4 + 0] =
- imm[num_immediates * 4 + 1] =
- imm[num_immediates * 4 + 2] =
- imm[num_immediates * 4 + 3] = value;
- num_immediates++;
+ immediates[num_immediates][i] =
+ parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
}
+ num_immediates++;
}
break;
@@ -904,6 +1315,14 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
tgsi_parse_free( &parse );
+ if (ppc_num_instructions(func) == 0) {
+ /* ran out of memory for instructions */
+ ok = FALSE;
+ }
+
+ if (!ok)
+ debug_printf("TGSI->PPC translation failed\n");
+
return ok;
}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index 11659247c0..bc7b941b78 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -153,17 +153,21 @@ check_register_usage(
if (!check_file_name( ctx, file ))
return FALSE;
- if (index < 0 || index > MAX_REGISTERS) {
- report_error( ctx, "%s[%i]: Invalid index %s", file_names[file], index, name );
- return FALSE;
- }
-
if (indirect_access) {
+ /* Note that 'index' is an offset relative to the value of the
+ * address register. No range checking done here.
+ */
if (!is_any_register_declared( ctx, file ))
report_error( ctx, "%s: Undeclared %s register", file_names[file], name );
ctx->regs_ind_used[file] = TRUE;
}
else {
+ if (index < 0 || index > MAX_REGISTERS) {
+ report_error( ctx, "%s[%i]: Invalid index %s",
+ file_names[file], index, name );
+ return FALSE;
+ }
+
if (!is_register_declared( ctx, file, index ))
report_error( ctx, "%s[%d]: Undeclared %s register", file_names[file], index, name );
ctx->regs_used[file][index / BITS_IN_REG_FLAG] |= (1 << (index % BITS_IN_REG_FLAG));
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index f79170b9d6..f93db18114 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -27,12 +27,14 @@
#include "pipe/p_config.h"
-#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_X86)
#include "pipe/p_debug.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
+#if defined(PIPE_ARCH_SSE)
#include "util/u_sse.h"
+#endif
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi_exec.h"
@@ -72,6 +74,9 @@
#define TEMP_R0 TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
+#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
+#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
+
/**
* X86 utility functions.
@@ -233,6 +238,9 @@ emit_const(
int indirectIndex )
{
if (indirect) {
+ /* 'vec' is the offset from the address register's value.
+ * We're loading CONST[ADDR+vec] into an xmm register.
+ */
struct x86_reg r0 = get_input_base();
struct x86_reg r1 = get_output_base();
uint i;
@@ -243,18 +251,40 @@ emit_const(
x86_push( func, r0 );
x86_push( func, r1 );
+ /*
+ * Loop over the four pixels or vertices in the quad.
+ * Get the value of the address (offset) register for pixel/vertex[i],
+ * add it to the src offset and index into the constant buffer.
+ * Note that we're working on SOA data.
+ * If any of the pixel/vertex execution channels are unused their
+ * values will be garbage. It's very important that we don't use
+ * those garbage values as indexes into the constant buffer since
+ * that'll cause segfaults.
+ * The solution is to bitwise-AND the offset with the execution mask
+ * register whose values are either 0 or ~0.
+ * The caller must setup the execution mask register to indicate
+ * which channels are valid/alive before running the shader.
+ * The execution mask will also figure into loops and conditionals
+ * someday.
+ */
for (i = 0; i < QUAD_SIZE; i++) {
- x86_lea( func, r0, get_const( vec, chan ) );
+ /* r1 = address register[i] */
x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
+ /* r0 = execution mask[i] */
+ x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
+ /* r1 = r1 & r0 */
+ x86_and( func, r1, r0 );
+ /* r0 = 'vec', the offset */
+ x86_lea( func, r0, get_const( vec, chan ) );
- /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
+ /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
*/
x86_add( func, r1, r1 );
x86_add( func, r1, r1 );
x86_add( func, r1, r1 );
x86_add( func, r1, r1 );
- x86_add( func, r0, r1 );
+ x86_add( func, r0, r1 ); /* r0 = r0 + r1 */
x86_mov( func, r1, x86_deref( r0 ) );
x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
}
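A scalar model of the indirect path above: for each of the four pixels/vertices, the address-register value is AND-ed with the execution mask so dead channels index constant 'vec' instead of garbage. The names below (addr, exec_mask, constants) stand in for the real SoA temporaries and are illustrative only:

#define QUAD_SIZE 4

/* Illustrative model of the CONST[ADDR + vec] fetch with the
 * execution-mask guard; exec_mask entries are 0 or ~0. */
static void fetch_indirect_const(const int addr[QUAD_SIZE],
                                 const unsigned exec_mask[QUAD_SIZE],
                                 const float (*constants)[4],
                                 int vec, int chan,
                                 float out[QUAD_SIZE])
{
   int i;
   for (i = 0; i < QUAD_SIZE; i++) {
      int index = (int) (addr[i] & exec_mask[i]) + vec;  /* dead lane -> vec */
      out[i] = constants[index][chan];
   }
}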
@@ -268,6 +298,7 @@ emit_const(
get_temp( TEMP_R0, CHAN_X ) );
}
else {
+ /* 'vec' is the index into the src register file, such as TEMP[vec] */
assert( vec >= 0 );
sse_movss(
@@ -598,6 +629,9 @@ emit_func_call_dst_src(
code );
}
+
+#if defined(PIPE_ARCH_SSE)
+
/*
* Fast SSE2 implementation of special math functions.
*/
@@ -649,6 +683,7 @@ exp2f4(__m128 x)
return _mm_mul_ps(expipart, expfpart);
}
+
/**
* See http://www.devmaster.net/forums/showthread.php?p=43580
*/
@@ -691,12 +726,16 @@ log2f4(__m128 x)
return _mm_add_ps(logmant, exp);
}
+
static INLINE __m128
powf4(__m128 x, __m128 y)
{
return exp2f4(_mm_mul_ps(log2f4(x), y));
}
+#endif /* PIPE_ARCH_SSE */
+
+
/**
* Low-level instruction translators.
@@ -751,13 +790,20 @@ emit_cos(
}
static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
float *store )
{
+#if defined(PIPE_ARCH_SSE)
_mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
+#else
+ store[0] = util_fast_exp2( store[0] );
+ store[1] = util_fast_exp2( store[1] );
+ store[2] = util_fast_exp2( store[2] );
+ store[3] = util_fast_exp2( store[3] );
+#endif
}
static void
@@ -784,6 +830,17 @@ emit_f2it(
make_xmm( xmm ) );
}
+static void
+emit_i2f(
+ struct x86_function *func,
+ unsigned xmm )
+{
+ sse2_cvtdq2ps(
+ func,
+ make_xmm( xmm ),
+ make_xmm( xmm ) );
+}
+
static void PIPE_CDECL
flr4f(
float *store )
@@ -831,13 +888,20 @@ emit_frc(
}
static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
float *store )
{
+#if defined(PIPE_ARCH_SSE)
_mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
+#else
+ store[0] = util_fast_log2( store[0] );
+ store[1] = util_fast_log2( store[1] );
+ store[2] = util_fast_log2( store[2] );
+ store[3] = util_fast_log2( store[3] );
+#endif
}
static void
@@ -890,19 +954,19 @@ emit_neg(
}
static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
float *store )
{
-#if 1
+#if defined(PIPE_ARCH_SSE)
_mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
- store[0] = powf( store[0], store[4] );
- store[1] = powf( store[1], store[5] );
- store[2] = powf( store[2], store[6] );
- store[3] = powf( store[3], store[7] );
+ store[0] = util_fast_pow( store[0], store[4] );
+ store[1] = util_fast_pow( store[1], store[5] );
+ store[2] = util_fast_pow( store[2], store[6] );
+ store[3] = util_fast_pow( store[3], store[7] );
#endif
}
@@ -1702,7 +1766,18 @@ emit_instruction(
case TGSI_OPCODE_DOT2ADD:
/* TGSI_OPCODE_DP2A */
- return 0;
+ FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
+ FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
+ emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
+ FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
+ FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
+ emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
+ emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
+ FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
+ emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
+ }
break;
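The new DOT2ADD (DP2A) sequence computes a two-component dot product plus the X component of the third operand, replicated to every write-enabled channel; scalar reference (illustration only):

/* DP2A: dst = src0.x*src1.x + src0.y*src1.y + src2.x,
 * matching the FETCH/mul/add sequence above. */
static float dp2a_ref(const float src0[4], const float src1[4],
                      const float src2[4])
{
   return src0[0] * src1[0] + src0[1] * src1[1] + src2[0];
}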
case TGSI_OPCODE_INDEX:
@@ -2036,7 +2111,39 @@ emit_instruction(
break;
case TGSI_OPCODE_NRM:
- return 0;
+ /* fall-through */
+ case TGSI_OPCODE_NRM4:
+ /* 3 or 4-component normalization */
+ {
+ uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
+ /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */
+ FETCH( func, *inst, 4, 0, CHAN_X ); /* xmm4 = src[0].x */
+ FETCH( func, *inst, 5, 0, CHAN_Y ); /* xmm5 = src[0].y */
+ FETCH( func, *inst, 6, 0, CHAN_Z ); /* xmm6 = src[0].z */
+ if (dims == 4) {
+ FETCH( func, *inst, 7, 0, CHAN_W ); /* xmm7 = src[0].w */
+ }
+ emit_MOV( func, 0, 4 ); /* xmm0 = xmm4 */
+ emit_mul( func, 0, 4 ); /* xmm0 *= xmm4 */
+ emit_MOV( func, 1, 5 ); /* xmm1 = xmm5 */
+ emit_mul( func, 1, 5 ); /* xmm1 *= xmm5 */
+ emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
+ emit_MOV( func, 1, 6 ); /* xmm1 = xmm6 */
+ emit_mul( func, 1, 6 ); /* xmm1 *= xmm6 */
+ emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
+ if (dims == 4) {
+ emit_MOV( func, 1, 7 ); /* xmm1 = xmm7 */
+ emit_mul( func, 1, 7 ); /* xmm1 *= xmm7 */
+ emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
+ }
+ emit_rsqrt( func, 1, 0 ); /* xmm1 = 1/sqrt(xmm0) */
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ if (chan_index < dims) {
+ emit_mul( func, 4+chan_index, 1); /* xmm[4+ch] *= xmm1 */
+ STORE( func, *inst, 4+chan_index, 0, chan_index );
+ }
+ }
+ }
break;
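The NRM/NRM4 case squares and sums three or four components, takes a reciprocal square root, and scales each stored channel; a scalar reference using exact math, whereas the generated code uses the SSE reciprocal-square-root estimate (illustrative only):

#include <math.h>

/* Reference for NRM (3-component) / NRM4 (4-component) normalization. */
static void nrm_ref(const float src[4], float dst[4], unsigned dims)
{
   float sum = 0.0f;
   unsigned i;
   for (i = 0; i < dims; i++)
      sum += src[i] * src[i];
   {
      const float inv_len = 1.0f / sqrtf(sum);
      for (i = 0; i < dims; i++)
         dst[i] = src[i] * inv_len;
   }
}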
case TGSI_OPCODE_DIV:
@@ -2044,7 +2151,16 @@ emit_instruction(
break;
case TGSI_OPCODE_DP2:
- return 0;
+ FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
+ FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
+ emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
+ FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
+ FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
+ emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
+ emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
+ }
break;
case TGSI_OPCODE_TXL:
@@ -2104,7 +2220,12 @@ emit_instruction(
break;
case TGSI_OPCODE_TRUNC:
- return 0;
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ FETCH( func, *inst, 0, 0, chan_index );
+ emit_f2it( func, 0 );
+ emit_i2f( func, 0 );
+ STORE( func, *inst, 0, 0, chan_index );
+ }
break;
case TGSI_OPCODE_SHL:
diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c
index 3ed8bdfdf3..a1a51d7ef2 100644
--- a/src/gallium/auxiliary/util/p_debug.c
+++ b/src/gallium/auxiliary/util/p_debug.c
@@ -36,6 +36,13 @@
#include <windows.h>
#include <winddi.h>
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <windows.h>
+#include <types.h>
+
#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
#ifndef WIN32_LEAN_AND_MEAN
@@ -98,7 +105,35 @@ void _debug_vprintf(const char *format, va_list ap)
OutputDebugStringA(buf);
buf[0] = '\0';
}
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE) || defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+ wchar_t *wide_format;
+ long wide_str_len;
+ char buf[512];
+ int ret;
+#if (_WIN32_WCE < 600)
+ ret = vsprintf(buf, format, ap);
+ if(ret < 0){
+ sprintf(buf, "Cant handle debug print!");
+ ret = 25;
+ }
+#else
+ ret = vsprintf_s(buf, 512, format, ap);
+ if(ret < 0){
+ sprintf_s(buf, 512, "Can't handle debug print!");
+ ret = 25;
+ }
+#endif
+ buf[ret] = '\0';
+ /* Format is ascii - needs to be converted to wchar_t for printing */
+ wide_str_len = MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1, NULL, 0);
+ wide_format = (wchar_t *) malloc((wide_str_len+1) * sizeof(wchar_t));
+ if (wide_format) {
+ MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1,
+ wide_format, wide_str_len);
+ NKDbgPrintfW(wide_format, wide_format);
+ free(wide_format);
+ }
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
/* TODO */
#else /* !PIPE_SUBSYSTEM_WINDOWS */
vfprintf(stderr, format, ap);
@@ -637,6 +672,7 @@ void
debug_dump_surface_bmp(const char *filename,
struct pipe_surface *surface)
{
+#ifndef PIPE_SUBSYSTEM_WINDOWS_MINIPORT
struct util_stream *stream;
unsigned surface_usage;
struct bmp_file_header bmfh;
@@ -703,6 +739,7 @@ error2:
FREE(rgba);
error1:
;
+#endif
}
#endif
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index be7303e550..d2eaa2e7f7 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -68,7 +68,7 @@ __inline double ceil(double val)
return ceil_val;
}
-#ifndef PIPE_SUBSYSTEM_WINDOWS_CE
+#ifndef PIPE_SUBSYSTEM_WINDOWS_CE_OGL
__inline double floor(double val)
{
double floor_val;
diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c
index 0f51dd5977..45ce257b5e 100644
--- a/src/gallium/auxiliary/util/u_mm.c
+++ b/src/gallium/auxiliary/util/u_mm.c
@@ -31,7 +31,7 @@
void
-mmDumpMemInfo(const struct mem_block *heap)
+u_mmDumpMemInfo(const struct mem_block *heap)
{
debug_printf("Memory heap %p:\n", (void *)heap);
if (heap == 0) {
@@ -58,7 +58,7 @@ mmDumpMemInfo(const struct mem_block *heap)
}
struct mem_block *
-mmInit(int ofs, int size)
+u_mmInit(int ofs, int size)
{
struct mem_block *heap, *block;
@@ -165,13 +165,17 @@ SliceBlock(struct mem_block *p,
struct mem_block *
-mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
+u_mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
{
struct mem_block *p;
const int mask = (1 << align2)-1;
int startofs = 0;
int endofs;
+ assert(size >= 0);
+ assert(align2 >= 0);
+ assert(align2 <= 12); /* sanity check, 2^12 (4KB) enough? */
+
if (!heap || align2 < 0 || size <= 0)
return NULL;
@@ -198,7 +202,7 @@ mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
struct mem_block *
-mmFindBlock(struct mem_block *heap, int start)
+u_mmFindBlock(struct mem_block *heap, int start)
{
struct mem_block *p;
@@ -237,7 +241,7 @@ Join2Blocks(struct mem_block *p)
}
int
-mmFreeMem(struct mem_block *b)
+u_mmFreeMem(struct mem_block *b)
{
if (!b)
return 0;
@@ -266,7 +270,7 @@ mmFreeMem(struct mem_block *b)
void
-mmDestroy(struct mem_block *heap)
+u_mmDestroy(struct mem_block *heap)
{
struct mem_block *p;
diff --git a/src/gallium/auxiliary/util/u_mm.h b/src/gallium/auxiliary/util/u_mm.h
index b226b101cb..ce20e48763 100644
--- a/src/gallium/auxiliary/util/u_mm.h
+++ b/src/gallium/auxiliary/util/u_mm.h
@@ -49,7 +49,7 @@ struct mem_block {
* input: total size in bytes
* return: a heap pointer if OK, NULL if error
*/
-extern struct mem_block *mmInit(int ofs, int size);
+extern struct mem_block *u_mmInit(int ofs, int size);
/**
* Allocate 'size' bytes with 2^align2 bytes alignment,
@@ -61,7 +61,7 @@ extern struct mem_block *mmInit(int ofs, int size);
* startSearch = linear offset from start of heap to begin search
* return: pointer to the allocated block, 0 if error
*/
-extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2,
+extern struct mem_block *u_mmAllocMem(struct mem_block *heap, int size, int align2,
int startSearch);
/**
@@ -69,23 +69,23 @@ extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2
* input: pointer to a block
* return: 0 if OK, -1 if error
*/
-extern int mmFreeMem(struct mem_block *b);
+extern int u_mmFreeMem(struct mem_block *b);
/**
* Free block starts at offset
* input: pointer to a heap, start offset
* return: pointer to a block
*/
-extern struct mem_block *mmFindBlock(struct mem_block *heap, int start);
+extern struct mem_block *u_mmFindBlock(struct mem_block *heap, int start);
/**
* destroy MM
*/
-extern void mmDestroy(struct mem_block *mmInit);
+extern void u_mmDestroy(struct mem_block *mmInit);
/**
* For debuging purpose.
*/
-extern void mmDumpMemInfo(const struct mem_block *mmInit);
+extern void u_mmDumpMemInfo(const struct mem_block *mmInit);
#endif
diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c
index f5619ef791..30f32413d7 100644
--- a/src/gallium/auxiliary/util/u_rect.c
+++ b/src/gallium/auxiliary/util/u_rect.c
@@ -222,7 +222,8 @@ util_surface_copy(struct pipe_context *pipe,
w, h,
src_map,
do_flip ? -(int) src->stride : src->stride,
- src_x, src_y);
+ src_x,
+ do_flip ? w - src_y : src_y);
}
pipe->screen->surface_unmap(pipe->screen, src);
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index 853c503f4f..32f6b072a0 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -460,7 +460,7 @@ l8_put_tile_rgba(ubyte *dst,
for (j = 0; j < w; j++, pRow += 4) {
unsigned r;
r = float_to_ubyte(pRow[0]);
- *dst++ = r;
+ *dst++ = (ubyte) r;
}
p += src_stride;
}
@@ -504,7 +504,7 @@ a8_put_tile_rgba(ubyte *dst,
for (j = 0; j < w; j++, pRow += 4) {
unsigned a;
a = float_to_ubyte(pRow[3]);
- *dst++ = a;
+ *dst++ = (ubyte) a;
}
p += src_stride;
}
@@ -634,7 +634,7 @@ i8_put_tile_rgba(ubyte *dst,
for (j = 0; j < w; j++, pRow += 4) {
unsigned r;
r = float_to_ubyte(pRow[0]);
- *dst++ = r;
+ *dst++ = (ubyte) r;
}
p += src_stride;
}
@@ -769,6 +769,32 @@ z24s8_get_tile_rgba(const unsigned *src,
}
+/*** PIPE_FORMAT_Z32_FLOAT ***/
+
+/**
+ * Return each Z value as four floats in [0,1].
+ */
+static void
+z32f_get_tile_rgba(const float *src,
+ unsigned w, unsigned h,
+ float *p,
+ unsigned dst_stride)
+{
+ unsigned i, j;
+
+ for (i = 0; i < h; i++) {
+ float *pRow = p;
+ for (j = 0; j < w; j++, pRow += 4) {
+ pRow[0] =
+ pRow[1] =
+ pRow[2] =
+ pRow[3] = *src++;
+ }
+ p += dst_stride;
+ }
+}
+
+
/*** PIPE_FORMAT_YCBCR / PIPE_FORMAT_YCBCR_REV ***/
/**
@@ -913,6 +939,9 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
case PIPE_FORMAT_Z24S8_UNORM:
z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
break;
+ case PIPE_FORMAT_Z32_FLOAT:
+ z32f_get_tile_rgba((float *) src, w, h, dst, dst_stride);
+ break;
case PIPE_FORMAT_YCBCR:
ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, FALSE);
break;
diff --git a/src/gallium/auxiliary/util/u_time.c b/src/gallium/auxiliary/util/u_time.c
index bf7d1d1c8d..57b80e5604 100644
--- a/src/gallium/auxiliary/util/u_time.c
+++ b/src/gallium/auxiliary/util/u_time.c
@@ -200,7 +200,7 @@ util_time_timeout(const struct util_time *start,
}
-#if defined(PIPE_SUBSYSYEM_WINDOWS_DISPLAY)
+#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
void util_time_sleep(unsigned usecs)
{
LONGLONG start, curr, end;
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 23fb0b0831..87488ea2d7 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -122,7 +122,7 @@
#define CELL_DEBUG_CACHE (1 << 6)
/** Max instructions for doing per-fragment operations */
-#define SPU_MAX_FRAGMENT_OPS_INSTS 64
+#define SPU_MAX_FRAGMENT_OPS_INSTS 128
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 448b723d85..962775cd33 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -100,12 +100,23 @@ emit_fence(struct cell_context *cell)
const uint batch = cell->cur_batch;
const uint size = cell->buffer_size[batch];
struct cell_command_fence *fence_cmd;
+ struct cell_fence *fence = &cell->fenced_buffers[batch].fence;
+ uint i;
+
+ /* set fence status to emitted, not yet signalled */
+ for (i = 0; i < cell->num_spus; i++) {
+ fence->status[i][0] = CELL_FENCE_EMITTED;
+ }
ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
fence_cmd->opcode = CELL_CMD_FENCE;
- fence_cmd->fence = &cell->fenced_buffers[batch].fence;
+ fence_cmd->fence = fence;
+
+ /* update batch buffer size */
+ cell->buffer_size[batch] = size + sizeof(struct cell_command_fence);
+ assert(sizeof(struct cell_command_fence) % 8 == 0);
}
@@ -119,7 +130,7 @@ cell_batch_flush(struct cell_context *cell)
{
static boolean flushing = FALSE;
uint batch = cell->cur_batch;
- const uint size = cell->buffer_size[batch];
+ uint size = cell->buffer_size[batch];
uint spu, cmd_word;
assert(!flushing);
@@ -130,8 +141,10 @@ cell_batch_flush(struct cell_context *cell)
/* Before we use this batch buffer, make sure any fenced texture buffers
* are released.
*/
- if (cell->fenced_buffers[batch].head)
+ if (cell->fenced_buffers[batch].head) {
emit_fence(cell);
+ size = cell->buffer_size[batch];
+ }
flushing = TRUE;
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index c9c0c721bb..037635e466 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -106,4 +106,17 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
clr->surface = surfIndex;
clr->value = clearValue;
}
+
+ /* Technically, the surface's contents are now known and cleared,
+ * so we could set the status to PIPE_SURFACE_STATUS_CLEAR. But
+ * it turns out it's quite painful to recognize when any particular
+ * surface goes from PIPE_SURFACE_STATUS_CLEAR to
+ * PIPE_SURFACE_STATUS_DEFINED (i.e. with known contents), because
+ * the drawing commands could be operating on numerous draw buffers,
+ * which we'd have to iterate through to set all their statuses...
+ * For now, we cheat a bit and set the surface's status to DEFINED
+ * right here. Later we should revisit this and set the status to
+ * CLEAR here, and find a better place to set the status to DEFINED.
+ */
+ ps->status = PIPE_SURFACE_STATUS_DEFINED;
}
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 4491ae8cdf..eb1397bb3f 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -89,7 +89,7 @@ struct cell_buffer_node;
*/
struct cell_buffer_list
{
- struct cell_fence fence;
+ struct cell_fence fence ALIGN16_ATTRIB;
struct cell_buffer_node *head;
};
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c
index ffb3bea12b..867b5dcaa0 100644
--- a/src/gallium/drivers/cell/ppu/cell_fence.c
+++ b/src/gallium/drivers/cell/ppu/cell_fence.c
@@ -38,6 +38,7 @@ void
cell_fence_init(struct cell_fence *fence)
{
uint i;
+ ASSERT_ALIGN16(fence->status);
for (i = 0; i < CELL_MAX_SPUS; i++) {
fence->status[i][0] = CELL_FENCE_IDLE;
}
@@ -50,9 +51,9 @@ cell_fence_signalled(const struct cell_context *cell,
{
uint i;
for (i = 0; i < cell->num_spus; i++) {
- //ASSERT(fence->status[i][0] != CELL_FENCE_IDLE);
- if (fence->status[i][0] == CELL_FENCE_EMITTED)
+ if (fence->status[i][0] != CELL_FENCE_SIGNALLED)
return FALSE;
+ /*assert(fence->status[i][0] == CELL_FENCE_EMITTED);*/
}
return TRUE;
}
@@ -65,6 +66,15 @@ cell_fence_finish(const struct cell_context *cell,
while (!cell_fence_signalled(cell, fence)) {
usleep(10);
}
+
+#ifdef DEBUG
+ {
+ uint i;
+ for (i = 0; i < cell->num_spus; i++) {
+ assert(fence->status[i][0] == CELL_FENCE_SIGNALLED);
+ }
+ }
+#endif
}
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
index 6596b72010..a64967b4b9 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.c
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -49,7 +49,7 @@ cell_flush(struct pipe_context *pipe, unsigned flags,
flags |= CELL_FLUSH_WAIT;
}
- if (flags & PIPE_FLUSH_SWAPBUFFERS)
+ if (flags & (PIPE_FLUSH_SWAPBUFFERS | PIPE_FLUSH_RENDER_CACHE))
flags |= CELL_FLUSH_WAIT;
draw_flush( cell->draw );
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index d4d644d6e8..5c41b264ac 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1303,60 +1303,91 @@ lookup_function(struct cell_context *cell, const char *funcname)
/**
* Emit code to call a SPU function.
* Used to implement instructions like SIN/COS/POW/TEX/etc.
+ * If scalar, only the X components of the src regs are used, and the
+ * result is replicated across the dest register's XYZW components.
*/
static boolean
emit_function_call(struct codegen *gen,
const struct tgsi_full_instruction *inst,
- char *funcname, uint num_args)
+ char *funcname, uint num_args, boolean scalar)
{
const uint addr = lookup_function(gen->cell, funcname);
char comment[100];
- int ch;
+ int s_regs[3];
+ int func_called = FALSE;
+ uint a, ch;
+ int retval_reg = -1;
assert(num_args <= 3);
snprintf(comment, sizeof(comment), "CALL %s:", funcname);
spe_comment(gen->f, -4, comment);
+ if (scalar) {
+ for (a = 0; a < num_args; a++) {
+ s_regs[a] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[a]);
+ }
+ /* we'll call the function, put the return value in this register,
+ * then replicate it across all write-enabled components in d_reg.
+ */
+ retval_reg = spe_allocate_available_register(gen->f);
+ }
+
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s_regs[3], d_reg;
+ int d_reg;
ubyte usedRegs[SPE_NUM_REGS];
- uint a, i, numUsed;
+ uint i, numUsed;
- for (a = 0; a < num_args; a++) {
- s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
+ if (!scalar) {
+ for (a = 0; a < num_args; a++) {
+ s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
+ }
}
- d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- numUsed = spe_get_registers_used(gen->f, usedRegs);
- assert(numUsed < gen->frame_size / 16 - 2);
+ d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- /* save registers to stack */
- for (i = 0; i < numUsed; i++) {
- uint reg = usedRegs[i];
- int offset = 2 + i;
- spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
- }
+ if (!scalar || !func_called) {
+ /* for a scalar function, we'll really only call the function once */
- /* setup function arguments */
- for (a = 0; a < num_args; a++) {
- spe_move(gen->f, 3 + a, s_regs[a]);
- }
+ numUsed = spe_get_registers_used(gen->f, usedRegs);
+ assert(numUsed < gen->frame_size / 16 - 2);
- /* branch to function, save return addr */
- spe_brasl(gen->f, SPE_REG_RA, addr);
+ /* save registers to stack */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ int offset = 2 + i;
+ spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
- /* save function's return value */
- spe_move(gen->f, d_reg, 3);
+ /* setup function arguments */
+ for (a = 0; a < num_args; a++) {
+ spe_move(gen->f, 3 + a, s_regs[a]);
+ }
- /* restore registers from stack */
- for (i = 0; i < numUsed; i++) {
- uint reg = usedRegs[i];
- if (reg != d_reg) {
- int offset = 2 + i;
- spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ /* branch to function, save return addr */
+ spe_brasl(gen->f, SPE_REG_RA, addr);
+
+ /* save function's return value */
+ if (scalar)
+ spe_move(gen->f, retval_reg, 3);
+ else
+ spe_move(gen->f, d_reg, 3);
+
+ /* restore registers from stack */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ if (reg != d_reg && reg != retval_reg) {
+ int offset = 2 + i;
+ spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
}
+
+ func_called = TRUE;
+ }
+
+ if (scalar) {
+ spe_move(gen->f, d_reg, retval_reg);
}
store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
@@ -1364,6 +1395,10 @@ emit_function_call(struct codegen *gen,
}
}
+ if (scalar) {
+ spe_release_register(gen->f, retval_reg);
+ }
+
return true;
}
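A sketch of the control flow the scalar path adds: the SPU helper is called once with the X-channel arguments, the return value is parked in a scratch register, and then copied into every write-enabled channel. The C below models that behavior only; 'func' stands in for spu_cos/spu_sin/etc. and this is not the emitted SPU code:

/* Model of the scalar function-call path: evaluate once, then splat
 * the result across all channels selected by the write mask. */
static void call_scalar_func(float (*func)(float),
                             float src_x, float dst[4],
                             unsigned writemask)
{
   const float retval = func(src_x);   /* called once, result reused  */
   unsigned ch;
   for (ch = 0; ch < 4; ch++) {
      if (writemask & (1u << ch))
         dst[ch] = retval;             /* splat into enabled channels */
   }
}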
@@ -1770,15 +1805,15 @@ emit_instruction(struct codegen *gen,
return emit_END(gen);
case TGSI_OPCODE_COS:
- return emit_function_call(gen, inst, "spu_cos", 1);
+ return emit_function_call(gen, inst, "spu_cos", 1, TRUE);
case TGSI_OPCODE_SIN:
- return emit_function_call(gen, inst, "spu_sin", 1);
+ return emit_function_call(gen, inst, "spu_sin", 1, TRUE);
case TGSI_OPCODE_POW:
- return emit_function_call(gen, inst, "spu_pow", 2);
+ return emit_function_call(gen, inst, "spu_pow", 2, TRUE);
case TGSI_OPCODE_EXPBASE2:
- return emit_function_call(gen, inst, "spu_exp2", 1);
+ return emit_function_call(gen, inst, "spu_exp2", 1, TRUE);
case TGSI_OPCODE_LOGBASE2:
- return emit_function_call(gen, inst, "spu_log2", 1);
+ return emit_function_call(gen, inst, "spu_log2", 1, TRUE);
case TGSI_OPCODE_TEX:
/* fall-through for now */
case TGSI_OPCODE_TXD:
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 4e1e53ecdc..d9c3ff3f4d 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1141,13 +1141,17 @@ gen_colormask(struct spe_function *f,
* access to the Compare Immediate instructions where we don't in
* gen_depth_test(), which is what makes us very different.
*
+ * There's some added complexity if there's a non-trivial state->value_mask;
+ * in that case both the stencil buffer value and the reference value must
+ * be masked before the comparison.
+ *
* The return value in the stencil_pass_reg is a bitmask of valid
* fragments that also passed the stencil test. The bitmask of valid
- * fragments that failed would be found in (mask_reg & ~stencil_pass_reg).
+ * fragments that failed would be found in (fragment_mask_reg & ~stencil_pass_reg).
*/
static void
gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
- unsigned int mask_reg, unsigned int fbS_reg,
+ unsigned int stencil_max_value,
+ unsigned int fragment_mask_reg, unsigned int fbS_reg,
unsigned int stencil_pass_reg)
{
/* Generate code that puts the set of passing fragments into the stencil_pass_reg
@@ -1155,68 +1159,134 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
*/
switch (state->func) {
case PIPE_FUNC_EQUAL:
- /* stencil_pass = mask & (s == reference) */
- spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
- spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (s == reference) */
+ spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
case PIPE_FUNC_NOTEQUAL:
- /* stencil_pass = mask & ~(s == reference) */
- spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
- spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & ~(s == reference) */
+ spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
case PIPE_FUNC_GREATER:
- /* stencil_pass = mask & (s > reference) */
- spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
- spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (s > reference) */
+ spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ((s&mask) > (reference&mask)) */
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
- case PIPE_FUNC_LESS: {
- /* stencil_pass = mask & (reference > s) */
- /* There's no convenient Compare Less Than Immediate instruction, so
- * we'll have to do this one the harder way, by loading a register and
- * comparing directly. Compare Logical Greater Than Word (clgt)
- * treats its operands as unsigned - no sign extension.
- */
- unsigned int tmp_reg = spe_allocate_available_register(f);
- spe_load_uint(f, tmp_reg, state->ref_value);
- spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
- spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
- spe_release_register(f, tmp_reg);
+ case PIPE_FUNC_LESS:
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (reference > s) */
+ /* There's no convenient Compare Less Than Immediate instruction, so
+ * we'll have to do this one the harder way, by loading a register and
+ * comparing directly. Compare Logical Greater Than Word (clgt)
+ * treats its operands as unsigned - no sign extension.
+ */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->value_mask & state->ref_value);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
- }
case PIPE_FUNC_LEQUAL:
- /* stencil_pass = mask & (s <= reference) = mask & ~(s > reference) */
- spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
- spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (s <= reference)
+ * = fragment_mask & ~(s > reference) */
+ spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
- case PIPE_FUNC_GEQUAL: {
- /* stencil_pass = mask & (s >= reference) = mask & ~(reference > s) */
- /* As above, we have to do this by loading a register */
- unsigned int tmp_reg = spe_allocate_available_register(f);
- spe_load_uint(f, tmp_reg, state->ref_value);
- spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
- spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
- spe_release_register(f, tmp_reg);
+ case PIPE_FUNC_GEQUAL:
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (s >= reference)
+ * = fragment_mask & ~(reference > s) */
+ /* As above, we have to do this by loading a register */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value & state->value_mask);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
- }
case PIPE_FUNC_NEVER:
- /* stencil_pass = mask & 0 = 0 */
+ /* stencil_pass = fragment_mask & 0 = 0 */
spe_load_uint(f, stencil_pass_reg, 0);
break;
case PIPE_FUNC_ALWAYS:
- /* stencil_pass = mask & 1 = mask */
- spe_move(f, stencil_pass_reg, mask_reg);
+ /* stencil_pass = fragment_mask & 1 = fragment_mask */
+ spe_move(f, stencil_pass_reg, fragment_mask_reg);
break;
}
/* The fragments that passed the stencil test are now in stencil_pass_reg.
- * The fragments that failed would be (mask_reg & ~stencil_pass_reg).
+ * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg).
*/
}
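A scalar model of the masked compare handled above: when value_mask does not cover the whole stencil range, both the buffer value and the reference are AND-ed with the mask before the comparison, and the pass mask is then AND-ed with the fragment mask. The C below is one fragment at a time with a local stand-in enum, whereas the real code operates on four fragments at once in SPU registers (illustration only):

/* 'CmpFunc' is a local stand-in for the pipe compare function. */
enum CmpFunc { CMP_LESS, CMP_EQUAL, CMP_GREATER };

static unsigned stencil_test_ref(unsigned s, unsigned ref,
                                 unsigned value_mask, unsigned fragment_mask,
                                 enum CmpFunc func)
{
   const unsigned ms = s   & value_mask;   /* masked stencil value   */
   const unsigned mr = ref & value_mask;   /* masked reference value */
   unsigned pass = 0;
   switch (func) {
   case CMP_LESS:    pass = (ms <  mr) ? ~0u : 0u; break;
   case CMP_EQUAL:   pass = (ms == mr) ? ~0u : 0u; break;
   case CMP_GREATER: pass = (ms >  mr) ? ~0u : 0u; break;
   }
   return pass & fragment_mask;   /* failing fragments: fragment_mask & ~result */
}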
@@ -1282,7 +1352,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
/* Add Word Immediate computes rT = rA + 10-bit signed immediate */
spe_ai(f, newS_reg, fbS_reg, 1);
/* Select from the current value or the new value based on the equality test */
- spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+ spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
spe_release_register(f, equals_reg);
break;
@@ -1295,7 +1365,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
/* Add Word Immediate with a (-1) value works */
spe_ai(f, newS_reg, fbS_reg, -1);
/* Select from the current value or the new value based on the equality test */
- spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+ spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
spe_release_register(f, equals_reg);
break;
@@ -1534,15 +1604,28 @@ gen_stencil_depth_test(struct spe_function *f,
* meaning that we have to calculate the stencil values but do not
* need to mask them), we can avoid generating code. Don't forget
* that we need to consider backfacing stencil, if enabled.
+ *
+ * Note that if the backface stencil is *not* enabled, the backface
+ * stencil will have the same values as the frontface stencil.
*/
- if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
- /* Trivial: don't need to calculate stencil values, and don't need to
- * write them back to the framebuffer.
+ if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
+ dsa->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
+ dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
+ dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
+ dsa->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
+ dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+ /* No changes to any stencil values */
+ need_to_calculate_stencil_values = false;
+ need_to_writemask_stencil_values = false;
+ }
+ else if (dsa->stencil[0].write_mask == 0x0 && dsa->stencil[1].write_mask == 0x0) {
+ /* All changes are writemasked out, so no need to calculate
+ * what those changes might be, and no need to write anything back.
*/
need_to_calculate_stencil_values = false;
need_to_writemask_stencil_values = false;
}
- else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) {
+ else if (dsa->stencil[0].write_mask == 0xff && dsa->stencil[1].write_mask == 0xff) {
/* Still trivial, but a little less so. We need to write the stencil
* values, but we don't need to mask them.
*/
@@ -1583,7 +1666,7 @@ gen_stencil_depth_test(struct spe_function *f,
*/
spe_comment(f, 0, "Running basic stencil test");
stencil_pass_reg = spe_allocate_available_register(f);
- gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
+ gen_stencil_test(f, &dsa->stencil[0], 0xff, mask_reg, fbS_reg, stencil_pass_reg);
/* If two-sided stenciling is on, generate code to run the stencil
* test on the backfacing stencil as well, and combine the two results
@@ -1592,7 +1675,7 @@ gen_stencil_depth_test(struct spe_function *f,
if (dsa->stencil[1].enabled) {
unsigned int temp_reg = spe_allocate_available_register(f);
spe_comment(f, 0, "Running backface stencil test");
- gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
+ gen_stencil_test(f, &dsa->stencil[1], 0xff, mask_reg, fbS_reg, temp_reg);
spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
spe_release_register(f, temp_reg);
}
@@ -1914,81 +1997,79 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
* Z and/or stencil. We'll also convert the incoming fragment Z
* value in fragZ_reg from a floating point value in [0.0..1.0] to
* an unsigned integer value with the appropriate resolution.
+ * Note that even if depth or stencil is *not* enabled, if it's
+ * present in the buffer, we pull it out and put it back later;
+ * otherwise, we can inadvertently destroy the contents of
+ * buffers we're not supposed to touch (e.g., if the user is
+ * clearing the depth buffer but not the stencil buffer, a
+ * quad of constant depth is drawn over the surface; the stencil
+ * buffer must be maintained).
*/
switch(zs_format) {
case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
case PIPE_FORMAT_X8Z24_UNORM:
- if (dsa->depth.enabled) {
- /* We need the Z part at least */
- setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
- /* four 24-bit Z values in the low-order bits */
- spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
-
- /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
- * to a 24-bit unsigned integer
- */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
- spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
- }
- if (dsa->stencil[0].enabled) {
- setup_optional_register(f, &fbS_reg_set, &fbS_reg);
- /* four 8-bit Z values in the high-order bits */
- spe_rotmi(f, fbS_reg, fbZS_reg, -24);
- }
- break;
+ /* Pull out both Z and stencil */
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+
+ /* four 24-bit Z values in the low-order bits */
+ spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 24-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+
+ /* four 8-bit stencil values in the high-order bits */
+ spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+ break;
case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
case PIPE_FORMAT_Z24X8_UNORM:
- if (dsa->depth.enabled) {
- setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
- /* shift by 8 to get the upper 24-bit values */
- spe_rotmi(f, fbS_reg, fbZS_reg, -8);
-
- /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
- * to a 24-bit unsigned integer
- */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
- spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
- }
- if (dsa->stencil[0].enabled) {
- setup_optional_register(f, &fbS_reg_set, &fbS_reg);
- /* 8-bit stencil in the low-order bits - mask them out */
- spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
- }
- break;
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+
+ /* shift by 8 to get the upper 24-bit values */
+ spe_rotmi(f, fbS_reg, fbZS_reg, -8);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 24-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+
+ /* 8-bit stencil in the low-order bits - mask them out */
+ spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+ break;
case PIPE_FORMAT_Z32_UNORM:
- if (dsa->depth.enabled) {
- setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
- /* Copy over 4 32-bit values */
- spe_move(f, fbZ_reg, fbZS_reg);
-
- /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
- * to a 32-bit unsigned integer
- */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
- }
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ /* Copy over 4 32-bit values */
+ spe_move(f, fbZ_reg, fbZS_reg);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 32-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
/* No stencil, so can't do anything there */
- break;
+ break;
case PIPE_FORMAT_Z16_UNORM:
- if (dsa->depth.enabled) {
- /* XXX Not sure this is correct, but it was here before, so we're
- * going with it for now
- */
- setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
- /* Copy over 4 32-bit values */
- spe_move(f, fbZ_reg, fbZS_reg);
-
- /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
- * to a 16-bit unsigned integer
- */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
- spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
- }
+ /* XXX Not sure this is correct, but it was here before, so we're
+ * going with it for now
+ */
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ /* Copy over 4 32-bit values */
+ spe_move(f, fbZ_reg, fbZS_reg);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 16-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
/* No stencil */
- break;
default:
ASSERT(0); /* invalid format */
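
For reference, a scalar sketch (not part of the patch) of the S8Z24/X8Z24
unpack and of the cfltu/rotmi conversion of the fragment Z from a float in
[0.0, 1.0] to a 24-bit unsigned integer:

static void
unpack_s8z24(unsigned zs, float fragZ,
             unsigned *fbZ, unsigned *fbS, unsigned *fragZ24)
{
   double scaled;

   *fbZ = zs & 0x00ffffff;   /* low 24 bits hold the depth value   */
   *fbS = zs >> 24;          /* high 8 bits hold the stencil value */

   /* cfltu with a scale of 32 multiplies by 2^32 and saturates to a
    * 32-bit unsigned value; rotmi by -8 then leaves a 24-bit depth.
    */
   scaled = (double) fragZ * 4294967296.0;
   if (scaled > 4294967295.0)
      scaled = 4294967295.0;
   *fragZ24 = ((unsigned) scaled) >> 8;
}
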
@@ -2035,39 +2116,19 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_comment(f, 0, "Store quad's depth/stencil values in tile");
if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- if (fbS_reg_set && fbZ_reg_set) {
- spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
- spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
- }
- else if (fbS_reg_set) {
- spe_shli(f, fbZS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
- }
- else {
- spe_move(f, fbZS_reg, fbZ_reg);
- }
+ spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+ spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
}
else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
zs_format == PIPE_FORMAT_Z24X8_UNORM) {
- if (fbS_reg_set && fbZ_reg_set) {
- spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
- spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
- }
- else if (fbS_reg_set) {
- spe_move(f, fbZS_reg, fbS_reg);
- }
- else {
- spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
- }
+ spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+ spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
}
else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
- if (fbZ_reg_set) {
- spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
- }
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
}
else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
- if (fbZ_reg_set) {
- spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
- }
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
}
else if (zs_format == PIPE_FORMAT_S8_UNORM) {
ASSERT(0); /* XXX to do */
@@ -2080,6 +2141,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
}
+ /* Don't need these any more */
release_optional_register(f, &fbZ_reg_set, fbZ_reg);
release_optional_register(f, &fbS_reg_set, fbS_reg);
}
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h
index b633880c25..c93958a9ed 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.h
+++ b/src/gallium/drivers/cell/ppu/cell_spu.h
@@ -30,7 +30,6 @@
#include <libspe2.h>
-#include <libmisc.h>
#include <pthread.h>
#include "cell/common.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 9ac2f3bbb9..ae88d06912 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -28,6 +28,7 @@
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
* Michel Dänzer <michel@tungstengraphics.com>
+ * Brian Paul
*/
#include "pipe/p_context.h"
@@ -42,10 +43,9 @@
#include "cell_texture.h"
-/* Simple, maximally packed layout.
- */
-static unsigned minify( unsigned d )
+static unsigned
+minify(unsigned d)
{
return MAX2(1, d>>1);
}
@@ -212,6 +212,89 @@ twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
/**
+ * For Cell. Basically, rearrange the pixels/quads from this layout:
+ * +--+--+--+--+
+ * |p0|p1|p2|p3|....
+ * +--+--+--+--+
+ *
+ * to this layout:
+ * +--+--+
+ * |p0|p1|....
+ * +--+--+
+ * |p2|p3|
+ * +--+--+
+ */
+static void
+twiddle_tile(const uint *tileIn, uint *tileOut)
+{
+ int y, x;
+
+ for (y = 0; y < TILE_SIZE; y+=2) {
+ for (x = 0; x < TILE_SIZE; x+=2) {
+ int k = 4 * (y/2 * TILE_SIZE/2 + x/2);
+ tileOut[y * TILE_SIZE + (x + 0)] = tileIn[k];
+ tileOut[y * TILE_SIZE + (x + 1)] = tileIn[k+1];
+ tileOut[(y + 1) * TILE_SIZE + (x + 0)] = tileIn[k+2];
+ tileOut[(y + 1) * TILE_SIZE + (x + 1)] = tileIn[k+3];
+ }
+ }
+}
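
A hypothetical self-check (not in the patch) that builds a quad-interleaved
tile with the same index math and verifies that twiddle_tile recovers the
row-major layout; it assumes TILE_SIZE is even and uses the ASSERT macro
already used elsewhere in this file:

static void
check_twiddle_tile(void)
{
   uint linear[TILE_SIZE * TILE_SIZE];
   uint tiled[TILE_SIZE * TILE_SIZE];
   uint out[TILE_SIZE * TILE_SIZE];
   int x, y;
   uint i;

   for (i = 0; i < TILE_SIZE * TILE_SIZE; i++)
      linear[i] = i;

   /* build the interleaved form: each 2x2 quad is stored as four
    * consecutive words, quads in row-major quad order
    */
   for (y = 0; y < TILE_SIZE; y += 2) {
      for (x = 0; x < TILE_SIZE; x += 2) {
         int k = 4 * (y/2 * TILE_SIZE/2 + x/2);
         tiled[k+0] = linear[(y+0) * TILE_SIZE + (x+0)];
         tiled[k+1] = linear[(y+0) * TILE_SIZE + (x+1)];
         tiled[k+2] = linear[(y+1) * TILE_SIZE + (x+0)];
         tiled[k+3] = linear[(y+1) * TILE_SIZE + (x+1)];
      }
   }

   twiddle_tile(tiled, out);

   for (i = 0; i < TILE_SIZE * TILE_SIZE; i++)
      ASSERT(out[i] == linear[i]);
}
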
+
+
+/**
+ * Convert image from tiled layout to linear layout. 4-byte pixels.
+ */
+static void
+untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
+ uint dst_stride, const uint *src)
+{
+ const uint tile_size2 = tile_size * tile_size;
+ const uint h_t = (h + tile_size - 1) / tile_size;
+ const uint w_t = (w + tile_size - 1) / tile_size;
+ uint *tile_buf;
+ uint it, jt; /* tile counters */
+ uint i, j; /* intra-tile counters */
+
+ dst_stride /= 4; /* convert from bytes to pixels */
+
+ tile_buf = align_malloc(tile_size * tile_size * 4, 16);
+
+ /* loop over src tiles */
+ for (it = 0; it < h_t; it++) {
+ for (jt = 0; jt < w_t; jt++) {
+ /* start of src tile: */
+ const uint *tsrc = src + (it * w_t + jt) * tile_size2;
+
+ twiddle_tile(tsrc, tile_buf);
+ tsrc = tile_buf;
+
+ /* compute size of this tile (may be smaller than tile_size) */
+ /* XXX note: a compiler bug was found here. That's why the code
+ * looks as it does.
+ */
+ uint tile_width = w - jt * tile_size;
+ tile_width = MIN2(tile_width, tile_size);
+ uint tile_height = h - it * tile_size;
+ tile_height = MIN2(tile_height, tile_size);
+
+ /* loop over texels in the tile */
+ for (i = 0; i < tile_height; i++) {
+ for (j = 0; j < tile_width; j++) {
+ uint dsti = it * tile_size + i;
+ uint dstj = jt * tile_size + j;
+ ASSERT(dsti < h);
+ ASSERT(dstj < w);
+ dst[dsti * dst_stride + dstj] = tsrc[i * tile_size + j];
+ }
+ }
+ }
+ }
+
+ align_free(tile_buf);
+}
+
+
+/**
* Convert linear texture image data to tiled format for SPU usage.
*/
static void
@@ -230,6 +313,7 @@ cell_twiddle_texture(struct pipe_screen *screen,
switch (ct->base.format) {
case PIPE_FORMAT_A8R8G8B8_UNORM:
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
{
int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
int offset = bufWidth * bufHeight * 4 * surface->face;
@@ -261,6 +345,51 @@ cell_twiddle_texture(struct pipe_screen *screen,
}
+/**
+ * Convert SPU tiled texture image data to linear format for app usage.
+ */
+static void
+cell_untwiddle_texture(struct pipe_screen *screen,
+ struct pipe_surface *surface)
+{
+ struct cell_texture *ct = cell_texture(surface->texture);
+ const uint level = surface->level;
+ const uint texWidth = ct->base.width[level];
+ const uint texHeight = ct->base.height[level];
+ const void *map = pipe_buffer_map(screen, surface->buffer,
+ PIPE_BUFFER_USAGE_CPU_READ);
+ const uint *src = (const uint *) ((const ubyte *) map + surface->offset);
+
+ switch (ct->base.format) {
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
+ {
+ int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
+ int offset = surface->stride * texHeight * 4 * surface->face;
+ uint *dst;
+
+ if (!ct->untiled_data[level]) {
+ ct->untiled_data[level] =
+ align_malloc(surface->stride * texHeight * 4 * numFaces, 16);
+ }
+
+ dst = (uint *) ((ubyte *) ct->untiled_data[level] + offset);
+
+ untwiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
+ surface->stride, src);
+ }
+ break;
+ default:
+ {
+ ct->untiled_data[level] = NULL;
+ printf("Cell: untwiddle unsupported texture format\n");
+ }
+ }
+
+ pipe_buffer_unmap(screen, surface->buffer);
+}
+
+
static struct pipe_surface *
cell_get_tex_surface(struct pipe_screen *screen,
struct pipe_texture *pt,
@@ -294,13 +423,18 @@ cell_get_tex_surface(struct pipe_screen *screen,
ps->zslice = zslice;
if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) {
- ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
- ps->nblocksy *
- ps->stride;
+ ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
+ ps->nblocksy *
+ ps->stride;
}
else {
- assert(face == 0);
- assert(zslice == 0);
+ assert(face == 0);
+ assert(zslice == 0);
+ }
+
+ if (ps->usage & PIPE_BUFFER_USAGE_CPU_READ) {
+ /* convert from tiled to linear layout */
+ cell_untwiddle_texture(screen, ps);
}
}
return ps;
@@ -311,6 +445,15 @@ static void
cell_tex_surface_release(struct pipe_screen *screen,
struct pipe_surface **s)
{
+ struct cell_texture *ct = cell_texture((*s)->texture);
+ const uint level = (*s)->level;
+
+ if (((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level]))
+ {
+ align_free(ct->untiled_data[level]);
+ ct->untiled_data[level] = NULL;
+ }
+
/* XXX if done rendering to teximage, re-tile */
pipe_texture_reference(&(*s)->texture, NULL);
@@ -325,6 +468,10 @@ cell_surface_map(struct pipe_screen *screen,
unsigned flags)
{
ubyte *map;
+ struct cell_texture *ct = cell_texture(surface->texture);
+ const uint level = surface->level;
+
+ assert(ct);
if (flags & ~surface->usage) {
assert(0);
@@ -335,7 +482,14 @@ cell_surface_map(struct pipe_screen *screen,
if (map == NULL)
return NULL;
else
- return (void *) (map + surface->offset);
+ {
+ if ((surface->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) {
+ return (void *) ((ubyte *) ct->untiled_data[level] + surface->offset);
+ }
+ else {
+ return (void *) (map + surface->offset);
+ }
+ }
}
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index 2f5fe0dd1b..7018b0c9bf 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -52,6 +52,7 @@ struct cell_texture
struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS];
/** Mapped, tiled texture data */
void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS];
+ void *untiled_data[CELL_MAX_TEXTURE_LEVELS];
};
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index a6ed29ea63..d726622d94 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -107,7 +107,7 @@ cmd_fence(struct cell_command_fence *fence_cmd)
CELL_FENCE_SIGNALLED};
uint *dst = (uint *) fence_cmd->fence;
dst += 4 * spu.init.id; /* main store/memory address, not local store */
-
+ ASSERT_ALIGN16(dst);
mfc_put((void *) &status, /* src in local memory */
(unsigned int) dst, /* dst in main memory */
sizeof(status), /* size */
@@ -244,8 +244,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
}
}
- spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
- spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
+ spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
}
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 3534b35000..ff3d609d25 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -38,7 +38,9 @@
#include <math.h>
#include <cos14_v.h>
#include <sin14_v.h>
-#include <transpose_matrix4x4.h>
+#include <simdmath/exp2f4.h>
+#include <simdmath/log2f4.h>
+#include <simdmath/powf4.h>
#include "cell/common.h"
#include "spu_main.h"
@@ -68,37 +70,19 @@ spu_sin(vector float x)
static vector float
spu_pow(vector float x, vector float y)
{
- float z0 = powf(spu_extract(x,0), spu_extract(y,0));
- float z1 = powf(spu_extract(x,1), spu_extract(y,1));
- float z2 = powf(spu_extract(x,2), spu_extract(y,2));
- float z3 = powf(spu_extract(x,3), spu_extract(y,3));
- return (vector float) {z0, z1, z2, z3};
+ return _powf4(x, y);
}
static vector float
spu_exp2(vector float x)
{
- float z0 = powf(2.0f, spu_extract(x,0));
- float z1 = powf(2.0f, spu_extract(x,1));
- float z2 = powf(2.0f, spu_extract(x,2));
- float z3 = powf(2.0f, spu_extract(x,3));
- return (vector float) {z0, z1, z2, z3};
+ return _exp2f4(x);
}
static vector float
spu_log2(vector float x)
{
- /*
- * log_base_2(x) = log(x) / log(2)
- * 1.442695 = 1/log(2).
- */
- static const vector float k = {1.442695F, 1.442695F, 1.442695F, 1.442695F};
- float z0 = logf(spu_extract(x,0));
- float z1 = logf(spu_extract(x,1));
- float z2 = logf(spu_extract(x,2));
- float z3 = logf(spu_extract(x,3));
- vector float v = (vector float) {z0, z1, z2, z3};
- return spu_mul(v, k);
+ return _log2f4(x);
}
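
The SIMD calls above replace per-component scalar math; for reference, the
scalar identities they implement (illustration only, using only standard
libm, not code from this patch):

static float scalar_exp2(float x) { return powf(2.0f, x); }
static float scalar_log2(float x) { return logf(x) * 1.442695f; /* 1/ln(2) */ }
static float scalar_pow(float x, float y)  { return powf(x, y); }
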
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 668af10be2..692790c9f3 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -160,8 +160,7 @@ struct spu_global
tile_t ztile ALIGN16_ATTRIB;
/** Read depth/stencil tiles? */
- boolean read_depth;
- boolean read_stencil;
+ boolean read_depth_stencil;
/** Current tiles' status */
ubyte cur_ctile_status, cur_ztile_status;
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 5515bb55c9..7c225e2f27 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -98,7 +98,7 @@ my_tile(uint tx, uint ty)
static INLINE void
get_cz_tiles(uint tx, uint ty)
{
- if (spu.read_depth) {
+ if (spu.read_depth_stencil) {
if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
//printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
@@ -153,7 +153,7 @@ static INLINE void
wait_put_cz_tiles(void)
{
wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
- if (spu.read_depth) {
+ if (spu.read_depth_stencil) {
wait_on_mask(1 << TAG_WRITE_TILE_Z);
}
}
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 4caf7d6b61..5f908159bb 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -369,7 +369,7 @@ flush_spans(void)
}
ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
- if (spu.read_depth) {
+ if (spu.read_depth_stencil) {
if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
/* wait for mfc_get() to complete */
//printf("SPU: %u: waiting for ztile\n", spu.init.id);
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 0111469405..31908a517b 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -40,7 +40,7 @@
#include "tgsi/tgsi_sse2.h"
-#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_X86)
#include "rtasm/rtasm_x86sse.h"
@@ -92,7 +92,8 @@ fs_sse_run( const struct sp_fragment_shader *base,
machine->Temps);
/* init kill mask */
- machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] = 0x0;
+ tgsi_set_kill_mask(machine, 0x0);
+ tgsi_set_exec_mask(machine, 1, 1, 1, 1);
shader->func( machine->Inputs,
machine->Outputs,
diff --git a/src/gallium/drivers/softpipe/sp_quad_output.c b/src/gallium/drivers/softpipe/sp_quad_output.c
index d05e12d1d9..b7aac7f84a 100644
--- a/src/gallium/drivers/softpipe/sp_quad_output.c
+++ b/src/gallium/drivers/softpipe/sp_quad_output.c
@@ -64,6 +64,14 @@ output_quad(struct quad_stage *qs, struct quad_header *quad)
for (i = 0; i < 4; i++) { /* loop over color chans */
tile->data.color[y][x][i] = quadColor[i][j];
}
+ if (0) {
+ debug_printf("sp write pixel %d,%d: %g, %g, %g\n",
+ quad->input.x0 + x,
+ quad->input.y0 + y,
+ quadColor[0][j],
+ quadColor[1][j],
+ quadColor[2][j]);
+ }
}
}
}
diff --git a/src/gallium/include/pipe/p_debug.h b/src/gallium/include/pipe/p_debug.h
index cb6196aa9f..3b00fb9aa8 100644
--- a/src/gallium/include/pipe/p_debug.h
+++ b/src/gallium/include/pipe/p_debug.h
@@ -49,7 +49,7 @@ extern "C" {
#endif
-#ifdef DBG
+#if defined(DBG) || defined(DEBUG)
#ifndef DEBUG
#define DEBUG 1
#endif
diff --git a/src/gallium/include/pipe/p_inlines.h b/src/gallium/include/pipe/p_inlines.h
index d70de8e301..5e79b7f485 100644
--- a/src/gallium/include/pipe/p_inlines.h
+++ b/src/gallium/include/pipe/p_inlines.h
@@ -82,11 +82,14 @@ static INLINE void
pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf)
{
/* bump the refcount first */
- if (surf)
+ if (surf) {
+ assert(surf->refcount);
surf->refcount++;
+ }
if (*ptr) {
-
+ assert((*ptr)->refcount);
+
/* There are currently two sorts of surfaces... This needs to be
* fixed so that all surfaces are views into a texture.
*/
@@ -113,11 +116,16 @@ winsys_buffer_reference(struct pipe_winsys *winsys,
struct pipe_buffer **ptr,
struct pipe_buffer *buf)
{
- if (buf)
+ if (buf) {
+ assert(buf->refcount);
buf->refcount++;
+ }
- if (*ptr && --(*ptr)->refcount == 0)
- winsys->buffer_destroy( winsys, *ptr );
+ if (*ptr) {
+ assert((*ptr)->refcount);
+ if(--(*ptr)->refcount == 0)
+ winsys->buffer_destroy( winsys, *ptr );
+ }
*ptr = buf;
}
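
The bump-before-release ordering in these helpers is what makes
self-assignment safe: winsys_buffer_reference(ws, &p, p) can never free the
buffer it is about to keep. A minimal sketch of the same pattern with a
hypothetical object type (not the Gallium API):

#include <assert.h>
#include <stdlib.h>

struct obj { int refcount; };

static void
obj_reference(struct obj **ptr, struct obj *o)
{
   if (o) {
      assert(o->refcount);
      o->refcount++;                   /* take the new reference first...   */
   }
   if (*ptr) {
      assert((*ptr)->refcount);
      if (--(*ptr)->refcount == 0)     /* ...so releasing the old one can   */
         free(*ptr);                   /* only destroy a truly dead object  */
   }
   *ptr = o;
}
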
@@ -133,12 +141,15 @@ pipe_texture_reference(struct pipe_texture **ptr,
{
assert(ptr);
- if (pt)
+ if (pt) {
+ assert(pt->refcount);
pt->refcount++;
+ }
if (*ptr) {
struct pipe_screen *screen = (*ptr)->screen;
assert(screen);
+ assert((*ptr)->refcount);
screen->texture_release(screen, ptr);
assert(!*ptr);
@@ -154,6 +165,7 @@ pipe_texture_release(struct pipe_texture **ptr)
struct pipe_screen *screen;
assert(ptr);
screen = (*ptr)->screen;
+ assert((*ptr)->refcount);
screen->texture_release(screen, ptr);
*ptr = NULL;
}
@@ -176,12 +188,6 @@ pipe_user_buffer_create( struct pipe_screen *screen, void *ptr, unsigned size )
return screen->winsys->user_buffer_create(screen->winsys, ptr, size);
}
-static INLINE void
-pipe_buffer_destroy( struct pipe_screen *screen, struct pipe_buffer *buf )
-{
- screen->winsys->buffer_destroy(screen->winsys, buf);
-}
-
static INLINE void *
pipe_buffer_map(struct pipe_screen *screen,
struct pipe_buffer *buf,